removed BEGIN_OPENCV_DEVICE_NAMESPACE macros
This commit is contained in:
parent
d926541311
commit
0f53f2993e
@ -425,21 +425,20 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Polar <-> Cart
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace mathfunc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
|
||||
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace mathfunc
|
||||
{
|
||||
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
|
||||
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
|
||||
using namespace ::cv::gpu::device::mathfunc;
|
||||
|
||||
CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
|
||||
CV_Assert(x.depth() == CV_32F);
|
||||
@ -459,7 +458,7 @@ namespace
|
||||
|
||||
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
|
||||
using namespace ::cv::gpu::device::mathfunc;
|
||||
|
||||
CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
|
||||
CV_Assert(mag.depth() == CV_32F);
|
||||
|
@ -55,19 +55,18 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&,
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace bilateral_filter
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
|
||||
namespace bilateral_filter
|
||||
{
|
||||
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
|
||||
|
||||
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
|
||||
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
|
||||
}
|
||||
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
|
||||
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;
|
||||
using namespace ::cv::gpu::device::bilateral_filter;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
@ -52,19 +52,18 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu
|
||||
|
||||
#else
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace blend
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
|
||||
namespace blend
|
||||
{
|
||||
template <typename T>
|
||||
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
|
||||
|
||||
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
|
||||
}
|
||||
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ blend;
|
||||
using namespace ::cv::gpu::device::blend;
|
||||
|
||||
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
|
||||
GpuMat& result, Stream& stream)
|
||||
|
@ -82,80 +82,79 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace bf_match
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
namespace bf_match
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
}
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_knnmatch
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
namespace bf_knnmatch
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
}
|
||||
template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace bf_radius_match
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
namespace bf_radius_match
|
||||
{
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Train collection
|
||||
@ -199,7 +198,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance,
|
||||
@ -341,7 +340,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
|
||||
using namespace ::cv::gpu::device::bf_match;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
@ -452,7 +451,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
|
||||
using namespace ::cv::gpu::device::bf_knnmatch;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
|
||||
@ -581,7 +580,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
|
||||
using namespace ::cv::gpu::device::bf_knnmatch;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
|
||||
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
|
||||
@ -762,7 +761,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
|
||||
using namespace ::cv::gpu::device::bf_radius_match;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
@ -893,7 +892,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
|
||||
if (query.empty() || empty())
|
||||
return;
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
|
||||
using namespace ::cv::gpu::device::bf_radius_match;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
|
@ -56,31 +56,30 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat
|
||||
|
||||
#else
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace transform_points
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
|
||||
}
|
||||
namespace transform_points
|
||||
{
|
||||
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace project_points
|
||||
{
|
||||
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
|
||||
}
|
||||
namespace project_points
|
||||
{
|
||||
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace solve_pnp_ransac
|
||||
{
|
||||
int maxNumIters();
|
||||
namespace solve_pnp_ransac
|
||||
{
|
||||
int maxNumIters();
|
||||
|
||||
void computeHypothesisScores(
|
||||
const int num_hypotheses, const int num_points, const float* rot_matrices,
|
||||
const float3* transl_vectors, const float3* object, const float2* image,
|
||||
const float dist_threshold, int* hypothesis_scores);
|
||||
}
|
||||
void computeHypothesisScores(
|
||||
const int num_hypotheses, const int num_points, const float* rot_matrices,
|
||||
const float3* transl_vectors, const float3* object, const float2* image,
|
||||
const float dist_threshold, int* hypothesis_scores);
|
||||
}
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
@ -51,8 +51,8 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
|
||||
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
@ -67,142 +67,141 @@ BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
|
||||
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
|
||||
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
|
||||
|
||||
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
|
||||
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
|
||||
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
|
||||
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
|
||||
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
|
||||
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -45,423 +45,421 @@
|
||||
#include "opencv2/gpu/device/vec_distance.hpp"
|
||||
#include "opencv2/gpu/device/datamov_utils.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace bf_radius_match {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match Unrolled
|
||||
|
||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
|
||||
__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
|
||||
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 110
|
||||
|
||||
extern __shared__ int smem[];
|
||||
|
||||
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
|
||||
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
|
||||
|
||||
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
|
||||
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
|
||||
|
||||
Dist dist;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
|
||||
namespace bf_radius_match
|
||||
{
|
||||
const int loadX = threadIdx.x + i * BLOCK_SIZE;
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match Unrolled
|
||||
|
||||
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
|
||||
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
|
||||
|
||||
if (loadX < query.cols)
|
||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
|
||||
__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
|
||||
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
|
||||
{
|
||||
T val;
|
||||
#if __CUDA_ARCH__ >= 110
|
||||
|
||||
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
|
||||
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
|
||||
extern __shared__ int smem[];
|
||||
|
||||
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
|
||||
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
|
||||
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
|
||||
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
|
||||
|
||||
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
|
||||
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
|
||||
|
||||
Dist dist;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
|
||||
{
|
||||
const int loadX = threadIdx.x + i * BLOCK_SIZE;
|
||||
|
||||
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
|
||||
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
|
||||
|
||||
if (loadX < query.cols)
|
||||
{
|
||||
T val;
|
||||
|
||||
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
|
||||
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
|
||||
|
||||
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
|
||||
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < BLOCK_SIZE; ++j)
|
||||
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
float distVal = (typename Dist::result_type)dist;
|
||||
|
||||
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
|
||||
{
|
||||
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
|
||||
if (ind < maxCount)
|
||||
{
|
||||
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
|
||||
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
|
||||
bestDistance.ptr(queryIdx)[ind] = distVal;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < BLOCK_SIZE; ++j)
|
||||
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
float distVal = (typename Dist::result_type)dist;
|
||||
|
||||
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
|
||||
{
|
||||
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
|
||||
if (ind < maxCount)
|
||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
|
||||
{
|
||||
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
|
||||
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
|
||||
bestDistance.ptr(queryIdx)[ind] = distVal;
|
||||
}
|
||||
}
|
||||
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
|
||||
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
|
||||
|
||||
#endif
|
||||
}
|
||||
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
|
||||
|
||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
|
||||
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
|
||||
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
|
||||
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
|
||||
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
|
||||
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
|
||||
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
|
||||
|
||||
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
|
||||
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
const DevMem2D_<T> train = trains[i];
|
||||
|
||||
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
|
||||
|
||||
if (masks != 0 && masks[i].data)
|
||||
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
|
||||
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
|
||||
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
|
||||
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
|
||||
}
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
const DevMem2D_<T> train = trains[i];
|
||||
|
||||
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
|
||||
__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
|
||||
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 110
|
||||
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
|
||||
|
||||
extern __shared__ int smem[];
|
||||
if (masks != 0 && masks[i].data)
|
||||
{
|
||||
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
|
||||
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
|
||||
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
|
||||
}
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
|
||||
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
|
||||
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
|
||||
|
||||
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
|
||||
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
|
||||
|
||||
Dist dist;
|
||||
|
||||
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
|
||||
{
|
||||
const int loadX = threadIdx.x + i * BLOCK_SIZE;
|
||||
|
||||
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
|
||||
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
|
||||
|
||||
if (loadX < query.cols)
|
||||
{
|
||||
T val;
|
||||
|
||||
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
|
||||
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
|
||||
|
||||
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
|
||||
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < BLOCK_SIZE; ++j)
|
||||
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
float distVal = (typename Dist::result_type)dist;
|
||||
|
||||
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
|
||||
{
|
||||
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
|
||||
if (ind < maxCount)
|
||||
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
|
||||
__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
|
||||
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
|
||||
{
|
||||
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
|
||||
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
|
||||
bestDistance.ptr(queryIdx)[ind] = distVal;
|
||||
#if __CUDA_ARCH__ >= 110
|
||||
|
||||
extern __shared__ int smem[];
|
||||
|
||||
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
|
||||
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
|
||||
|
||||
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
|
||||
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
|
||||
|
||||
Dist dist;
|
||||
|
||||
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
|
||||
{
|
||||
const int loadX = threadIdx.x + i * BLOCK_SIZE;
|
||||
|
||||
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
|
||||
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
|
||||
|
||||
if (loadX < query.cols)
|
||||
{
|
||||
T val;
|
||||
|
||||
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
|
||||
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
|
||||
|
||||
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
|
||||
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < BLOCK_SIZE; ++j)
|
||||
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
float distVal = (typename Dist::result_type)dist;
|
||||
|
||||
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
|
||||
{
|
||||
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
|
||||
if (ind < maxCount)
|
||||
{
|
||||
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
|
||||
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
|
||||
bestDistance.ptr(queryIdx)[ind] = distVal;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
||||
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
|
||||
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
|
||||
|
||||
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
|
||||
|
||||
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
|
||||
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <int BLOCK_SIZE, typename Dist, typename T>
|
||||
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
|
||||
|
||||
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
|
||||
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
const DevMem2D_<T> train = trains[i];
|
||||
|
||||
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
|
||||
|
||||
if (masks != 0 && masks[i].data)
|
||||
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
|
||||
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
|
||||
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
|
||||
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
|
||||
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
|
||||
|
||||
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
|
||||
|
||||
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
|
||||
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
else
|
||||
|
||||
template <int BLOCK_SIZE, typename Dist, typename T>
|
||||
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
|
||||
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
|
||||
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
|
||||
|
||||
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
|
||||
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
const DevMem2D_<T> train = trains[i];
|
||||
|
||||
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
|
||||
|
||||
if (masks != 0 && masks[i].data)
|
||||
{
|
||||
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
|
||||
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
|
||||
}
|
||||
else
|
||||
{
|
||||
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
|
||||
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
|
||||
}
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match dispatcher
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match dispatcher
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 128)
|
||||
{
|
||||
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
/*else if (query.cols <= 256)
|
||||
{
|
||||
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 512)
|
||||
{
|
||||
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 1024)
|
||||
{
|
||||
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}*/
|
||||
else
|
||||
{
|
||||
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Dist, typename T, typename Mask>
|
||||
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 128)
|
||||
{
|
||||
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
/*else if (query.cols <= 256)
|
||||
{
|
||||
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 512)
|
||||
{
|
||||
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 1024)
|
||||
{
|
||||
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}*/
|
||||
else
|
||||
{
|
||||
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
|
||||
}
|
||||
}
|
||||
template <typename Dist, typename T>
|
||||
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 128)
|
||||
{
|
||||
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
/*else if (query.cols <= 256)
|
||||
{
|
||||
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 512)
|
||||
{
|
||||
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 1024)
|
||||
{
|
||||
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}*/
|
||||
else
|
||||
{
|
||||
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Dist, typename T>
|
||||
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (query.cols <= 64)
|
||||
{
|
||||
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 128)
|
||||
{
|
||||
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
/*else if (query.cols <= 256)
|
||||
{
|
||||
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 512)
|
||||
{
|
||||
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
else if (query.cols <= 1024)
|
||||
{
|
||||
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}*/
|
||||
else
|
||||
{
|
||||
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
|
||||
}
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Radius Match caller
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Radius Match caller
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
}
|
||||
template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
|
||||
template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
}
|
||||
//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
|
||||
//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
|
||||
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
if (mask.data)
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
|
||||
trainIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
}
|
||||
template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
|
||||
template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
|
||||
trainIdx, imgIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
|
||||
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
|
||||
trainIdx, imgIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
|
||||
template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
|
||||
trainIdx, imgIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
|
||||
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
|
||||
trainIdx, imgIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
|
||||
//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
|
||||
trainIdx, imgIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
|
||||
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
|
||||
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
|
||||
int cc, cudaStream_t stream)
|
||||
{
|
||||
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
|
||||
trainIdx, imgIdx, distance, nMatches,
|
||||
cc, stream);
|
||||
}
|
||||
|
||||
template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
|
||||
} // namespace bf_radius_match
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
|
||||
} // namespace bf_radius_match
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -43,186 +43,184 @@
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace bilateral_filter {
|
||||
|
||||
__constant__ float* ctable_color;
|
||||
__constant__ float* ctable_space;
|
||||
__constant__ size_t ctable_space_step;
|
||||
|
||||
__constant__ int cndisp;
|
||||
__constant__ int cradius;
|
||||
|
||||
__constant__ short cedge_disc;
|
||||
__constant__ short cmax_disc;
|
||||
|
||||
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
|
||||
size_t table_space_step = table_space.step / sizeof(float);
|
||||
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
|
||||
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
|
||||
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
|
||||
}
|
||||
|
||||
template <int channels>
|
||||
struct DistRgbMax
|
||||
{
|
||||
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
|
||||
namespace bilateral_filter
|
||||
{
|
||||
uchar x = ::abs(a[0] - b[0]);
|
||||
uchar y = ::abs(a[1] - b[1]);
|
||||
uchar z = ::abs(a[2] - b[2]);
|
||||
return (::max(::max(x, y), z));
|
||||
}
|
||||
};
|
||||
__constant__ float* ctable_color;
|
||||
__constant__ float* ctable_space;
|
||||
__constant__ size_t ctable_space_step;
|
||||
|
||||
template <>
|
||||
struct DistRgbMax<1>
|
||||
{
|
||||
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
|
||||
{
|
||||
return ::abs(a[0] - b[0]);
|
||||
}
|
||||
};
|
||||
__constant__ int cndisp;
|
||||
__constant__ int cradius;
|
||||
|
||||
template <int channels, typename T>
|
||||
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
|
||||
{
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
|
||||
__constant__ short cedge_disc;
|
||||
__constant__ short cmax_disc;
|
||||
|
||||
T dp[5];
|
||||
|
||||
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
|
||||
{
|
||||
dp[0] = *(disp + (y ) * disp_step + x + 0);
|
||||
dp[1] = *(disp + (y-1) * disp_step + x + 0);
|
||||
dp[2] = *(disp + (y ) * disp_step + x - 1);
|
||||
dp[3] = *(disp + (y+1) * disp_step + x + 0);
|
||||
dp[4] = *(disp + (y ) * disp_step + x + 1);
|
||||
|
||||
if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
|
||||
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
|
||||
{
|
||||
const int ymin = ::max(0, y - cradius);
|
||||
const int xmin = ::max(0, x - cradius);
|
||||
const int ymax = ::min(h - 1, y + cradius);
|
||||
const int xmax = ::min(w - 1, x + cradius);
|
||||
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
|
||||
size_t table_space_step = table_space.step / sizeof(float);
|
||||
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
|
||||
|
||||
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
|
||||
|
||||
const uchar* ic = img + y * img_step + channels * x;
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
|
||||
}
|
||||
|
||||
for(int yi = ymin; yi <= ymax; yi++)
|
||||
template <int channels>
|
||||
struct DistRgbMax
|
||||
{
|
||||
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
|
||||
{
|
||||
const T* disp_y = disp + yi * disp_step;
|
||||
uchar x = ::abs(a[0] - b[0]);
|
||||
uchar y = ::abs(a[1] - b[1]);
|
||||
uchar z = ::abs(a[2] - b[2]);
|
||||
return (::max(::max(x, y), z));
|
||||
}
|
||||
};
|
||||
|
||||
for(int xi = xmin; xi <= xmax; xi++)
|
||||
template <>
|
||||
struct DistRgbMax<1>
|
||||
{
|
||||
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
|
||||
{
|
||||
return ::abs(a[0] - b[0]);
|
||||
}
|
||||
};
|
||||
|
||||
template <int channels, typename T>
|
||||
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
|
||||
{
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
|
||||
|
||||
T dp[5];
|
||||
|
||||
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
|
||||
{
|
||||
dp[0] = *(disp + (y ) * disp_step + x + 0);
|
||||
dp[1] = *(disp + (y-1) * disp_step + x + 0);
|
||||
dp[2] = *(disp + (y ) * disp_step + x - 1);
|
||||
dp[3] = *(disp + (y+1) * disp_step + x + 0);
|
||||
dp[4] = *(disp + (y ) * disp_step + x + 1);
|
||||
|
||||
if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
|
||||
{
|
||||
const uchar* in = img + yi * img_step + channels * xi;
|
||||
const int ymin = ::max(0, y - cradius);
|
||||
const int xmin = ::max(0, x - cradius);
|
||||
const int ymax = ::min(h - 1, y + cradius);
|
||||
const int xmax = ::min(w - 1, x + cradius);
|
||||
|
||||
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
|
||||
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
|
||||
|
||||
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
|
||||
const uchar* ic = img + y * img_step + channels * x;
|
||||
|
||||
const T disp_reg = disp_y[xi];
|
||||
for(int yi = ymin; yi <= ymax; yi++)
|
||||
{
|
||||
const T* disp_y = disp + yi * disp_step;
|
||||
|
||||
cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
|
||||
cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
|
||||
cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
|
||||
cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
|
||||
cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
|
||||
for(int xi = xmin; xi <= xmax; xi++)
|
||||
{
|
||||
const uchar* in = img + yi * img_step + channels * xi;
|
||||
|
||||
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
|
||||
|
||||
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
|
||||
|
||||
const T disp_reg = disp_y[xi];
|
||||
|
||||
cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
|
||||
cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
|
||||
cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
|
||||
cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
|
||||
cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
|
||||
}
|
||||
}
|
||||
|
||||
float minimum = numeric_limits<float>::max();
|
||||
int id = 0;
|
||||
|
||||
if (cost[0] < minimum)
|
||||
{
|
||||
minimum = cost[0];
|
||||
id = 0;
|
||||
}
|
||||
if (cost[1] < minimum)
|
||||
{
|
||||
minimum = cost[1];
|
||||
id = 1;
|
||||
}
|
||||
if (cost[2] < minimum)
|
||||
{
|
||||
minimum = cost[2];
|
||||
id = 2;
|
||||
}
|
||||
if (cost[3] < minimum)
|
||||
{
|
||||
minimum = cost[3];
|
||||
id = 3;
|
||||
}
|
||||
if (cost[4] < minimum)
|
||||
{
|
||||
minimum = cost[4];
|
||||
id = 4;
|
||||
}
|
||||
|
||||
*(disp + y * disp_step + x) = dp[id];
|
||||
}
|
||||
}
|
||||
|
||||
float minimum = numeric_limits<float>::max();
|
||||
int id = 0;
|
||||
|
||||
if (cost[0] < minimum)
|
||||
{
|
||||
minimum = cost[0];
|
||||
id = 0;
|
||||
}
|
||||
if (cost[1] < minimum)
|
||||
{
|
||||
minimum = cost[1];
|
||||
id = 1;
|
||||
}
|
||||
if (cost[2] < minimum)
|
||||
{
|
||||
minimum = cost[2];
|
||||
id = 2;
|
||||
}
|
||||
if (cost[3] < minimum)
|
||||
{
|
||||
minimum = cost[3];
|
||||
id = 3;
|
||||
}
|
||||
if (cost[4] < minimum)
|
||||
{
|
||||
minimum = cost[4];
|
||||
id = 4;
|
||||
}
|
||||
|
||||
*(disp + y * disp_step + x) = dp[id];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
grid.x = divUp(disp.cols, threads.x << 1);
|
||||
grid.y = divUp(disp.rows, threads.y);
|
||||
|
||||
switch (channels)
|
||||
{
|
||||
case 1:
|
||||
for (int i = 0; i < iters; ++i)
|
||||
template <typename T>
|
||||
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
grid.x = divUp(disp.cols, threads.x << 1);
|
||||
grid.y = divUp(disp.rows, threads.y);
|
||||
|
||||
bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
switch (channels)
|
||||
{
|
||||
case 1:
|
||||
for (int i = 0; i < iters; ++i)
|
||||
{
|
||||
bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (int i = 0; i < iters; ++i)
|
||||
{
|
||||
bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
if (stream != 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
for (int i = 0; i < iters; ++i)
|
||||
|
||||
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
if (stream != 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
|
||||
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
|
||||
} // namespace bilateral_filter
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
} // namespace bilateral_filter
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -42,77 +42,75 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace blend {
|
||||
|
||||
template <typename T>
|
||||
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
|
||||
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (y < rows && x < cols)
|
||||
namespace blend
|
||||
{
|
||||
int x_ = x / cn;
|
||||
float w1 = weights1.ptr(y)[x_];
|
||||
float w2 = weights2.ptr(y)[x_];
|
||||
T p1 = img1.ptr(y)[x];
|
||||
T p2 = img2.ptr(y)[x];
|
||||
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
|
||||
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
template <typename T>
|
||||
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(16, 16);
|
||||
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
|
||||
|
||||
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
if (y < rows && x < cols)
|
||||
{
|
||||
int x_ = x / cn;
|
||||
float w1 = weights1.ptr(y)[x_];
|
||||
float w2 = weights2.ptr(y)[x_];
|
||||
T p1 = img1.ptr(y)[x];
|
||||
T p2 = img2.ptr(y)[x];
|
||||
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
|
||||
}
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
template <typename T>
|
||||
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(16, 16);
|
||||
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
|
||||
|
||||
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
|
||||
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
|
||||
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
|
||||
|
||||
|
||||
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
|
||||
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
|
||||
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (y < rows && x < cols)
|
||||
{
|
||||
float w1 = weights1.ptr(y)[x];
|
||||
float w2 = weights2.ptr(y)[x];
|
||||
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
|
||||
w1 *= sum_inv;
|
||||
w2 *= sum_inv;
|
||||
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
|
||||
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
|
||||
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
|
||||
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
|
||||
}
|
||||
}
|
||||
if (y < rows && x < cols)
|
||||
{
|
||||
float w1 = weights1.ptr(y)[x];
|
||||
float w2 = weights2.ptr(y)[x];
|
||||
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
|
||||
w1 *= sum_inv;
|
||||
w2 *= sum_inv;
|
||||
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
|
||||
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
|
||||
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
|
||||
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
|
||||
}
|
||||
}
|
||||
|
||||
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(16, 16);
|
||||
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
|
||||
|
||||
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(16, 16);
|
||||
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
|
||||
|
||||
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
} // namespace blend
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
} // namespace blend
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -44,149 +44,148 @@
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
#include "opencv2/gpu/device/functional.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
|
||||
|
||||
namespace transform_points
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
__constant__ float3 crot0;
|
||||
__constant__ float3 crot1;
|
||||
__constant__ float3 crot2;
|
||||
__constant__ float3 ctransl;
|
||||
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
|
||||
|
||||
struct TransformOp : unary_function<float3, float3>
|
||||
namespace transform_points
|
||||
{
|
||||
__device__ __forceinline__ float3 operator()(const float3& p) const
|
||||
__constant__ float3 crot0;
|
||||
__constant__ float3 crot1;
|
||||
__constant__ float3 crot2;
|
||||
__constant__ float3 ctransl;
|
||||
|
||||
struct TransformOp : unary_function<float3, float3>
|
||||
{
|
||||
return make_float3(
|
||||
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
|
||||
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
|
||||
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
|
||||
__device__ __forceinline__ float3 operator()(const float3& p) const
|
||||
{
|
||||
return make_float3(
|
||||
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
|
||||
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
|
||||
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
|
||||
}
|
||||
};
|
||||
|
||||
void call(const DevMem2D_<float3> src, const float* rot,
|
||||
const float* transl, DevMem2D_<float3> dst,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
|
||||
::cv::gpu::device::transform(src, dst, TransformOp(), stream);
|
||||
}
|
||||
};
|
||||
} // namespace transform_points
|
||||
|
||||
void call(const DevMem2D_<float3> src, const float* rot,
|
||||
const float* transl, DevMem2D_<float3> dst,
|
||||
cudaStream_t stream)
|
||||
namespace project_points
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
|
||||
OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream);
|
||||
}
|
||||
} // namespace transform_points
|
||||
__constant__ float3 crot0;
|
||||
__constant__ float3 crot1;
|
||||
__constant__ float3 crot2;
|
||||
__constant__ float3 ctransl;
|
||||
__constant__ float3 cproj0;
|
||||
__constant__ float3 cproj1;
|
||||
|
||||
namespace project_points
|
||||
{
|
||||
__constant__ float3 crot0;
|
||||
__constant__ float3 crot1;
|
||||
__constant__ float3 crot2;
|
||||
__constant__ float3 ctransl;
|
||||
__constant__ float3 cproj0;
|
||||
__constant__ float3 cproj1;
|
||||
|
||||
struct ProjectOp : unary_function<float3, float3>
|
||||
{
|
||||
__device__ __forceinline__ float2 operator()(const float3& p) const
|
||||
struct ProjectOp : unary_function<float3, float3>
|
||||
{
|
||||
// Rotate and translate in 3D
|
||||
float3 t = make_float3(
|
||||
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
|
||||
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
|
||||
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
|
||||
// Project on 2D plane
|
||||
return make_float2(
|
||||
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
|
||||
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
|
||||
__device__ __forceinline__ float2 operator()(const float3& p) const
|
||||
{
|
||||
// Rotate and translate in 3D
|
||||
float3 t = make_float3(
|
||||
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
|
||||
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
|
||||
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
|
||||
// Project on 2D plane
|
||||
return make_float2(
|
||||
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
|
||||
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
|
||||
}
|
||||
};
|
||||
|
||||
void call(const DevMem2D_<float3> src, const float* rot,
|
||||
const float* transl, const float* proj, DevMem2D_<float2> dst,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
|
||||
::cv::gpu::device::transform(src, dst, ProjectOp(), stream);
|
||||
}
|
||||
};
|
||||
} // namespace project_points
|
||||
|
||||
void call(const DevMem2D_<float3> src, const float* rot,
|
||||
const float* transl, const float* proj, DevMem2D_<float2> dst,
|
||||
cudaStream_t stream)
|
||||
namespace solve_pnp_ransac
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
|
||||
OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
|
||||
}
|
||||
} // namespace project_points
|
||||
__constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
|
||||
__constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
|
||||
|
||||
namespace solve_pnp_ransac
|
||||
{
|
||||
__constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
|
||||
__constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
|
||||
|
||||
int maxNumIters()
|
||||
{
|
||||
return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ float sqr(float x)
|
||||
{
|
||||
return x * x;
|
||||
}
|
||||
|
||||
__global__ void computeHypothesisScoresKernel(
|
||||
const int num_points, const float3* object, const float2* image,
|
||||
const float dist_threshold, int* g_num_inliers)
|
||||
{
|
||||
const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
|
||||
const float3 &transl_vec = ctransl_vectors[blockIdx.x];
|
||||
int num_inliers = 0;
|
||||
|
||||
for (int i = threadIdx.x; i < num_points; i += blockDim.x)
|
||||
int maxNumIters()
|
||||
{
|
||||
float3 p = object[i];
|
||||
p = make_float3(
|
||||
rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
|
||||
rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
|
||||
rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
|
||||
p.x /= p.z;
|
||||
p.y /= p.z;
|
||||
float2 image_p = image[i];
|
||||
if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
|
||||
++num_inliers;
|
||||
return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
|
||||
}
|
||||
|
||||
extern __shared__ float s_num_inliers[];
|
||||
s_num_inliers[threadIdx.x] = num_inliers;
|
||||
__syncthreads();
|
||||
|
||||
for (int step = blockDim.x / 2; step > 0; step >>= 1)
|
||||
__device__ __forceinline__ float sqr(float x)
|
||||
{
|
||||
if (threadIdx.x < step)
|
||||
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
|
||||
return x * x;
|
||||
}
|
||||
|
||||
__global__ void computeHypothesisScoresKernel(
|
||||
const int num_points, const float3* object, const float2* image,
|
||||
const float dist_threshold, int* g_num_inliers)
|
||||
{
|
||||
const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
|
||||
const float3 &transl_vec = ctransl_vectors[blockIdx.x];
|
||||
int num_inliers = 0;
|
||||
|
||||
for (int i = threadIdx.x; i < num_points; i += blockDim.x)
|
||||
{
|
||||
float3 p = object[i];
|
||||
p = make_float3(
|
||||
rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
|
||||
rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
|
||||
rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
|
||||
p.x /= p.z;
|
||||
p.y /= p.z;
|
||||
float2 image_p = image[i];
|
||||
if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
|
||||
++num_inliers;
|
||||
}
|
||||
|
||||
extern __shared__ float s_num_inliers[];
|
||||
s_num_inliers[threadIdx.x] = num_inliers;
|
||||
__syncthreads();
|
||||
|
||||
for (int step = blockDim.x / 2; step > 0; step >>= 1)
|
||||
{
|
||||
if (threadIdx.x < step)
|
||||
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
g_num_inliers[blockIdx.x] = s_num_inliers[0];
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
g_num_inliers[blockIdx.x] = s_num_inliers[0];
|
||||
}
|
||||
void computeHypothesisScores(
|
||||
const int num_hypotheses, const int num_points, const float* rot_matrices,
|
||||
const float3* transl_vectors, const float3* object, const float2* image,
|
||||
const float dist_threshold, int* hypothesis_scores)
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
|
||||
|
||||
void computeHypothesisScores(
|
||||
const int num_hypotheses, const int num_points, const float* rot_matrices,
|
||||
const float3* transl_vectors, const float3* object, const float2* image,
|
||||
const float dist_threshold, int* hypothesis_scores)
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
|
||||
dim3 threads(256);
|
||||
dim3 grid(num_hypotheses);
|
||||
int smem_size = threads.x * sizeof(float);
|
||||
|
||||
dim3 threads(256);
|
||||
dim3 grid(num_hypotheses);
|
||||
int smem_size = threads.x * sizeof(float);
|
||||
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
|
||||
num_points, object, image, dist_threshold, hypothesis_scores);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
|
||||
num_points, object, image, dist_threshold, hypothesis_scores);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
} // namespace solvepnp_ransac
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
} // namespace solvepnp_ransac
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -44,450 +44,448 @@
|
||||
#include <algorithm>
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace canny {
|
||||
|
||||
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
__shared__ int smem[16][18];
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i < rows)
|
||||
namespace canny
|
||||
{
|
||||
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
|
||||
if (threadIdx.x == 0)
|
||||
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
|
||||
{
|
||||
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
|
||||
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
|
||||
}
|
||||
__syncthreads();
|
||||
__shared__ int smem[16][18];
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
}
|
||||
}
|
||||
}
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
struct L1
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return ::abs(x) + ::abs(y);
|
||||
}
|
||||
};
|
||||
struct L2
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return ::sqrtf(x * x + y * y);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
|
||||
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
__shared__ int sdx[18][16];
|
||||
__shared__ int sdy[18][16];
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
|
||||
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
|
||||
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
|
||||
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
|
||||
|
||||
dx.ptr(i)[j] = x;
|
||||
dy.ptr(i)[j] = y;
|
||||
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i < rows && j < cols)
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CANNY_SHIFT 15
|
||||
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
|
||||
|
||||
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
|
||||
{
|
||||
__shared__ float smem[18][18];
|
||||
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
|
||||
const int tid = threadIdx.y * 16 + threadIdx.x;
|
||||
const int lx = tid % 18;
|
||||
const int ly = tid / 18;
|
||||
|
||||
if (ly < 14)
|
||||
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
|
||||
|
||||
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
|
||||
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows && j < cols)
|
||||
{
|
||||
int x = dx.ptr(i)[j];
|
||||
int y = dy.ptr(i)[j];
|
||||
const int s = (x ^ y) < 0 ? -1 : 1;
|
||||
const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
x = ::abs(x);
|
||||
y = ::abs(y);
|
||||
|
||||
// 0 - the pixel can not belong to an edge
|
||||
// 1 - the pixel might belong to an edge
|
||||
// 2 - the pixel does belong to an edge
|
||||
int edge_type = 0;
|
||||
|
||||
if (m > low_thresh)
|
||||
{
|
||||
const int tg22x = x * TG22;
|
||||
const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
|
||||
|
||||
y <<= CANNY_SHIFT;
|
||||
|
||||
if (y < tg22x)
|
||||
if (i < rows)
|
||||
{
|
||||
if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else if( y > tg67x )
|
||||
{
|
||||
if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
}
|
||||
|
||||
map.ptr(i + 1)[j + 1] = edge_type;
|
||||
}
|
||||
}
|
||||
|
||||
#undef CANNY_SHIFT
|
||||
#undef TG22
|
||||
|
||||
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__device__ unsigned int counter = 0;
|
||||
|
||||
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 120
|
||||
|
||||
__shared__ int smem[18][18];
|
||||
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
|
||||
const int tid = threadIdx.y * 16 + threadIdx.x;
|
||||
const int lx = tid % 18;
|
||||
const int ly = tid / 18;
|
||||
|
||||
if (ly < 14)
|
||||
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
|
||||
|
||||
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
|
||||
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows && j < cols)
|
||||
{
|
||||
int n;
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 16; ++k)
|
||||
{
|
||||
n = 0;
|
||||
|
||||
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
|
||||
}
|
||||
|
||||
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
map.ptr(i + 1)[j + 1] = e;
|
||||
|
||||
n = 0;
|
||||
|
||||
if (e == 2)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
{
|
||||
const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
|
||||
st[ind] = make_ushort2(j + 1, i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
|
||||
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
|
||||
|
||||
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 120
|
||||
|
||||
const int stack_size = 512;
|
||||
|
||||
__shared__ unsigned int s_counter;
|
||||
__shared__ unsigned int s_ind;
|
||||
__shared__ ushort2 s_st[stack_size];
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
s_counter = 0;
|
||||
__syncthreads();
|
||||
|
||||
int ind = blockIdx.y * gridDim.x + blockIdx.x;
|
||||
|
||||
if (ind < count)
|
||||
{
|
||||
ushort2 pos = st1[ind];
|
||||
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
|
||||
{
|
||||
if (threadIdx.x < 8)
|
||||
{
|
||||
pos.x += c_dx[threadIdx.x];
|
||||
pos.y += c_dy[threadIdx.x];
|
||||
|
||||
if (map.ptr(pos.y)[pos.x] == 1)
|
||||
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
map.ptr(pos.y)[pos.x] = 2;
|
||||
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
|
||||
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
ind = atomicInc(&s_counter, (unsigned int)(-1));
|
||||
|
||||
s_st[ind] = pos;
|
||||
if (j < cols)
|
||||
{
|
||||
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
struct L1
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return ::abs(x) + ::abs(y);
|
||||
}
|
||||
};
|
||||
struct L2
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return ::sqrtf(x * x + y * y);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
|
||||
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
__shared__ int sdx[18][16];
|
||||
__shared__ int sdy[18][16];
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
|
||||
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
|
||||
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
|
||||
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
|
||||
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
|
||||
|
||||
dx.ptr(i)[j] = x;
|
||||
dy.ptr(i)[j] = y;
|
||||
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i < rows && j < cols)
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CANNY_SHIFT 15
|
||||
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
|
||||
|
||||
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
|
||||
{
|
||||
__shared__ float smem[18][18];
|
||||
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
|
||||
const int tid = threadIdx.y * 16 + threadIdx.x;
|
||||
const int lx = tid % 18;
|
||||
const int ly = tid / 18;
|
||||
|
||||
if (ly < 14)
|
||||
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
|
||||
|
||||
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
|
||||
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
|
||||
if (i < rows && j < cols)
|
||||
{
|
||||
const int subTaskIdx = threadIdx.x >> 3;
|
||||
const int portion = ::min(s_counter, blockDim.x >> 3);
|
||||
int x = dx.ptr(i)[j];
|
||||
int y = dy.ptr(i)[j];
|
||||
const int s = (x ^ y) < 0 ? -1 : 1;
|
||||
const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
pos.x = pos.y = 0;
|
||||
x = ::abs(x);
|
||||
y = ::abs(y);
|
||||
|
||||
if (subTaskIdx < portion)
|
||||
pos = s_st[s_counter - 1 - subTaskIdx];
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
s_counter -= portion;
|
||||
__syncthreads();
|
||||
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
|
||||
// 0 - the pixel can not belong to an edge
|
||||
// 1 - the pixel might belong to an edge
|
||||
// 2 - the pixel does belong to an edge
|
||||
int edge_type = 0;
|
||||
|
||||
if (m > low_thresh)
|
||||
{
|
||||
pos.x += c_dx[threadIdx.x & 7];
|
||||
pos.y += c_dy[threadIdx.x & 7];
|
||||
const int tg22x = x * TG22;
|
||||
const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
|
||||
|
||||
if (map.ptr(pos.y)[pos.x] == 1)
|
||||
y <<= CANNY_SHIFT;
|
||||
|
||||
if (y < tg22x)
|
||||
{
|
||||
map.ptr(pos.y)[pos.x] = 2;
|
||||
|
||||
ind = atomicInc(&s_counter, (unsigned int)(-1));
|
||||
|
||||
s_st[ind] = pos;
|
||||
if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else if( y > tg67x )
|
||||
{
|
||||
if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (s_counter > 0)
|
||||
{
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
ind = atomicAdd(&counter, s_counter);
|
||||
s_ind = ind - s_counter;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
ind = s_ind;
|
||||
|
||||
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
|
||||
{
|
||||
st2[ind + i] = s_st[i];
|
||||
}
|
||||
|
||||
map.ptr(i + 1)[j + 1] = edge_type;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
#undef CANNY_SHIFT
|
||||
#undef TG22
|
||||
|
||||
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
|
||||
{
|
||||
void* counter_ptr;
|
||||
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
|
||||
|
||||
unsigned int count;
|
||||
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
while (count > 0)
|
||||
{
|
||||
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
|
||||
calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
dim3 block(128, 1, 1);
|
||||
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
|
||||
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
__device__ unsigned int counter = 0;
|
||||
|
||||
std::swap(st1, st2);
|
||||
}
|
||||
}
|
||||
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 120
|
||||
|
||||
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
|
||||
{
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
__shared__ int smem[18][18];
|
||||
|
||||
if (i < rows && j < cols)
|
||||
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
|
||||
}
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
|
||||
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
const int tid = threadIdx.y * 16 + threadIdx.x;
|
||||
const int lx = tid % 18;
|
||||
const int ly = tid / 18;
|
||||
|
||||
getEdges<<<grid, block>>>(map, dst, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
if (ly < 14)
|
||||
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
|
||||
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
|
||||
|
||||
} // namespace canny
|
||||
__syncthreads();
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
if (i < rows && j < cols)
|
||||
{
|
||||
int n;
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 16; ++k)
|
||||
{
|
||||
n = 0;
|
||||
|
||||
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
|
||||
}
|
||||
|
||||
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
map.ptr(i + 1)[j + 1] = e;
|
||||
|
||||
n = 0;
|
||||
|
||||
if (e == 2)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
{
|
||||
const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
|
||||
st[ind] = make_ushort2(j + 1, i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
|
||||
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
|
||||
|
||||
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 120
|
||||
|
||||
const int stack_size = 512;
|
||||
|
||||
__shared__ unsigned int s_counter;
|
||||
__shared__ unsigned int s_ind;
|
||||
__shared__ ushort2 s_st[stack_size];
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
s_counter = 0;
|
||||
__syncthreads();
|
||||
|
||||
int ind = blockIdx.y * gridDim.x + blockIdx.x;
|
||||
|
||||
if (ind < count)
|
||||
{
|
||||
ushort2 pos = st1[ind];
|
||||
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
|
||||
{
|
||||
if (threadIdx.x < 8)
|
||||
{
|
||||
pos.x += c_dx[threadIdx.x];
|
||||
pos.y += c_dy[threadIdx.x];
|
||||
|
||||
if (map.ptr(pos.y)[pos.x] == 1)
|
||||
{
|
||||
map.ptr(pos.y)[pos.x] = 2;
|
||||
|
||||
ind = atomicInc(&s_counter, (unsigned int)(-1));
|
||||
|
||||
s_st[ind] = pos;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
|
||||
{
|
||||
const int subTaskIdx = threadIdx.x >> 3;
|
||||
const int portion = ::min(s_counter, blockDim.x >> 3);
|
||||
|
||||
pos.x = pos.y = 0;
|
||||
|
||||
if (subTaskIdx < portion)
|
||||
pos = s_st[s_counter - 1 - subTaskIdx];
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
s_counter -= portion;
|
||||
__syncthreads();
|
||||
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
|
||||
{
|
||||
pos.x += c_dx[threadIdx.x & 7];
|
||||
pos.y += c_dy[threadIdx.x & 7];
|
||||
|
||||
if (map.ptr(pos.y)[pos.x] == 1)
|
||||
{
|
||||
map.ptr(pos.y)[pos.x] = 2;
|
||||
|
||||
ind = atomicInc(&s_counter, (unsigned int)(-1));
|
||||
|
||||
s_st[ind] = pos;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (s_counter > 0)
|
||||
{
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
ind = atomicAdd(&counter, s_counter);
|
||||
s_ind = ind - s_counter;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
ind = s_ind;
|
||||
|
||||
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
|
||||
{
|
||||
st2[ind + i] = s_st[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
|
||||
{
|
||||
void* counter_ptr;
|
||||
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
|
||||
|
||||
unsigned int count;
|
||||
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
|
||||
while (count > 0)
|
||||
{
|
||||
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
|
||||
|
||||
dim3 block(128, 1, 1);
|
||||
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
|
||||
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
|
||||
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
|
||||
std::swap(st1, st2);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
|
||||
{
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
|
||||
if (i < rows && j < cols)
|
||||
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
|
||||
}
|
||||
|
||||
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
getEdges<<<grid, block>>>(map, dst, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
} // namespace canny
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -44,181 +44,181 @@
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
#include "opencv2/gpu/device/color.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_x = 8 };
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_x = 8 };
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
|
||||
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \
|
||||
@ -226,7 +226,7 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
|
||||
traits::functor_type functor = traits::create_functor(); \
|
||||
typedef typename traits::functor_type::argument_type src_t; \
|
||||
typedef typename traits::functor_type::result_type dst_t; \
|
||||
OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \
|
||||
::cv::gpu::device::transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
|
||||
@ -243,138 +243,137 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
|
||||
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
|
||||
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
|
||||
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
|
||||
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
|
||||
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
|
||||
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -47,203 +47,201 @@
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
#define MAX_KERNEL_SIZE 16
|
||||
#define BLOCK_DIM_X 16
|
||||
#define BLOCK_DIM_Y 4
|
||||
#define RESULT_STEPS 8
|
||||
#define HALO_STEPS 1
|
||||
|
||||
namespace column_filter {
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
void loadKernel(const float kernel[], int ksize)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
|
||||
}
|
||||
#define MAX_KERNEL_SIZE 16
|
||||
#define BLOCK_DIM_X 16
|
||||
#define BLOCK_DIM_Y 4
|
||||
#define RESULT_STEPS 8
|
||||
#define HALO_STEPS 1
|
||||
|
||||
template <int KERNEL_SIZE, typename T, typename D, typename B>
|
||||
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
|
||||
|
||||
//Offset to the upper halo edge
|
||||
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
|
||||
const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;
|
||||
|
||||
if (x < src.cols)
|
||||
namespace column_filter
|
||||
{
|
||||
const T* src_col = src.ptr() + x;
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
|
||||
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
|
||||
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for(int i = 0; i < HALO_STEPS; ++i)
|
||||
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
|
||||
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
|
||||
void loadKernel(const float kernel[], int ksize)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for(int j = 0; j < KERNEL_SIZE; ++j)
|
||||
sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];
|
||||
|
||||
int dstY = y + i * BLOCK_DIM_Y;
|
||||
|
||||
if (dstY < src.rows)
|
||||
dst.ptr(dstY)[x] = saturate_cast<D>(sum);
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int ksize, typename T, typename D, template<typename> class B>
|
||||
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
|
||||
template <int KERNEL_SIZE, typename T, typename D, typename B>
|
||||
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
B<T> b(src.rows);
|
||||
__shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
|
||||
|
||||
linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
//Offset to the upper halo edge
|
||||
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
|
||||
const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
if (x < src.cols)
|
||||
{
|
||||
const T* src_col = src.ptr() + x;
|
||||
|
||||
template <typename T, typename D>
|
||||
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
|
||||
static const caller_t callers[5][17] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColWrap>,
|
||||
//Main data
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
|
||||
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
|
||||
|
||||
//Upper halo
|
||||
#pragma unroll
|
||||
for(int i = 0; i < HALO_STEPS; ++i)
|
||||
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);
|
||||
|
||||
//Lower halo
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
|
||||
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for(int j = 0; j < KERNEL_SIZE; ++j)
|
||||
sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];
|
||||
|
||||
int dstY = y + i * BLOCK_DIM_Y;
|
||||
|
||||
if (dstY < src.rows)
|
||||
dst.ptr(dstY)[x] = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
loadKernel(kernel, ksize);
|
||||
|
||||
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
|
||||
}
|
||||
template <int ksize, typename T, typename D, template<typename> class B>
|
||||
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
|
||||
|
||||
template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
B<T> b(src.rows);
|
||||
|
||||
} // namespace column_filter
|
||||
linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T, typename D>
|
||||
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
|
||||
static const caller_t callers[5][17] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColConstant>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColReflect>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearColumnFilter_caller<1 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<2 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<3 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<4 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<5 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<6 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<7 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<8 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<9 , T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<10, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<11, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<12, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<13, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<14, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<15, T, D, BrdColWrap>,
|
||||
linearColumnFilter_caller<16, T, D, BrdColWrap>,
|
||||
}
|
||||
};
|
||||
|
||||
loadKernel(kernel, ksize);
|
||||
|
||||
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
|
||||
}
|
||||
|
||||
template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
} // namespace column_filter
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -43,87 +43,85 @@
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc {
|
||||
|
||||
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
dst.ptr(y)[x] = src(y - top, x - left);
|
||||
}
|
||||
|
||||
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,
|
||||
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
|
||||
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
|
||||
|
||||
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,
|
||||
const T* borderValue, cudaStream_t stream)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type vec_type;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[5] =
|
||||
namespace imgproc
|
||||
{
|
||||
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
|
||||
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
|
||||
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
|
||||
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
|
||||
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
|
||||
};
|
||||
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
|
||||
}
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
dst.ptr(y)[x] = src(y - top, x - left);
|
||||
}
|
||||
|
||||
template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
|
||||
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,
|
||||
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
|
||||
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
|
||||
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
|
||||
|
||||
template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
|
||||
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
|
||||
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,
|
||||
const T* borderValue, cudaStream_t stream)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type vec_type;
|
||||
|
||||
template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
|
||||
|
||||
} // namespace imgproc
|
||||
static const caller_t callers[5] =
|
||||
{
|
||||
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
|
||||
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
|
||||
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
|
||||
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
|
||||
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
|
||||
};
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
|
||||
}
|
||||
|
||||
template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
|
||||
|
||||
//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
|
||||
|
||||
template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
|
||||
|
||||
template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
|
||||
|
||||
//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
|
||||
|
||||
template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
|
||||
} // namespace imgproc
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -45,177 +45,175 @@
|
||||
#include "opencv2/gpu/device/utility.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
#define UINT_BITS 32U
|
||||
|
||||
#define UINT_BITS 32U
|
||||
//Warps == subhistograms per threadblock
|
||||
#define WARP_COUNT 6
|
||||
|
||||
//Warps == subhistograms per threadblock
|
||||
#define WARP_COUNT 6
|
||||
//Threadblock size
|
||||
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
|
||||
#define HISTOGRAM256_BIN_COUNT 256
|
||||
|
||||
//Threadblock size
|
||||
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
|
||||
#define HISTOGRAM256_BIN_COUNT 256
|
||||
//Shared memory per threadblock
|
||||
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
|
||||
|
||||
//Shared memory per threadblock
|
||||
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
|
||||
#define PARTIAL_HISTOGRAM256_COUNT 240
|
||||
|
||||
#define PARTIAL_HISTOGRAM256_COUNT 240
|
||||
#define MERGE_THREADBLOCK_SIZE 256
|
||||
|
||||
#define MERGE_THREADBLOCK_SIZE 256
|
||||
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
|
||||
|
||||
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
|
||||
|
||||
namespace hist {
|
||||
|
||||
#if (!USE_SMEM_ATOMICS)
|
||||
|
||||
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
|
||||
|
||||
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
|
||||
namespace hist
|
||||
{
|
||||
uint count;
|
||||
do
|
||||
#if (!USE_SMEM_ATOMICS)
|
||||
|
||||
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
|
||||
|
||||
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
uint count;
|
||||
do
|
||||
{
|
||||
count = s_WarpHist[data] & TAG_MASK;
|
||||
count = threadTag | (count + 1);
|
||||
s_WarpHist[data] = count;
|
||||
} while (s_WarpHist[data] != count);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define TAG_MASK 0xFFFFFFFFU
|
||||
|
||||
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
atomicAdd(s_WarpHist + data, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
|
||||
{
|
||||
count = s_WarpHist[data] & TAG_MASK;
|
||||
count = threadTag | (count + 1);
|
||||
s_WarpHist[data] = count;
|
||||
} while (s_WarpHist[data] != count);
|
||||
}
|
||||
uint x = pos_x << 2;
|
||||
|
||||
#else
|
||||
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
|
||||
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
|
||||
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
|
||||
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
|
||||
}
|
||||
|
||||
#define TAG_MASK 0xFFFFFFFFU
|
||||
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
|
||||
{
|
||||
//Per-warp subhistogram storage
|
||||
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
|
||||
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
|
||||
|
||||
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
atomicAdd(s_WarpHist + data, 1);
|
||||
}
|
||||
//Clear shared memory storage for current threadblock before processing
|
||||
#pragma unroll
|
||||
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
|
||||
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
|
||||
|
||||
#endif
|
||||
//Cycle through the entire data set, update subhistograms for each warp
|
||||
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
|
||||
|
||||
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
|
||||
{
|
||||
uint x = pos_x << 2;
|
||||
__syncthreads();
|
||||
const uint colsui = d_Data.step / sizeof(uint);
|
||||
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
|
||||
{
|
||||
uint pos_y = pos / colsui;
|
||||
uint pos_x = pos % colsui;
|
||||
uint data = d_Data.ptr(pos_y)[pos_x];
|
||||
addWord(s_WarpHist, data, tag, pos_x, cols);
|
||||
}
|
||||
|
||||
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
|
||||
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
|
||||
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
|
||||
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
|
||||
}
|
||||
//Merge per-warp histograms into per-block and write to global memory
|
||||
__syncthreads();
|
||||
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
|
||||
{
|
||||
uint sum = 0;
|
||||
|
||||
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
|
||||
{
|
||||
//Per-warp subhistogram storage
|
||||
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
|
||||
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
|
||||
for (uint i = 0; i < WARP_COUNT; i++)
|
||||
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
|
||||
|
||||
//Clear shared memory storage for current threadblock before processing
|
||||
#pragma unroll
|
||||
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
|
||||
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
|
||||
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
//Cycle through the entire data set, update subhistograms for each warp
|
||||
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge histogram256() output
|
||||
// Run one threadblock per bin; each threadblock adds up the same bin counter
|
||||
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
|
||||
// takes only a fraction of total processing time
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__syncthreads();
|
||||
const uint colsui = d_Data.step / sizeof(uint);
|
||||
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
|
||||
{
|
||||
uint pos_y = pos / colsui;
|
||||
uint pos_x = pos % colsui;
|
||||
uint data = d_Data.ptr(pos_y)[pos_x];
|
||||
addWord(s_WarpHist, data, tag, pos_x, cols);
|
||||
}
|
||||
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
|
||||
{
|
||||
uint sum = 0;
|
||||
|
||||
//Merge per-warp histograms into per-block and write to global memory
|
||||
__syncthreads();
|
||||
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
|
||||
{
|
||||
uint sum = 0;
|
||||
#pragma unroll
|
||||
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
|
||||
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
|
||||
|
||||
for (uint i = 0; i < WARP_COUNT; i++)
|
||||
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
|
||||
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
|
||||
data[threadIdx.x] = sum;
|
||||
|
||||
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
|
||||
}
|
||||
}
|
||||
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
__syncthreads();
|
||||
if(threadIdx.x < stride)
|
||||
data[threadIdx.x] += data[threadIdx.x + stride];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge histogram256() output
|
||||
// Run one threadblock per bin; each threadblock adds up the same bin counter
|
||||
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
|
||||
// takes only a fraction of total processing time
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
if(threadIdx.x == 0)
|
||||
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
|
||||
}
|
||||
|
||||
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
|
||||
{
|
||||
uint sum = 0;
|
||||
void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
|
||||
{
|
||||
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
|
||||
DevMem2D_<uint>(src),
|
||||
buf,
|
||||
static_cast<uint>(src.rows * src.step / sizeof(uint)),
|
||||
src.cols);
|
||||
|
||||
#pragma unroll
|
||||
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
|
||||
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
|
||||
data[threadIdx.x] = sum;
|
||||
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
|
||||
|
||||
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
__syncthreads();
|
||||
if(threadIdx.x < stride)
|
||||
data[threadIdx.x] += data[threadIdx.x + stride];
|
||||
}
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
|
||||
}
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
|
||||
{
|
||||
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
|
||||
DevMem2D_<uint>(src),
|
||||
buf,
|
||||
static_cast<uint>(src.rows * src.step / sizeof(uint)),
|
||||
src.cols);
|
||||
__constant__ int c_lut[256];
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
|
||||
if (x < src.cols && y < src.rows)
|
||||
{
|
||||
const uchar val = src.ptr(y)[x];
|
||||
const int lut = c_lut[val];
|
||||
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
|
||||
}
|
||||
}
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(16, 16);
|
||||
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
|
||||
|
||||
__constant__ int c_lut[256];
|
||||
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < src.cols && y < src.rows)
|
||||
{
|
||||
const uchar val = src.ptr(y)[x];
|
||||
const int lut = c_lut[val];
|
||||
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
|
||||
}
|
||||
}
|
||||
|
||||
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(16, 16);
|
||||
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
|
||||
|
||||
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
} // namespace hist
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
} // namespace hist
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -50,7 +50,7 @@
|
||||
#include "safe_call.hpp"
|
||||
|
||||
#ifndef CV_PI
|
||||
#define CV_PI 3.1415926535897932384626433832795f
|
||||
#define CV_PI 3.1415926535897932384626433832795
|
||||
#endif
|
||||
|
||||
#ifndef CV_PI_F
|
||||
@ -61,27 +61,21 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
|
||||
#define END_OPENCV_DEVICE_NAMESPACE }}}
|
||||
#define OPENCV_DEVICE_NAMESPACE ::cv::gpu::device
|
||||
#define OPENCV_DEVICE_NAMESPACE_ ::cv::gpu::device::
|
||||
|
||||
#ifdef __CUDACC__
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short ushort;
|
||||
typedef signed char schar;
|
||||
typedef unsigned int uint;
|
||||
|
||||
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
}
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short ushort;
|
||||
typedef signed char schar;
|
||||
typedef unsigned int uint;
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
|
||||
{
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif
|
||||
|
||||
@ -102,87 +96,6 @@ namespace cv { namespace gpu
|
||||
|
||||
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
|
||||
|
||||
/*template<class T> static inline void uploadConstant(const char* name, const T& value)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) );
|
||||
}
|
||||
|
||||
template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) );
|
||||
} */
|
||||
|
||||
//template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img)
|
||||
//{
|
||||
// //!!!! const_cast is disabled!
|
||||
// //!!!! Please use constructor of 'class texture' instead.
|
||||
//
|
||||
// //textureReference* tex;
|
||||
// //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) );
|
||||
// //tex->normalized = normalized;
|
||||
// //tex->filterMode = filterMode;
|
||||
// //tex->addressMode[0] = addrMode;
|
||||
// //tex->addressMode[1] = addrMode;
|
||||
//
|
||||
// const textureReference* tex;
|
||||
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
|
||||
//
|
||||
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
//}
|
||||
|
||||
//static inline void unbindTexture(const char *name)
|
||||
//{
|
||||
// const textureReference* tex;
|
||||
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
|
||||
// cudaSafeCall( cudaUnbindTexture(tex) );
|
||||
//}
|
||||
|
||||
|
||||
|
||||
//class TextureBinder
|
||||
//{
|
||||
//public:
|
||||
// TextureBinder() : tex_(0) {}
|
||||
// template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
|
||||
// {
|
||||
// bind(tex, img);
|
||||
// }
|
||||
// template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
|
||||
// {
|
||||
// bind(tex_name, img);
|
||||
// }
|
||||
// ~TextureBinder() { unbind(); }
|
||||
//
|
||||
// template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
|
||||
// {
|
||||
// unbind();
|
||||
//
|
||||
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
//
|
||||
// tex_ = tex;
|
||||
// }
|
||||
// template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
|
||||
// {
|
||||
// const textureReference* tex;
|
||||
// cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
|
||||
// bind(tex, img);
|
||||
// }
|
||||
//
|
||||
// void unbind()
|
||||
// {
|
||||
// if (tex_)
|
||||
// {
|
||||
// cudaUnbindTexture(tex_);
|
||||
// tex_ = 0;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
//private:
|
||||
// const textureReference* tex_;
|
||||
//};
|
||||
|
||||
class NppStreamHandler
|
||||
{
|
||||
public:
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -42,174 +42,172 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace mathfunc {
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Cart <-> Polar
|
||||
|
||||
struct Nothing
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
|
||||
namespace mathfunc
|
||||
{
|
||||
}
|
||||
};
|
||||
struct Magnitude
|
||||
{
|
||||
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
|
||||
{
|
||||
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
|
||||
}
|
||||
};
|
||||
struct MagnitudeSqr
|
||||
{
|
||||
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
|
||||
{
|
||||
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
|
||||
}
|
||||
};
|
||||
struct Atan2
|
||||
{
|
||||
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
|
||||
{
|
||||
float angle = ::atan2f(y_data, x_data);
|
||||
angle += (angle < 0) * 2.0 * CV_PI;
|
||||
dst[y * dst_step + x] = scale * angle;
|
||||
}
|
||||
};
|
||||
template <typename Mag, typename Angle>
|
||||
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
|
||||
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Cart <-> Polar
|
||||
|
||||
if (x < width && y < height)
|
||||
{
|
||||
float x_data = xptr[y * x_step + x];
|
||||
float y_data = yptr[y * y_step + x];
|
||||
|
||||
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
|
||||
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
|
||||
}
|
||||
}
|
||||
|
||||
struct NonEmptyMag
|
||||
{
|
||||
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
|
||||
{
|
||||
return mag[y * mag_step + x];
|
||||
}
|
||||
};
|
||||
struct EmptyMag
|
||||
{
|
||||
static __device__ __forceinline__ float get(const float*, size_t, int, int)
|
||||
{
|
||||
return 1.0f;
|
||||
}
|
||||
};
|
||||
template <typename Mag>
|
||||
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
|
||||
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < width && y < height)
|
||||
{
|
||||
float mag_data = Mag::get(mag, mag_step, x, y);
|
||||
float angle_data = angle[y * angle_step + x];
|
||||
float sin_a, cos_a;
|
||||
|
||||
::sincosf(scale * angle_data, &sin_a, &cos_a);
|
||||
|
||||
xptr[y * x_step + x] = mag_data * cos_a;
|
||||
yptr[y * y_step + x] = mag_data * sin_a;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Mag, typename Angle>
|
||||
void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(x.cols, threads.x);
|
||||
grid.y = divUp(x.rows, threads.y);
|
||||
|
||||
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
|
||||
|
||||
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
|
||||
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
|
||||
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
|
||||
static const caller_t callers[2][2][2] =
|
||||
{
|
||||
struct Nothing
|
||||
{
|
||||
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
|
||||
{
|
||||
cartToPolar_caller<Magnitude, Atan2>,
|
||||
cartToPolar_caller<Magnitude, Nothing>
|
||||
},
|
||||
{
|
||||
cartToPolar_caller<MagnitudeSqr, Atan2>,
|
||||
cartToPolar_caller<MagnitudeSqr, Nothing>,
|
||||
}
|
||||
},
|
||||
};
|
||||
struct Magnitude
|
||||
{
|
||||
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
|
||||
{
|
||||
cartToPolar_caller<Nothing, Atan2>,
|
||||
cartToPolar_caller<Nothing, Nothing>
|
||||
},
|
||||
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
|
||||
}
|
||||
};
|
||||
struct MagnitudeSqr
|
||||
{
|
||||
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
|
||||
{
|
||||
cartToPolar_caller<Nothing, Atan2>,
|
||||
cartToPolar_caller<Nothing, Nothing>,
|
||||
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
|
||||
}
|
||||
};
|
||||
struct Atan2
|
||||
{
|
||||
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
|
||||
{
|
||||
float angle = ::atan2f(y_data, x_data);
|
||||
angle += (angle < 0) * 2.0 * CV_PI;
|
||||
dst[y * dst_step + x] = scale * angle;
|
||||
}
|
||||
};
|
||||
template <typename Mag, typename Angle>
|
||||
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
|
||||
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < width && y < height)
|
||||
{
|
||||
float x_data = xptr[y * x_step + x];
|
||||
float y_data = yptr[y * y_step + x];
|
||||
|
||||
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
|
||||
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
|
||||
}
|
||||
struct NonEmptyMag
|
||||
{
|
||||
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
|
||||
{
|
||||
return mag[y * mag_step + x];
|
||||
}
|
||||
};
|
||||
struct EmptyMag
|
||||
{
|
||||
static __device__ __forceinline__ float get(const float*, size_t, int, int)
|
||||
{
|
||||
return 1.0f;
|
||||
}
|
||||
};
|
||||
template <typename Mag>
|
||||
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
|
||||
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
template <typename Mag>
|
||||
void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
if (x < width && y < height)
|
||||
{
|
||||
float mag_data = Mag::get(mag, mag_step, x, y);
|
||||
float angle_data = angle[y * angle_step + x];
|
||||
float sin_a, cos_a;
|
||||
|
||||
grid.x = divUp(mag.cols, threads.x);
|
||||
grid.y = divUp(mag.rows, threads.y);
|
||||
|
||||
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
|
||||
::sincosf(scale * angle_data, &sin_a, &cos_a);
|
||||
|
||||
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
|
||||
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
xptr[y * x_step + x] = mag_data * cos_a;
|
||||
yptr[y * y_step + x] = mag_data * sin_a;
|
||||
}
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
template <typename Mag, typename Angle>
|
||||
void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
|
||||
static const caller_t callers[2] =
|
||||
{
|
||||
polarToCart_caller<NonEmptyMag>,
|
||||
polarToCart_caller<EmptyMag>
|
||||
};
|
||||
grid.x = divUp(x.cols, threads.x);
|
||||
grid.y = divUp(x.rows, threads.y);
|
||||
|
||||
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
|
||||
|
||||
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
|
||||
}
|
||||
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
|
||||
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
|
||||
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
} // namespace mathfunc
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
|
||||
static const caller_t callers[2][2][2] =
|
||||
{
|
||||
{
|
||||
{
|
||||
cartToPolar_caller<Magnitude, Atan2>,
|
||||
cartToPolar_caller<Magnitude, Nothing>
|
||||
},
|
||||
{
|
||||
cartToPolar_caller<MagnitudeSqr, Atan2>,
|
||||
cartToPolar_caller<MagnitudeSqr, Nothing>,
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
cartToPolar_caller<Nothing, Atan2>,
|
||||
cartToPolar_caller<Nothing, Nothing>
|
||||
},
|
||||
{
|
||||
cartToPolar_caller<Nothing, Atan2>,
|
||||
cartToPolar_caller<Nothing, Nothing>,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
|
||||
}
|
||||
|
||||
template <typename Mag>
|
||||
void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(mag.cols, threads.x);
|
||||
grid.y = divUp(mag.rows, threads.y);
|
||||
|
||||
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
|
||||
|
||||
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
|
||||
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
|
||||
static const caller_t callers[2] =
|
||||
{
|
||||
polarToCart_caller<NonEmptyMag>,
|
||||
polarToCart_caller<EmptyMag>
|
||||
};
|
||||
|
||||
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
|
||||
}
|
||||
} // namespace mathfunc
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -45,304 +45,303 @@
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
#include "opencv2/gpu/device/functional.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <typename T> struct shift_and_sizeof;
|
||||
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
|
||||
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
|
||||
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
|
||||
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
|
||||
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
|
||||
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
|
||||
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////// CopyTo /////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename T>
|
||||
__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
template <typename T> struct shift_and_sizeof;
|
||||
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
|
||||
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
|
||||
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
|
||||
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
|
||||
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
|
||||
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
|
||||
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
|
||||
|
||||
if ((x < cols * channels ) && (y < rows))
|
||||
if (mask[y * step_mask + x / channels] != 0)
|
||||
{
|
||||
size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
|
||||
mat_dst[idx] = mat_src[idx];
|
||||
}
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////// CopyTo /////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename T>
|
||||
void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
|
||||
{
|
||||
dim3 threadsPerBlock(16,16, 1);
|
||||
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
|
||||
|
||||
copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
|
||||
((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall ( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
|
||||
{
|
||||
typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
|
||||
|
||||
static CopyToFunc tab[8] =
|
||||
template<typename T>
|
||||
__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
|
||||
{
|
||||
copy_to_with_mask_run<unsigned char>,
|
||||
copy_to_with_mask_run<signed char>,
|
||||
copy_to_with_mask_run<unsigned short>,
|
||||
copy_to_with_mask_run<short>,
|
||||
copy_to_with_mask_run<int>,
|
||||
copy_to_with_mask_run<float>,
|
||||
copy_to_with_mask_run<double>,
|
||||
0
|
||||
};
|
||||
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
CopyToFunc func = tab[depth];
|
||||
|
||||
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
|
||||
|
||||
func(mat_src, mat_dst, mask, channels, stream);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////// SetTo //////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__constant__ uchar scalar_8u[4];
|
||||
__constant__ schar scalar_8s[4];
|
||||
__constant__ ushort scalar_16u[4];
|
||||
__constant__ short scalar_16s[4];
|
||||
__constant__ int scalar_32s[4];
|
||||
__constant__ float scalar_32f[4];
|
||||
__constant__ double scalar_64f[4];
|
||||
|
||||
template <typename T> __device__ __forceinline__ T readScalar(int i);
|
||||
template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
|
||||
template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
|
||||
template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
|
||||
template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
|
||||
template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
|
||||
template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
|
||||
template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
|
||||
|
||||
void writeScalar(const uchar* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
|
||||
}
|
||||
void writeScalar(const schar* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
|
||||
}
|
||||
void writeScalar(const ushort* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
|
||||
}
|
||||
void writeScalar(const short* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
|
||||
}
|
||||
void writeScalar(const int* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
|
||||
}
|
||||
void writeScalar(const float* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
|
||||
}
|
||||
void writeScalar(const double* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
|
||||
{
|
||||
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if ((x < cols * channels ) && (y < rows))
|
||||
{
|
||||
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
|
||||
mat[idx] = readScalar<T>(x % channels);
|
||||
if ((x < cols * channels ) && (y < rows))
|
||||
if (mask[y * step_mask + x / channels] != 0)
|
||||
{
|
||||
size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
|
||||
mat_dst[idx] = mat_src[idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
|
||||
{
|
||||
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
template<typename T>
|
||||
void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
|
||||
{
|
||||
dim3 threadsPerBlock(16,16, 1);
|
||||
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
|
||||
|
||||
if ((x < cols * channels ) && (y < rows))
|
||||
if (mask[y * step_mask + x / channels] != 0)
|
||||
copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
|
||||
((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall ( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
|
||||
{
|
||||
typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
|
||||
|
||||
static CopyToFunc tab[8] =
|
||||
{
|
||||
copy_to_with_mask_run<unsigned char>,
|
||||
copy_to_with_mask_run<signed char>,
|
||||
copy_to_with_mask_run<unsigned short>,
|
||||
copy_to_with_mask_run<short>,
|
||||
copy_to_with_mask_run<int>,
|
||||
copy_to_with_mask_run<float>,
|
||||
copy_to_with_mask_run<double>,
|
||||
0
|
||||
};
|
||||
|
||||
CopyToFunc func = tab[depth];
|
||||
|
||||
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
|
||||
|
||||
func(mat_src, mat_dst, mask, channels, stream);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////// SetTo //////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__constant__ uchar scalar_8u[4];
|
||||
__constant__ schar scalar_8s[4];
|
||||
__constant__ ushort scalar_16u[4];
|
||||
__constant__ short scalar_16s[4];
|
||||
__constant__ int scalar_32s[4];
|
||||
__constant__ float scalar_32f[4];
|
||||
__constant__ double scalar_64f[4];
|
||||
|
||||
template <typename T> __device__ __forceinline__ T readScalar(int i);
|
||||
template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
|
||||
template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
|
||||
template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
|
||||
template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
|
||||
template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
|
||||
template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
|
||||
template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
|
||||
|
||||
void writeScalar(const uchar* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
|
||||
}
|
||||
void writeScalar(const schar* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
|
||||
}
|
||||
void writeScalar(const ushort* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
|
||||
}
|
||||
void writeScalar(const short* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
|
||||
}
|
||||
void writeScalar(const int* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
|
||||
}
|
||||
void writeScalar(const float* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
|
||||
}
|
||||
void writeScalar(const double* vals)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
|
||||
{
|
||||
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if ((x < cols * channels ) && (y < rows))
|
||||
{
|
||||
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
|
||||
mat[idx] = readScalar<T>(x % channels);
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
|
||||
{
|
||||
writeScalar(scalar);
|
||||
|
||||
dim3 threadsPerBlock(32, 8, 1);
|
||||
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
|
||||
|
||||
set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall ( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
|
||||
{
|
||||
writeScalar(scalar);
|
||||
|
||||
dim3 threadsPerBlock(32, 8, 1);
|
||||
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
|
||||
|
||||
set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall ( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////// ConvertTo ////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T, typename D> struct Convertor : unary_function<T, D>
|
||||
{
|
||||
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
|
||||
|
||||
__device__ __forceinline__ D operator()(const T& src) const
|
||||
{
|
||||
return saturate_cast<D>(alpha * src + beta);
|
||||
}
|
||||
|
||||
const double alpha, beta;
|
||||
};
|
||||
template<typename T>
|
||||
__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
|
||||
{
|
||||
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
|
||||
if ((x < cols * channels ) && (y < rows))
|
||||
if (mask[y * step_mask + x / channels] != 0)
|
||||
{
|
||||
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
|
||||
mat[idx] = readScalar<T>(x % channels);
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
|
||||
{
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
|
||||
writeScalar(scalar);
|
||||
|
||||
dim3 threadsPerBlock(32, 8, 1);
|
||||
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
|
||||
|
||||
set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall ( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
|
||||
{
|
||||
enum { smart_shift = 8 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
writeScalar(scalar);
|
||||
|
||||
dim3 threadsPerBlock(32, 8, 1);
|
||||
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
|
||||
|
||||
set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall ( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
|
||||
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////// ConvertTo ////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T, typename D> struct Convertor : unary_function<T, D>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
|
||||
|
||||
__device__ __forceinline__ D operator()(const T& src) const
|
||||
{
|
||||
return saturate_cast<D>(alpha * src + beta);
|
||||
}
|
||||
|
||||
const double alpha, beta;
|
||||
};
|
||||
|
||||
template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
namespace detail
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 2 };
|
||||
};
|
||||
template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 8 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 2 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 2 };
|
||||
};
|
||||
|
||||
template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
|
||||
template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 2 };
|
||||
};
|
||||
|
||||
template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
|
||||
{
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
|
||||
{
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
|
||||
{
|
||||
};
|
||||
|
||||
template<typename T, typename D>
|
||||
void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
|
||||
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
|
||||
Convertor<T, D> op(alpha, beta);
|
||||
OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
|
||||
}
|
||||
|
||||
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
|
||||
cudaStream_t stream = 0)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
|
||||
cudaStream_t stream);
|
||||
|
||||
static const caller_t tab[8][8] =
|
||||
|
||||
template<typename T, typename D>
|
||||
void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
|
||||
{
|
||||
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
|
||||
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
|
||||
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
|
||||
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
|
||||
Convertor<T, D> op(alpha, beta);
|
||||
::cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
|
||||
}
|
||||
|
||||
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
|
||||
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
|
||||
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
|
||||
cudaStream_t stream = 0)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
|
||||
cudaStream_t stream);
|
||||
|
||||
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
|
||||
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
|
||||
static const caller_t tab[8][8] =
|
||||
{
|
||||
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
|
||||
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
|
||||
|
||||
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
|
||||
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
|
||||
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
|
||||
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
|
||||
|
||||
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
|
||||
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
|
||||
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
|
||||
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
|
||||
|
||||
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
|
||||
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
|
||||
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
|
||||
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
|
||||
|
||||
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
|
||||
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
|
||||
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
|
||||
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
|
||||
|
||||
{0,0,0,0,0,0,0,0}
|
||||
};
|
||||
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
|
||||
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
|
||||
|
||||
caller_t func = tab[sdepth][ddepth];
|
||||
if (!func)
|
||||
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
||||
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
|
||||
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
|
||||
|
||||
func(src, dst, alpha, beta, stream);
|
||||
}
|
||||
{0,0,0,0,0,0,0,0}
|
||||
};
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
caller_t func = tab[sdepth][ddepth];
|
||||
if (!func)
|
||||
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
||||
|
||||
func(src, dst, alpha, beta, stream);
|
||||
}
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -46,142 +46,140 @@
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc {
|
||||
|
||||
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y;
|
||||
|
||||
__shared__ value_type smem[256 + 4];
|
||||
|
||||
value_type sum;
|
||||
|
||||
const int src_y = 2*y;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
|
||||
|
||||
smem[2 + threadIdx.x] = sum;
|
||||
|
||||
if (threadIdx.x < 2)
|
||||
namespace imgproc
|
||||
{
|
||||
const int left_x = x - 2 + threadIdx.x;
|
||||
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y;
|
||||
|
||||
smem[threadIdx.x] = sum;
|
||||
}
|
||||
__shared__ value_type smem[256 + 4];
|
||||
|
||||
if (threadIdx.x > 253)
|
||||
{
|
||||
const int right_x = x + threadIdx.x + 2;
|
||||
value_type sum;
|
||||
|
||||
const int src_y = 2*y;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
|
||||
|
||||
smem[4 + threadIdx.x] = sum;
|
||||
}
|
||||
smem[2 + threadIdx.x] = sum;
|
||||
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 2)
|
||||
{
|
||||
const int left_x = x - 2 + threadIdx.x;
|
||||
|
||||
if (threadIdx.x < 128)
|
||||
{
|
||||
const int tid2 = threadIdx.x * 2;
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
smem[threadIdx.x] = sum;
|
||||
}
|
||||
|
||||
sum = sum + 0.0625f * smem[2 + tid2 - 2];
|
||||
sum = sum + 0.25f * smem[2 + tid2 - 1];
|
||||
sum = sum + 0.375f * smem[2 + tid2 ];
|
||||
sum = sum + 0.25f * smem[2 + tid2 + 1];
|
||||
sum = sum + 0.0625f * smem[2 + tid2 + 2];
|
||||
if (threadIdx.x > 253)
|
||||
{
|
||||
const int right_x = x + threadIdx.x + 2;
|
||||
|
||||
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
|
||||
|
||||
if (dst_x < dst_cols)
|
||||
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
|
||||
}
|
||||
}
|
||||
smem[4 + threadIdx.x] = sum;
|
||||
}
|
||||
|
||||
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(256);
|
||||
const dim3 grid(divUp(src.cols, block.x), dst.rows);
|
||||
__syncthreads();
|
||||
|
||||
B<T> b(src.rows, src.cols);
|
||||
if (threadIdx.x < 128)
|
||||
{
|
||||
const int tid2 = threadIdx.x * 2;
|
||||
|
||||
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
sum = sum + 0.0625f * smem[2 + tid2 - 2];
|
||||
sum = sum + 0.25f * smem[2 + tid2 - 1];
|
||||
sum = sum + 0.375f * smem[2 + tid2 ];
|
||||
sum = sum + 0.25f * smem[2 + tid2 + 1];
|
||||
sum = sum + 0.0625f * smem[2 + tid2 + 2];
|
||||
|
||||
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type type;
|
||||
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
|
||||
if (dst_x < dst_cols)
|
||||
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
|
||||
}
|
||||
}
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
|
||||
};
|
||||
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(256);
|
||||
const dim3 grid(divUp(src.cols, block.x), dst.rows);
|
||||
|
||||
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
|
||||
}
|
||||
B<T> b(src.rows, src.cols);
|
||||
|
||||
template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type type;
|
||||
|
||||
template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
|
||||
|
||||
template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
|
||||
};
|
||||
|
||||
template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
|
||||
}
|
||||
|
||||
} // namespace imgproc
|
||||
template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
} // namespace imgproc
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -46,137 +46,135 @@
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc {
|
||||
|
||||
template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
__shared__ T smem1[10][10];
|
||||
__shared__ value_type smem2[20][16];
|
||||
|
||||
value_type sum;
|
||||
|
||||
if (threadIdx.x < 10 && threadIdx.y < 10)
|
||||
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int tidx = threadIdx.x;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
|
||||
|
||||
smem2[2 + threadIdx.y][tidx] = sum;
|
||||
|
||||
if (threadIdx.y < 2)
|
||||
namespace imgproc
|
||||
{
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
|
||||
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
smem2[threadIdx.y][tidx] = sum;
|
||||
}
|
||||
__shared__ T smem1[10][10];
|
||||
__shared__ value_type smem2[20][16];
|
||||
|
||||
if (threadIdx.y > 13)
|
||||
{
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
value_type sum;
|
||||
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
|
||||
if (threadIdx.x < 10 && threadIdx.y < 10)
|
||||
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
|
||||
|
||||
smem2[4 + threadIdx.y][tidx] = sum;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
__syncthreads();
|
||||
const int tidx = threadIdx.x;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];
|
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
|
||||
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
|
||||
}
|
||||
smem2[2 + threadIdx.y][tidx] = sum;
|
||||
|
||||
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
if (threadIdx.y < 2)
|
||||
{
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
B<T> b(src.rows, src.cols);
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
|
||||
|
||||
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
smem2[threadIdx.y][tidx] = sum;
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
if (threadIdx.y > 13)
|
||||
{
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type type;
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
|
||||
|
||||
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
|
||||
smem2[4 + threadIdx.y][tidx] = sum;
|
||||
}
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
|
||||
};
|
||||
__syncthreads();
|
||||
|
||||
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
|
||||
}
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];
|
||||
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];
|
||||
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];
|
||||
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
|
||||
|
||||
template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
|
||||
}
|
||||
|
||||
template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
B<T> b(src.rows, src.cols);
|
||||
|
||||
template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
} // namespace imgproc
|
||||
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type type;
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
|
||||
};
|
||||
|
||||
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
|
||||
}
|
||||
|
||||
template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
} // namespace imgproc
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -47,208 +47,206 @@
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/filters.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc {
|
||||
|
||||
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
namespace imgproc
|
||||
{
|
||||
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
{
|
||||
const float xcoo = mapx.ptr(y)[x];
|
||||
const float ycoo = mapy.ptr(y)[x];
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
{
|
||||
const float xcoo = mapx.ptr(y)[x];
|
||||
const float ycoo = mapy.ptr(y)[x];
|
||||
|
||||
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
|
||||
}
|
||||
}
|
||||
|
||||
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
|
||||
const float* borderValue, cudaStream_t stream, int)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
|
||||
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
|
||||
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
|
||||
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
|
||||
|
||||
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
};
|
||||
|
||||
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
|
||||
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
|
||||
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
|
||||
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
|
||||
|
||||
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
|
||||
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
|
||||
struct tex_remap_ ## type ## _reader \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
typedef int index_type; \
|
||||
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
|
||||
{ \
|
||||
return tex2D(tex_remap_ ## type , x, y); \
|
||||
} \
|
||||
}; \
|
||||
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
|
||||
{ \
|
||||
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue, int cc) \
|
||||
{ \
|
||||
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
|
||||
dim3 block(32, cc >= 20 ? 8 : 4); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_remap_ ## type , src); \
|
||||
tex_remap_ ## type ##_reader texSrc; \
|
||||
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
|
||||
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
|
||||
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
|
||||
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
|
||||
cudaSafeCall( cudaGetLastError() ); \
|
||||
cudaSafeCall( cudaDeviceSynchronize() ); \
|
||||
} \
|
||||
}; \
|
||||
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
|
||||
{ \
|
||||
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*, int) \
|
||||
{ \
|
||||
dim3 block(32, 8); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_remap_ ## type , src); \
|
||||
tex_remap_ ## type ##_reader texSrc; \
|
||||
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
|
||||
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
|
||||
cudaSafeCall( cudaGetLastError() ); \
|
||||
cudaSafeCall( cudaDeviceSynchronize() ); \
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
|
||||
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
|
||||
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
|
||||
|
||||
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
|
||||
const float* borderValue, cudaStream_t stream, int cc)
|
||||
{
|
||||
if (stream == 0)
|
||||
RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);
|
||||
else
|
||||
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst,
|
||||
const float* borderValue, cudaStream_t stream, int cc);
|
||||
|
||||
static const caller_t callers[3][5] =
|
||||
{
|
||||
{
|
||||
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
|
||||
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
|
||||
RemapDispatcher<PointFilter, BrdConstant, T>::call,
|
||||
RemapDispatcher<PointFilter, BrdReflect, T>::call,
|
||||
RemapDispatcher<PointFilter, BrdWrap, T>::call
|
||||
},
|
||||
{
|
||||
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
|
||||
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
|
||||
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
|
||||
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
|
||||
RemapDispatcher<LinearFilter, BrdWrap, T>::call
|
||||
},
|
||||
{
|
||||
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
|
||||
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
|
||||
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
|
||||
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
|
||||
RemapDispatcher<CubicFilter, BrdWrap, T>::call
|
||||
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
|
||||
}
|
||||
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
|
||||
const float* borderValue, cudaStream_t stream, int)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
|
||||
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
|
||||
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
|
||||
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
|
||||
|
||||
//template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
};
|
||||
|
||||
template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
|
||||
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
|
||||
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
|
||||
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
|
||||
|
||||
//template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace imgproc
|
||||
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
|
||||
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
|
||||
struct tex_remap_ ## type ## _reader \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
typedef int index_type; \
|
||||
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
|
||||
{ \
|
||||
return tex2D(tex_remap_ ## type , x, y); \
|
||||
} \
|
||||
}; \
|
||||
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
|
||||
{ \
|
||||
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue, int cc) \
|
||||
{ \
|
||||
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
|
||||
dim3 block(32, cc >= 20 ? 8 : 4); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_remap_ ## type , src); \
|
||||
tex_remap_ ## type ##_reader texSrc; \
|
||||
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
|
||||
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
|
||||
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
|
||||
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
|
||||
cudaSafeCall( cudaGetLastError() ); \
|
||||
cudaSafeCall( cudaDeviceSynchronize() ); \
|
||||
} \
|
||||
}; \
|
||||
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
|
||||
{ \
|
||||
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*, int) \
|
||||
{ \
|
||||
dim3 block(32, 8); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_remap_ ## type , src); \
|
||||
tex_remap_ ## type ##_reader texSrc; \
|
||||
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
|
||||
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
|
||||
cudaSafeCall( cudaGetLastError() ); \
|
||||
cudaSafeCall( cudaDeviceSynchronize() ); \
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
|
||||
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
|
||||
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
|
||||
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
|
||||
|
||||
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
|
||||
const float* borderValue, cudaStream_t stream, int cc)
|
||||
{
|
||||
if (stream == 0)
|
||||
RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);
|
||||
else
|
||||
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst,
|
||||
const float* borderValue, cudaStream_t stream, int cc);
|
||||
|
||||
static const caller_t callers[3][5] =
|
||||
{
|
||||
{
|
||||
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
|
||||
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
|
||||
RemapDispatcher<PointFilter, BrdConstant, T>::call,
|
||||
RemapDispatcher<PointFilter, BrdReflect, T>::call,
|
||||
RemapDispatcher<PointFilter, BrdWrap, T>::call
|
||||
},
|
||||
{
|
||||
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
|
||||
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
|
||||
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
|
||||
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
|
||||
RemapDispatcher<LinearFilter, BrdWrap, T>::call
|
||||
},
|
||||
{
|
||||
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
|
||||
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
|
||||
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
|
||||
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
|
||||
RemapDispatcher<CubicFilter, BrdWrap, T>::call
|
||||
}
|
||||
};
|
||||
|
||||
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
|
||||
}
|
||||
|
||||
template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
|
||||
//template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
|
||||
template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
|
||||
template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
|
||||
//template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
|
||||
template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
//template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
} // namespace imgproc
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -47,219 +47,217 @@
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/filters.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc {
|
||||
|
||||
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
namespace imgproc
|
||||
{
|
||||
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
{
|
||||
const float xcoo = x / fx;
|
||||
const float ycoo = y / fy;
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
{
|
||||
const float xcoo = x / fx;
|
||||
const float ycoo = y / fy;
|
||||
|
||||
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
|
||||
}
|
||||
}
|
||||
template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
|
||||
}
|
||||
}
|
||||
template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
{
|
||||
const float xcoo = x / fx;
|
||||
const float ycoo = y / fy;
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
{
|
||||
const float xcoo = x / fx;
|
||||
const float ycoo = y / fy;
|
||||
|
||||
dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
|
||||
}
|
||||
}
|
||||
dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
|
||||
}
|
||||
}
|
||||
|
||||
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
BrdReplicate<T> brd(src.rows, src.cols);
|
||||
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
|
||||
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
|
||||
BrdReplicate<T> brd(src.rows, src.cols);
|
||||
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
|
||||
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
|
||||
|
||||
resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
};
|
||||
template <typename T> struct ResizeDispatcherStream<PointFilter, T>
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
};
|
||||
template <typename T> struct ResizeDispatcherStream<PointFilter, T>
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
BrdReplicate<T> brd(src.rows, src.cols);
|
||||
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
|
||||
BrdReplicate<T> brd(src.rows, src.cols);
|
||||
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
|
||||
|
||||
resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
};
|
||||
resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
}
|
||||
};
|
||||
|
||||
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
BrdReplicate<T> brd(src.rows, src.cols);
|
||||
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
|
||||
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
|
||||
BrdReplicate<T> brd(src.rows, src.cols);
|
||||
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
|
||||
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
|
||||
|
||||
resize<<<grid, block>>>(filter_src, fx, fy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
resize<<<grid, block>>>(filter_src, fx, fy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
|
||||
{
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
BrdReplicate<T> brd(src.rows, src.cols);
|
||||
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
|
||||
BrdReplicate<T> brd(src.rows, src.cols);
|
||||
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
|
||||
|
||||
resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
|
||||
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
|
||||
struct tex_resize_ ## type ## _reader \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
typedef int index_type; \
|
||||
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
|
||||
{ \
|
||||
return tex2D(tex_resize_ ## type , x, y); \
|
||||
} \
|
||||
}; \
|
||||
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
|
||||
{ \
|
||||
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
|
||||
{ \
|
||||
dim3 block(32, 8); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_resize_ ## type , src); \
|
||||
tex_resize_ ## type ##_reader texSrc; \
|
||||
Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
|
||||
resize<<<grid, block>>>(filter_src, fx, fy, dst); \
|
||||
cudaSafeCall( cudaGetLastError() ); \
|
||||
cudaSafeCall( cudaDeviceSynchronize() ); \
|
||||
} \
|
||||
}; \
|
||||
template <> struct ResizeDispatcherNonStream<PointFilter, type> \
|
||||
{ \
|
||||
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
|
||||
{ \
|
||||
dim3 block(32, 8); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_resize_ ## type , src); \
|
||||
tex_resize_ ## type ##_reader texSrc; \
|
||||
resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
|
||||
cudaSafeCall( cudaGetLastError() ); \
|
||||
cudaSafeCall( cudaDeviceSynchronize() ); \
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
|
||||
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
|
||||
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
|
||||
struct tex_resize_ ## type ## _reader \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
typedef int index_type; \
|
||||
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
|
||||
{ \
|
||||
return tex2D(tex_resize_ ## type , x, y); \
|
||||
} \
|
||||
}; \
|
||||
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
|
||||
{ \
|
||||
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
|
||||
{ \
|
||||
dim3 block(32, 8); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_resize_ ## type , src); \
|
||||
tex_resize_ ## type ##_reader texSrc; \
|
||||
Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
|
||||
resize<<<grid, block>>>(filter_src, fx, fy, dst); \
|
||||
cudaSafeCall( cudaGetLastError() ); \
|
||||
cudaSafeCall( cudaDeviceSynchronize() ); \
|
||||
} \
|
||||
}; \
|
||||
template <> struct ResizeDispatcherNonStream<PointFilter, type> \
|
||||
{ \
|
||||
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
|
||||
{ \
|
||||
dim3 block(32, 8); \
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
|
||||
bindTexture(&tex_resize_ ## type , src); \
|
||||
tex_resize_ ## type ##_reader texSrc; \
|
||||
resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
|
||||
cudaSafeCall( cudaGetLastError() ); \
|
||||
cudaSafeCall( cudaDeviceSynchronize() ); \
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
|
||||
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
|
||||
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
|
||||
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
|
||||
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
|
||||
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
|
||||
|
||||
template <template <typename> class Filter, typename T> struct ResizeDispatcher
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
if (stream == 0)
|
||||
ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
|
||||
else
|
||||
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
|
||||
}
|
||||
};
|
||||
template <template <typename> class Filter, typename T> struct ResizeDispatcher
|
||||
{
|
||||
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
{
|
||||
if (stream == 0)
|
||||
ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
|
||||
else
|
||||
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[3] =
|
||||
{
|
||||
ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
|
||||
};
|
||||
static const caller_t callers[3] =
|
||||
{
|
||||
ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
|
||||
};
|
||||
|
||||
callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
|
||||
}
|
||||
callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
|
||||
}
|
||||
|
||||
template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
|
||||
//template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
|
||||
template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
|
||||
template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
|
||||
//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
|
||||
template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
|
||||
} // namespace imgproc
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
} // namespace imgproc
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -47,226 +47,224 @@
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/border_interpolate.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
#define MAX_KERNEL_SIZE 16
|
||||
#define BLOCK_DIM_X 16
|
||||
#define BLOCK_DIM_Y 4
|
||||
#define RESULT_STEPS 8
|
||||
#define HALO_STEPS 1
|
||||
|
||||
namespace row_filter {
|
||||
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
void loadKernel(const float kernel[], int ksize)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
|
||||
}
|
||||
#define MAX_KERNEL_SIZE 16
|
||||
#define BLOCK_DIM_X 16
|
||||
#define BLOCK_DIM_Y 4
|
||||
#define RESULT_STEPS 8
|
||||
#define HALO_STEPS 1
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <typename T, size_t size> struct SmemType
|
||||
namespace row_filter
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
|
||||
};
|
||||
__constant__ float c_kernel[MAX_KERNEL_SIZE];
|
||||
|
||||
template <typename T> struct SmemType<T, 4>
|
||||
{
|
||||
typedef T smem_t;
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T> struct SmemType
|
||||
{
|
||||
typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
|
||||
};
|
||||
|
||||
template <int KERNEL_SIZE, typename T, typename D, typename B>
|
||||
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
|
||||
{
|
||||
typedef typename SmemType<T>::smem_t smem_t;
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
__shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];
|
||||
|
||||
//Offset to the left halo edge
|
||||
const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
|
||||
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
|
||||
|
||||
if (y < src.rows)
|
||||
{
|
||||
const T* src_row = src.ptr(y);
|
||||
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
|
||||
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
|
||||
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for(int i = 0; i < HALO_STEPS; ++i)
|
||||
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
|
||||
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
D* dst_row = dst.ptr(y);
|
||||
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
|
||||
void loadKernel(const float kernel[], int ksize)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < KERNEL_SIZE; ++j)
|
||||
sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];
|
||||
|
||||
int dstX = x + i * BLOCK_DIM_X;
|
||||
|
||||
if (dstX < src.cols)
|
||||
dst_row[dstX] = saturate_cast<D>(sum);
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int ksize, typename T, typename D, template<typename> class B>
|
||||
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
|
||||
{
|
||||
typedef typename SmemType<T>::smem_t smem_t;
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
|
||||
|
||||
B<smem_t> b(src.cols);
|
||||
|
||||
linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T, typename D>
|
||||
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
|
||||
static const caller_t callers[5][17] =
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowWrap>
|
||||
template <typename T, size_t size> struct SmemType
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
|
||||
};
|
||||
|
||||
template <typename T> struct SmemType<T, 4>
|
||||
{
|
||||
typedef T smem_t;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
loadKernel(kernel, ksize);
|
||||
|
||||
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
|
||||
}
|
||||
template <typename T> struct SmemType
|
||||
{
|
||||
typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
|
||||
};
|
||||
|
||||
template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<int , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template <int KERNEL_SIZE, typename T, typename D, typename B>
|
||||
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
|
||||
{
|
||||
typedef typename SmemType<T>::smem_t smem_t;
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
|
||||
|
||||
} // namespace row_filter
|
||||
__shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
//Offset to the left halo edge
|
||||
const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
|
||||
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
|
||||
|
||||
if (y < src.rows)
|
||||
{
|
||||
const T* src_row = src.ptr(y);
|
||||
|
||||
//Load main data
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
|
||||
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
|
||||
|
||||
//Load left halo
|
||||
#pragma unroll
|
||||
for(int i = 0; i < HALO_STEPS; ++i)
|
||||
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);
|
||||
|
||||
//Load right halo
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
|
||||
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
D* dst_row = dst.ptr(y);
|
||||
|
||||
#pragma unroll
|
||||
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
|
||||
{
|
||||
sum_t sum = VecTraits<sum_t>::all(0);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < KERNEL_SIZE; ++j)
|
||||
sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];
|
||||
|
||||
int dstX = x + i * BLOCK_DIM_X;
|
||||
|
||||
if (dstX < src.cols)
|
||||
dst_row[dstX] = saturate_cast<D>(sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int ksize, typename T, typename D, template<typename> class B>
|
||||
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
|
||||
{
|
||||
typedef typename SmemType<T>::smem_t smem_t;
|
||||
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
|
||||
|
||||
B<smem_t> b(src.cols);
|
||||
|
||||
linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T, typename D>
|
||||
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
|
||||
static const caller_t callers[5][17] =
|
||||
{
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReflect101>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReplicate>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowConstant>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowConstant>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowReflect>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowReflect>
|
||||
},
|
||||
{
|
||||
0,
|
||||
linearRowFilter_caller<1 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<2 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<3 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<4 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<5 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<6 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<7 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<8 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<9 , T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<10, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<11, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<12, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<13, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<14, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<15, T, D, BrdRowWrap>,
|
||||
linearRowFilter_caller<16, T, D, BrdRowWrap>
|
||||
}
|
||||
};
|
||||
|
||||
loadKernel(kernel, ksize);
|
||||
|
||||
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
|
||||
}
|
||||
|
||||
template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<int , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
} // namespace row_filter
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -62,44 +62,43 @@
|
||||
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)
|
||||
#endif
|
||||
|
||||
namespace cv { namespace gpu {
|
||||
|
||||
void error(const char *error_string, const char *file, const int line, const char *func = "");
|
||||
void nppError(int err, const char *file, const int line, const char *func = "");
|
||||
void ncvError(int err, const char *file, const int line, const char *func = "");
|
||||
void cufftError(int err, const char *file, const int line, const char *func = "");
|
||||
void cublasError(int err, const char *file, const int line, const char *func = "");
|
||||
|
||||
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
|
||||
namespace cv { namespace gpu
|
||||
{
|
||||
if (cudaSuccess != err)
|
||||
cv::gpu::error(cudaGetErrorString(err), file, line, func);
|
||||
}
|
||||
void error(const char *error_string, const char *file, const int line, const char *func = "");
|
||||
void nppError(int err, const char *file, const int line, const char *func = "");
|
||||
void ncvError(int err, const char *file, const int line, const char *func = "");
|
||||
void cufftError(int err, const char *file, const int line, const char *func = "");
|
||||
void cublasError(int err, const char *file, const int line, const char *func = "");
|
||||
|
||||
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (err < 0)
|
||||
cv::gpu::nppError(err, file, line, func);
|
||||
}
|
||||
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (cudaSuccess != err)
|
||||
cv::gpu::error(cudaGetErrorString(err), file, line, func);
|
||||
}
|
||||
|
||||
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (NCV_SUCCESS != err)
|
||||
cv::gpu::ncvError(err, file, line, func);
|
||||
}
|
||||
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (err < 0)
|
||||
cv::gpu::nppError(err, file, line, func);
|
||||
}
|
||||
|
||||
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (CUFFT_SUCCESS != err)
|
||||
cv::gpu::cufftError(err, file, line, func);
|
||||
}
|
||||
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (NCV_SUCCESS != err)
|
||||
cv::gpu::ncvError(err, file, line, func);
|
||||
}
|
||||
|
||||
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (CUBLAS_STATUS_SUCCESS != err)
|
||||
cv::gpu::cublasError(err, file, line, func);
|
||||
}
|
||||
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (CUFFT_SUCCESS != err)
|
||||
cv::gpu::cufftError(err, file, line, func);
|
||||
}
|
||||
|
||||
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (CUBLAS_STATUS_SUCCESS != err)
|
||||
cv::gpu::cublasError(err, file, line, func);
|
||||
}
|
||||
}}
|
||||
|
||||
#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
|
@ -42,467 +42,465 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace split_merge {
|
||||
|
||||
template <typename T, size_t elem_size = sizeof(T)>
|
||||
struct TypeTraits
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef T type;
|
||||
typedef T type2;
|
||||
typedef T type3;
|
||||
typedef T type4;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 1>
|
||||
{
|
||||
typedef char type;
|
||||
typedef char2 type2;
|
||||
typedef char3 type3;
|
||||
typedef char4 type4;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 2>
|
||||
{
|
||||
typedef short type;
|
||||
typedef short2 type2;
|
||||
typedef short3 type3;
|
||||
typedef short4 type4;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 4>
|
||||
{
|
||||
typedef int type;
|
||||
typedef int2 type2;
|
||||
typedef int3 type3;
|
||||
typedef int4 type4;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 8>
|
||||
{
|
||||
typedef double type;
|
||||
typedef double2 type2;
|
||||
//typedef double3 type3;
|
||||
//typedef double4 type3;
|
||||
};
|
||||
|
||||
typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
|
||||
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Merge
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type2 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type3 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
const T* src2_y = (const T*)(src2 + y * src2_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_elem.z = src2_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double* src0_y = (const double*)(src0 + y * src0_step);
|
||||
const double* src1_y = (const double*)(src1 + y * src1_step);
|
||||
const double* src2_y = (const double*)(src2 + y * src2_step);
|
||||
double* dst_y = (double*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_y[3 * x] = src0_y[x];
|
||||
dst_y[3 * x + 1] = src1_y[x];
|
||||
dst_y[3 * x + 2] = src2_y[x];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
const uchar* src3, size_t src3_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type4 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
const T* src2_y = (const T*)(src2 + y * src2_step);
|
||||
const T* src3_y = (const T*)(src3 + y * src3_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_elem.z = src2_y[x];
|
||||
dst_elem.w = src3_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
const uchar* src3, size_t src3_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double* src0_y = (const double*)(src0 + y * src0_step);
|
||||
const double* src1_y = (const double*)(src1 + y * src1_step);
|
||||
const double* src2_y = (const double*)(src2 + y * src2_step);
|
||||
const double* src3_y = (const double*)(src3 + y * src3_step);
|
||||
double2* dst_y = (double2*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
|
||||
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
src[2].data, src[2].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
src[2].data, src[2].step,
|
||||
src[3].data, src[3].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
|
||||
int total_channels, size_t elem_size,
|
||||
const cudaStream_t& stream)
|
||||
{
|
||||
static MergeFunction merge_func_tbl[] =
|
||||
namespace split_merge
|
||||
{
|
||||
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
|
||||
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
|
||||
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
|
||||
};
|
||||
template <typename T, size_t elem_size = sizeof(T)>
|
||||
struct TypeTraits
|
||||
{
|
||||
typedef T type;
|
||||
typedef T type2;
|
||||
typedef T type3;
|
||||
typedef T type4;
|
||||
};
|
||||
|
||||
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
|
||||
MergeFunction merge_func = merge_func_tbl[merge_func_id];
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 1>
|
||||
{
|
||||
typedef char type;
|
||||
typedef char2 type2;
|
||||
typedef char3 type3;
|
||||
typedef char4 type4;
|
||||
};
|
||||
|
||||
if (merge_func == 0)
|
||||
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 2>
|
||||
{
|
||||
typedef short type;
|
||||
typedef short2 type2;
|
||||
typedef short3 type3;
|
||||
typedef short4 type4;
|
||||
};
|
||||
|
||||
merge_func(src, dst, stream);
|
||||
}
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 4>
|
||||
{
|
||||
typedef int type;
|
||||
typedef int2 type2;
|
||||
typedef int3 type3;
|
||||
typedef int4 type4;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 8>
|
||||
{
|
||||
typedef double type;
|
||||
typedef double2 type2;
|
||||
//typedef double3 type3;
|
||||
//typedef double4 type3;
|
||||
};
|
||||
|
||||
typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
|
||||
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Merge
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type2 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type3 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
const T* src2_y = (const T*)(src2 + y * src2_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_elem.z = src2_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double* src0_y = (const double*)(src0 + y * src0_step);
|
||||
const double* src1_y = (const double*)(src1 + y * src1_step);
|
||||
const double* src2_y = (const double*)(src2 + y * src2_step);
|
||||
double* dst_y = (double*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_y[3 * x] = src0_y[x];
|
||||
dst_y[3 * x + 1] = src1_y[x];
|
||||
dst_y[3 * x + 2] = src2_y[x];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
const uchar* src3, size_t src3_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type4 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
const T* src2_y = (const T*)(src2 + y * src2_step);
|
||||
const T* src3_y = (const T*)(src3 + y * src3_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_elem.z = src2_y[x];
|
||||
dst_elem.w = src3_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
const uchar* src3, size_t src3_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double* src0_y = (const double*)(src0 + y * src0_step);
|
||||
const double* src1_y = (const double*)(src1 + y * src1_step);
|
||||
const double* src2_y = (const double*)(src2 + y * src2_step);
|
||||
const double* src3_y = (const double*)(src3 + y * src3_step);
|
||||
double2* dst_y = (double2*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
|
||||
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
src[2].data, src[2].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
src[2].data, src[2].step,
|
||||
src[3].data, src[3].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
|
||||
int total_channels, size_t elem_size,
|
||||
const cudaStream_t& stream)
|
||||
{
|
||||
static MergeFunction merge_func_tbl[] =
|
||||
{
|
||||
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
|
||||
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
|
||||
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
|
||||
};
|
||||
|
||||
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
|
||||
MergeFunction merge_func = merge_func_tbl[merge_func_id];
|
||||
|
||||
if (merge_func == 0)
|
||||
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
|
||||
|
||||
merge_func(src, dst, stream);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Split
|
||||
//------------------------------------------------------------
|
||||
// Split
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void splitC2_(const uchar* src, size_t src_step,
|
||||
int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type2 src_type;
|
||||
template <typename T>
|
||||
__global__ void splitC2_(const uchar* src, size_t src_step,
|
||||
int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type2 src_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
}
|
||||
}
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void splitC3_(const uchar* src, size_t src_step,
|
||||
int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type3 src_type;
|
||||
template <typename T>
|
||||
__global__ void splitC3_(const uchar* src, size_t src_step,
|
||||
int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type3 src_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
T* dst2_y = (T*)(dst2 + y * dst2_step);
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
T* dst2_y = (T*)(dst2 + y * dst2_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
dst2_y[x] = src_elem.z;
|
||||
}
|
||||
}
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
dst2_y[x] = src_elem.z;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void splitC3_<double>(
|
||||
const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
template <>
|
||||
__global__ void splitC3_<double>(
|
||||
const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double* src_y = (const double*)(src + y * src_step);
|
||||
double* dst0_y = (double*)(dst0 + y * dst0_step);
|
||||
double* dst1_y = (double*)(dst1 + y * dst1_step);
|
||||
double* dst2_y = (double*)(dst2 + y * dst2_step);
|
||||
const double* src_y = (const double*)(src + y * src_step);
|
||||
double* dst0_y = (double*)(dst0 + y * dst0_step);
|
||||
double* dst1_y = (double*)(dst1 + y * dst1_step);
|
||||
double* dst2_y = (double*)(dst2 + y * dst2_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst0_y[x] = src_y[3 * x];
|
||||
dst1_y[x] = src_y[3 * x + 1];
|
||||
dst2_y[x] = src_y[3 * x + 2];
|
||||
}
|
||||
}
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst0_y[x] = src_y[3 * x];
|
||||
dst1_y[x] = src_y[3 * x + 1];
|
||||
dst2_y[x] = src_y[3 * x + 2];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step,
|
||||
uchar* dst3, size_t dst3_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type4 src_type;
|
||||
template <typename T>
|
||||
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step,
|
||||
uchar* dst3, size_t dst3_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type4 src_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
T* dst2_y = (T*)(dst2 + y * dst2_step);
|
||||
T* dst3_y = (T*)(dst3 + y * dst3_step);
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
T* dst2_y = (T*)(dst2 + y * dst2_step);
|
||||
T* dst3_y = (T*)(dst3 + y * dst3_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
dst2_y[x] = src_elem.z;
|
||||
dst3_y[x] = src_elem.w;
|
||||
}
|
||||
}
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
dst2_y[x] = src_elem.z;
|
||||
dst3_y[x] = src_elem.w;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void splitC4_<double>(
|
||||
const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step,
|
||||
uchar* dst3, size_t dst3_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
template <>
|
||||
__global__ void splitC4_<double>(
|
||||
const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step,
|
||||
uchar* dst3, size_t dst3_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double2* src_y = (const double2*)(src + y * src_step);
|
||||
double* dst0_y = (double*)(dst0 + y * dst0_step);
|
||||
double* dst1_y = (double*)(dst1 + y * dst1_step);
|
||||
double* dst2_y = (double*)(dst2 + y * dst2_step);
|
||||
double* dst3_y = (double*)(dst3 + y * dst3_step);
|
||||
const double2* src_y = (const double2*)(src + y * src_step);
|
||||
double* dst0_y = (double*)(dst0 + y * dst0_step);
|
||||
double* dst1_y = (double*)(dst1 + y * dst1_step);
|
||||
double* dst2_y = (double*)(dst2 + y * dst2_step);
|
||||
double* dst3_y = (double*)(dst3 + y * dst3_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
double2 src_elem1 = src_y[2 * x];
|
||||
double2 src_elem2 = src_y[2 * x + 1];
|
||||
dst0_y[x] = src_elem1.x;
|
||||
dst1_y[x] = src_elem1.y;
|
||||
dst2_y[x] = src_elem2.x;
|
||||
dst3_y[x] = src_elem2.y;
|
||||
}
|
||||
}
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
double2 src_elem1 = src_y[2 * x];
|
||||
double2 src_elem2 = src_y[2 * x + 1];
|
||||
dst0_y[x] = src_elem1.x;
|
||||
dst1_y[x] = src_elem1.y;
|
||||
dst2_y[x] = src_elem2.x;
|
||||
dst3_y[x] = src_elem2.y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
template <typename T>
|
||||
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step,
|
||||
dst[2].data, dst[2].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
template <typename T>
|
||||
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step,
|
||||
dst[2].data, dst[2].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step,
|
||||
dst[2].data, dst[2].step,
|
||||
dst[3].data, dst[3].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
template <typename T>
|
||||
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step,
|
||||
dst[2].data, dst[2].step,
|
||||
dst[3].data, dst[3].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
|
||||
{
|
||||
static SplitFunction split_func_tbl[] =
|
||||
{
|
||||
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
|
||||
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
|
||||
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
|
||||
};
|
||||
void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
|
||||
{
|
||||
static SplitFunction split_func_tbl[] =
|
||||
{
|
||||
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
|
||||
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
|
||||
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
|
||||
};
|
||||
|
||||
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
|
||||
SplitFunction split_func = split_func_tbl[split_func_id];
|
||||
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
|
||||
SplitFunction split_func = split_func_tbl[split_func_id];
|
||||
|
||||
if (split_func == 0)
|
||||
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
|
||||
if (split_func == 0)
|
||||
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
|
||||
|
||||
split_func(src, dst, stream);
|
||||
}
|
||||
|
||||
} // namespace split_merge
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
split_func(src, dst, stream);
|
||||
}
|
||||
} // namespace split_merge
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -42,496 +42,494 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace stereobm {
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define ROWSperTHREAD 21 // the number of rows a thread will process
|
||||
|
||||
#define BLOCK_W 128 // the thread block width (464)
|
||||
#define N_DISPARITIES 8
|
||||
|
||||
#define STEREO_MIND 0 // The minimum d range to check
|
||||
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
|
||||
|
||||
__constant__ unsigned int* cminSSDImage;
|
||||
__constant__ size_t cminSSD_step;
|
||||
__constant__ int cwidth;
|
||||
__constant__ int cheight;
|
||||
|
||||
__device__ __forceinline__ int SQ(int a)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
return a * a;
|
||||
}
|
||||
|
||||
template<int RADIUS>
|
||||
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
|
||||
{
|
||||
unsigned int cache = 0;
|
||||
unsigned int cache2 = 0;
|
||||
|
||||
for(int i = 1; i <= RADIUS; i++)
|
||||
cache += col_ssd[i];
|
||||
|
||||
col_ssd_cache[0] = cache;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < BLOCK_W - RADIUS)
|
||||
cache2 = col_ssd_cache[RADIUS];
|
||||
else
|
||||
for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
|
||||
cache2 += col_ssd[i];
|
||||
|
||||
return col_ssd[0] + cache + cache2;
|
||||
}
|
||||
|
||||
template<int RADIUS>
|
||||
__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
|
||||
{
|
||||
unsigned int ssd[N_DISPARITIES];
|
||||
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
|
||||
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
|
||||
|
||||
int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
|
||||
|
||||
int bestIdx = 0;
|
||||
for (int i = 0; i < N_DISPARITIES; i++)
|
||||
namespace stereobm
|
||||
{
|
||||
if (mssd == ssd[i])
|
||||
bestIdx = i;
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
return make_uint2(mssd, bestIdx);
|
||||
}
|
||||
#define ROWSperTHREAD 21 // the number of rows a thread will process
|
||||
|
||||
template<int RADIUS>
|
||||
__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
|
||||
{
|
||||
unsigned char leftPixel1;
|
||||
unsigned char leftPixel2;
|
||||
unsigned char rightPixel1[8];
|
||||
unsigned char rightPixel2[8];
|
||||
unsigned int diff1, diff2;
|
||||
#define BLOCK_W 128 // the thread block width (464)
|
||||
#define N_DISPARITIES 8
|
||||
|
||||
leftPixel1 = imageL[idx1];
|
||||
leftPixel2 = imageL[idx2];
|
||||
#define STEREO_MIND 0 // The minimum d range to check
|
||||
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
|
||||
|
||||
idx1 = idx1 - d;
|
||||
idx2 = idx2 - d;
|
||||
__constant__ unsigned int* cminSSDImage;
|
||||
__constant__ size_t cminSSD_step;
|
||||
__constant__ int cwidth;
|
||||
__constant__ int cheight;
|
||||
|
||||
rightPixel1[7] = imageR[idx1 - 7];
|
||||
rightPixel1[0] = imageR[idx1 - 0];
|
||||
rightPixel1[1] = imageR[idx1 - 1];
|
||||
rightPixel1[2] = imageR[idx1 - 2];
|
||||
rightPixel1[3] = imageR[idx1 - 3];
|
||||
rightPixel1[4] = imageR[idx1 - 4];
|
||||
rightPixel1[5] = imageR[idx1 - 5];
|
||||
rightPixel1[6] = imageR[idx1 - 6];
|
||||
|
||||
rightPixel2[7] = imageR[idx2 - 7];
|
||||
rightPixel2[0] = imageR[idx2 - 0];
|
||||
rightPixel2[1] = imageR[idx2 - 1];
|
||||
rightPixel2[2] = imageR[idx2 - 2];
|
||||
rightPixel2[3] = imageR[idx2 - 3];
|
||||
rightPixel2[4] = imageR[idx2 - 4];
|
||||
rightPixel2[5] = imageR[idx2 - 5];
|
||||
rightPixel2[6] = imageR[idx2 - 6];
|
||||
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
|
||||
diff1 = leftPixel1 - rightPixel1[0];
|
||||
diff2 = leftPixel2 - rightPixel2[0];
|
||||
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[1];
|
||||
diff2 = leftPixel2 - rightPixel2[1];
|
||||
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[2];
|
||||
diff2 = leftPixel2 - rightPixel2[2];
|
||||
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[3];
|
||||
diff2 = leftPixel2 - rightPixel2[3];
|
||||
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[4];
|
||||
diff2 = leftPixel2 - rightPixel2[4];
|
||||
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[5];
|
||||
diff2 = leftPixel2 - rightPixel2[5];
|
||||
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[6];
|
||||
diff2 = leftPixel2 - rightPixel2[6];
|
||||
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[7];
|
||||
diff2 = leftPixel2 - rightPixel2[7];
|
||||
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
}
|
||||
|
||||
template<int RADIUS>
|
||||
__device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
|
||||
{
|
||||
unsigned char leftPixel1;
|
||||
int idx;
|
||||
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
for(int i = 0; i < (2 * RADIUS + 1); i++)
|
||||
{
|
||||
idx = y_tex * im_pitch + x_tex;
|
||||
leftPixel1 = imageL[idx];
|
||||
idx = idx - d;
|
||||
|
||||
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
|
||||
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
|
||||
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
|
||||
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
|
||||
diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
|
||||
diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
|
||||
diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
|
||||
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
|
||||
|
||||
y_tex += 1;
|
||||
}
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
|
||||
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
|
||||
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
|
||||
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
|
||||
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
|
||||
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
|
||||
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
|
||||
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
|
||||
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
|
||||
}
|
||||
|
||||
template<int RADIUS>
|
||||
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
|
||||
{
|
||||
extern __shared__ unsigned int col_ssd_cache[];
|
||||
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
|
||||
volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
|
||||
|
||||
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
|
||||
int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
|
||||
//#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
|
||||
#define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
|
||||
//int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
|
||||
|
||||
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
|
||||
unsigned char* disparImage = disp.data + X + Y * disp.step;
|
||||
/* if (X < cwidth)
|
||||
{
|
||||
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
|
||||
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
|
||||
*ptr = 0xFFFFFFFF;
|
||||
}*/
|
||||
int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
|
||||
int y_tex;
|
||||
int x_tex = X - RADIUS;
|
||||
|
||||
if (x_tex >= cwidth)
|
||||
return;
|
||||
|
||||
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
|
||||
{
|
||||
y_tex = Y - RADIUS;
|
||||
|
||||
InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
|
||||
|
||||
if (col_ssd_extra > 0)
|
||||
if (x_tex + BLOCK_W < cwidth)
|
||||
InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
|
||||
|
||||
__syncthreads(); //before MinSSD function
|
||||
|
||||
if (X < cwidth - RADIUS && Y < cheight - RADIUS)
|
||||
__device__ __forceinline__ int SQ(int a)
|
||||
{
|
||||
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
|
||||
if (minSSD.x < minSSDImage[0])
|
||||
{
|
||||
disparImage[0] = (unsigned char)(d + minSSD.y);
|
||||
minSSDImage[0] = minSSD.x;
|
||||
}
|
||||
return a * a;
|
||||
}
|
||||
|
||||
for(int row = 1; row < end_row; row++)
|
||||
{
|
||||
int idx1 = y_tex * img_step + x_tex;
|
||||
int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;
|
||||
template<int RADIUS>
|
||||
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
|
||||
{
|
||||
unsigned int cache = 0;
|
||||
unsigned int cache2 = 0;
|
||||
|
||||
for(int i = 1; i <= RADIUS; i++)
|
||||
cache += col_ssd[i];
|
||||
|
||||
col_ssd_cache[0] = cache;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
|
||||
if (threadIdx.x < BLOCK_W - RADIUS)
|
||||
cache2 = col_ssd_cache[RADIUS];
|
||||
else
|
||||
for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
|
||||
cache2 += col_ssd[i];
|
||||
|
||||
if (col_ssd_extra)
|
||||
if (x_tex + BLOCK_W < cwidth)
|
||||
StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
|
||||
return col_ssd[0] + cache + cache2;
|
||||
}
|
||||
|
||||
y_tex += 1;
|
||||
template<int RADIUS>
|
||||
__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
|
||||
{
|
||||
unsigned int ssd[N_DISPARITIES];
|
||||
|
||||
__syncthreads(); //before MinSSD function
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
|
||||
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
|
||||
__syncthreads();
|
||||
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
|
||||
|
||||
if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
|
||||
int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
|
||||
|
||||
int bestIdx = 0;
|
||||
for (int i = 0; i < N_DISPARITIES; i++)
|
||||
{
|
||||
int idx = row * cminSSD_step;
|
||||
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
|
||||
if (minSSD.x < minSSDImage[idx])
|
||||
if (mssd == ssd[i])
|
||||
bestIdx = i;
|
||||
}
|
||||
|
||||
return make_uint2(mssd, bestIdx);
|
||||
}
|
||||
|
||||
template<int RADIUS>
|
||||
__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
|
||||
{
|
||||
unsigned char leftPixel1;
|
||||
unsigned char leftPixel2;
|
||||
unsigned char rightPixel1[8];
|
||||
unsigned char rightPixel2[8];
|
||||
unsigned int diff1, diff2;
|
||||
|
||||
leftPixel1 = imageL[idx1];
|
||||
leftPixel2 = imageL[idx2];
|
||||
|
||||
idx1 = idx1 - d;
|
||||
idx2 = idx2 - d;
|
||||
|
||||
rightPixel1[7] = imageR[idx1 - 7];
|
||||
rightPixel1[0] = imageR[idx1 - 0];
|
||||
rightPixel1[1] = imageR[idx1 - 1];
|
||||
rightPixel1[2] = imageR[idx1 - 2];
|
||||
rightPixel1[3] = imageR[idx1 - 3];
|
||||
rightPixel1[4] = imageR[idx1 - 4];
|
||||
rightPixel1[5] = imageR[idx1 - 5];
|
||||
rightPixel1[6] = imageR[idx1 - 6];
|
||||
|
||||
rightPixel2[7] = imageR[idx2 - 7];
|
||||
rightPixel2[0] = imageR[idx2 - 0];
|
||||
rightPixel2[1] = imageR[idx2 - 1];
|
||||
rightPixel2[2] = imageR[idx2 - 2];
|
||||
rightPixel2[3] = imageR[idx2 - 3];
|
||||
rightPixel2[4] = imageR[idx2 - 4];
|
||||
rightPixel2[5] = imageR[idx2 - 5];
|
||||
rightPixel2[6] = imageR[idx2 - 6];
|
||||
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
|
||||
diff1 = leftPixel1 - rightPixel1[0];
|
||||
diff2 = leftPixel2 - rightPixel2[0];
|
||||
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[1];
|
||||
diff2 = leftPixel2 - rightPixel2[1];
|
||||
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[2];
|
||||
diff2 = leftPixel2 - rightPixel2[2];
|
||||
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[3];
|
||||
diff2 = leftPixel2 - rightPixel2[3];
|
||||
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[4];
|
||||
diff2 = leftPixel2 - rightPixel2[4];
|
||||
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[5];
|
||||
diff2 = leftPixel2 - rightPixel2[5];
|
||||
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[6];
|
||||
diff2 = leftPixel2 - rightPixel2[6];
|
||||
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
|
||||
diff1 = leftPixel1 - rightPixel1[7];
|
||||
diff2 = leftPixel2 - rightPixel2[7];
|
||||
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
|
||||
}
|
||||
|
||||
template<int RADIUS>
|
||||
__device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
|
||||
{
|
||||
unsigned char leftPixel1;
|
||||
int idx;
|
||||
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
for(int i = 0; i < (2 * RADIUS + 1); i++)
|
||||
{
|
||||
idx = y_tex * im_pitch + x_tex;
|
||||
leftPixel1 = imageL[idx];
|
||||
idx = idx - d;
|
||||
|
||||
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
|
||||
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
|
||||
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
|
||||
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
|
||||
diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
|
||||
diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
|
||||
diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
|
||||
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
|
||||
|
||||
y_tex += 1;
|
||||
}
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
|
||||
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
|
||||
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
|
||||
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
|
||||
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
|
||||
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
|
||||
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
|
||||
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
|
||||
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
|
||||
}
|
||||
|
||||
template<int RADIUS>
|
||||
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
|
||||
{
|
||||
extern __shared__ unsigned int col_ssd_cache[];
|
||||
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
|
||||
volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
|
||||
|
||||
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
|
||||
int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
|
||||
//#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
|
||||
#define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
|
||||
//int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
|
||||
|
||||
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
|
||||
unsigned char* disparImage = disp.data + X + Y * disp.step;
|
||||
/* if (X < cwidth)
|
||||
{
|
||||
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
|
||||
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
|
||||
*ptr = 0xFFFFFFFF;
|
||||
}*/
|
||||
int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
|
||||
int y_tex;
|
||||
int x_tex = X - RADIUS;
|
||||
|
||||
if (x_tex >= cwidth)
|
||||
return;
|
||||
|
||||
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
|
||||
{
|
||||
y_tex = Y - RADIUS;
|
||||
|
||||
InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
|
||||
|
||||
if (col_ssd_extra > 0)
|
||||
if (x_tex + BLOCK_W < cwidth)
|
||||
InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
|
||||
|
||||
__syncthreads(); //before MinSSD function
|
||||
|
||||
if (X < cwidth - RADIUS && Y < cheight - RADIUS)
|
||||
{
|
||||
disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
|
||||
minSSDImage[idx] = minSSD.x;
|
||||
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
|
||||
if (minSSD.x < minSSDImage[0])
|
||||
{
|
||||
disparImage[0] = (unsigned char)(d + minSSD.y);
|
||||
minSSDImage[0] = minSSD.x;
|
||||
}
|
||||
}
|
||||
|
||||
for(int row = 1; row < end_row; row++)
|
||||
{
|
||||
int idx1 = y_tex * img_step + x_tex;
|
||||
int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
|
||||
|
||||
if (col_ssd_extra)
|
||||
if (x_tex + BLOCK_W < cwidth)
|
||||
StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
|
||||
|
||||
y_tex += 1;
|
||||
|
||||
__syncthreads(); //before MinSSD function
|
||||
|
||||
if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
|
||||
{
|
||||
int idx = row * cminSSD_step;
|
||||
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
|
||||
if (minSSD.x < minSSDImage[idx])
|
||||
{
|
||||
disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
|
||||
minSSDImage[idx] = minSSD.x;
|
||||
}
|
||||
}
|
||||
} // for row loop
|
||||
} // for d loop
|
||||
}
|
||||
|
||||
|
||||
template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)
|
||||
{
|
||||
dim3 grid(1,1,1);
|
||||
dim3 threads(BLOCK_W, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
|
||||
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
|
||||
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
|
||||
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
|
||||
|
||||
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
};
|
||||
|
||||
typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);
|
||||
|
||||
const static kernel_caller_t callers[] =
|
||||
{
|
||||
0,
|
||||
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
|
||||
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
|
||||
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
|
||||
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
|
||||
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
|
||||
|
||||
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
|
||||
};
|
||||
const int calles_num = sizeof(callers)/sizeof(callers[0]);
|
||||
|
||||
void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
|
||||
{
|
||||
int winsz2 = winsz >> 1;
|
||||
|
||||
if (winsz2 == 0 || winsz2 >= calles_num)
|
||||
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
|
||||
|
||||
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
|
||||
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
|
||||
|
||||
cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
|
||||
cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
|
||||
|
||||
cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
|
||||
|
||||
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
|
||||
cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
|
||||
|
||||
callers[winsz2](left, right, disp, maxdisp, stream);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
|
||||
|
||||
__global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
|
||||
{
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < output.cols && y < output.rows)
|
||||
{
|
||||
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
|
||||
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
|
||||
(int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
|
||||
|
||||
|
||||
conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
|
||||
output.ptr(y)[x] = conv & 0xFF;
|
||||
}
|
||||
}
|
||||
|
||||
void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
|
||||
{
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
|
||||
cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
|
||||
|
||||
dim3 threads(16, 16, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(input.cols, threads.x);
|
||||
grid.y = divUp(input.rows, threads.y);
|
||||
|
||||
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
|
||||
|
||||
__device__ __forceinline__ float sobel(int x, int y)
|
||||
{
|
||||
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
|
||||
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
|
||||
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
|
||||
return fabs(conv);
|
||||
}
|
||||
|
||||
__device__ float CalcSums(float *cols, float *cols_cache, int winsz)
|
||||
{
|
||||
float cache = 0;
|
||||
float cache2 = 0;
|
||||
int winsz2 = winsz/2;
|
||||
|
||||
for(int i = 1; i <= winsz2; i++)
|
||||
cache += cols[i];
|
||||
|
||||
cols_cache[0] = cache;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < blockDim.x - winsz2)
|
||||
cache2 = cols_cache[winsz2];
|
||||
else
|
||||
for(int i = winsz2 + 1; i < winsz; i++)
|
||||
cache2 += cols[i];
|
||||
|
||||
return cols[0] + cache + cache2;
|
||||
}
|
||||
|
||||
#define RpT (2 * ROWSperTHREAD) // got experimentally
|
||||
|
||||
__global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
|
||||
{
|
||||
int winsz2 = winsz/2;
|
||||
int n_dirty_pixels = (winsz2) * 2;
|
||||
|
||||
extern __shared__ float cols_cache[];
|
||||
float *cols = cols_cache + blockDim.x + threadIdx.x;
|
||||
float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
|
||||
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int beg_row = blockIdx.y * RpT;
|
||||
int end_row = ::min(beg_row + RpT, disp.rows);
|
||||
|
||||
if (x < disp.cols)
|
||||
{
|
||||
int y = beg_row;
|
||||
|
||||
float sum = 0;
|
||||
float sum_extra = 0;
|
||||
|
||||
for(int i = y - winsz2; i <= y + winsz2; ++i)
|
||||
{
|
||||
sum += sobel(x - winsz2, i);
|
||||
if (cols_extra)
|
||||
sum_extra += sobel(x + blockDim.x - winsz2, i);
|
||||
}
|
||||
*cols = sum;
|
||||
if (cols_extra)
|
||||
*cols_extra = sum_extra;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
|
||||
if (sum_win < threshold)
|
||||
disp.data[y * disp.step + x] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(int y = beg_row + 1; y < end_row; ++y)
|
||||
{
|
||||
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
|
||||
*cols = sum;
|
||||
|
||||
if (cols_extra)
|
||||
{
|
||||
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
|
||||
*cols_extra = sum_extra;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
|
||||
if (sum_win < threshold)
|
||||
disp.data[y * disp.step + x] = 0;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
} // for row loop
|
||||
} // for d loop
|
||||
}
|
||||
|
||||
|
||||
template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)
|
||||
{
|
||||
dim3 grid(1,1,1);
|
||||
dim3 threads(BLOCK_W, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
|
||||
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
|
||||
|
||||
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
|
||||
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
|
||||
|
||||
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
};
|
||||
|
||||
typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);
|
||||
|
||||
const static kernel_caller_t callers[] =
|
||||
{
|
||||
0,
|
||||
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
|
||||
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
|
||||
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
|
||||
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
|
||||
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
|
||||
|
||||
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
|
||||
};
|
||||
const int calles_num = sizeof(callers)/sizeof(callers[0]);
|
||||
|
||||
void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
|
||||
{
|
||||
int winsz2 = winsz >> 1;
|
||||
|
||||
if (winsz2 == 0 || winsz2 >= calles_num)
|
||||
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
|
||||
|
||||
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
|
||||
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
|
||||
|
||||
cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
|
||||
cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
|
||||
|
||||
cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
|
||||
|
||||
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
|
||||
cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
|
||||
|
||||
callers[winsz2](left, right, disp, maxdisp, stream);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
|
||||
|
||||
__global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
|
||||
{
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < output.cols && y < output.rows)
|
||||
{
|
||||
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
|
||||
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
|
||||
(int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
|
||||
|
||||
|
||||
conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
|
||||
output.ptr(y)[x] = conv & 0xFF;
|
||||
}
|
||||
}
|
||||
|
||||
void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
|
||||
{
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
|
||||
cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
|
||||
|
||||
dim3 threads(16, 16, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(input.cols, threads.x);
|
||||
grid.y = divUp(input.rows, threads.y);
|
||||
|
||||
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
|
||||
|
||||
__device__ __forceinline__ float sobel(int x, int y)
|
||||
{
|
||||
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
|
||||
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
|
||||
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
|
||||
return fabs(conv);
|
||||
}
|
||||
|
||||
__device__ float CalcSums(float *cols, float *cols_cache, int winsz)
|
||||
{
|
||||
float cache = 0;
|
||||
float cache2 = 0;
|
||||
int winsz2 = winsz/2;
|
||||
|
||||
for(int i = 1; i <= winsz2; i++)
|
||||
cache += cols[i];
|
||||
|
||||
cols_cache[0] = cache;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < blockDim.x - winsz2)
|
||||
cache2 = cols_cache[winsz2];
|
||||
else
|
||||
for(int i = winsz2 + 1; i < winsz; i++)
|
||||
cache2 += cols[i];
|
||||
|
||||
return cols[0] + cache + cache2;
|
||||
}
|
||||
|
||||
#define RpT (2 * ROWSperTHREAD) // got experimentally
|
||||
|
||||
__global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
|
||||
{
|
||||
int winsz2 = winsz/2;
|
||||
int n_dirty_pixels = (winsz2) * 2;
|
||||
|
||||
extern __shared__ float cols_cache[];
|
||||
float *cols = cols_cache + blockDim.x + threadIdx.x;
|
||||
float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
|
||||
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int beg_row = blockIdx.y * RpT;
|
||||
int end_row = ::min(beg_row + RpT, disp.rows);
|
||||
|
||||
if (x < disp.cols)
|
||||
{
|
||||
int y = beg_row;
|
||||
|
||||
float sum = 0;
|
||||
float sum_extra = 0;
|
||||
|
||||
for(int i = y - winsz2; i <= y + winsz2; ++i)
|
||||
{
|
||||
sum += sobel(x - winsz2, i);
|
||||
if (cols_extra)
|
||||
sum_extra += sobel(x + blockDim.x - winsz2, i);
|
||||
}
|
||||
*cols = sum;
|
||||
if (cols_extra)
|
||||
*cols_extra = sum_extra;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
|
||||
if (sum_win < threshold)
|
||||
disp.data[y * disp.step + x] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for(int y = beg_row + 1; y < end_row; ++y)
|
||||
void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
|
||||
{
|
||||
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
|
||||
*cols = sum;
|
||||
avgTexturenessThreshold *= winsz * winsz;
|
||||
|
||||
if (cols_extra)
|
||||
{
|
||||
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
|
||||
*cols_extra = sum_extra;
|
||||
}
|
||||
texForTF.filterMode = cudaFilterModeLinear;
|
||||
texForTF.addressMode[0] = cudaAddressModeWrap;
|
||||
texForTF.addressMode[1] = cudaAddressModeWrap;
|
||||
|
||||
__syncthreads();
|
||||
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
|
||||
if (sum_win < threshold)
|
||||
disp.data[y * disp.step + x] = 0;
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
|
||||
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
|
||||
|
||||
__syncthreads();
|
||||
dim3 threads(128, 1, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(input.cols, threads.x);
|
||||
grid.y = divUp(input.rows, RpT);
|
||||
|
||||
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
|
||||
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
cudaSafeCall( cudaUnbindTexture (texForTF) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
|
||||
{
|
||||
avgTexturenessThreshold *= winsz * winsz;
|
||||
|
||||
texForTF.filterMode = cudaFilterModeLinear;
|
||||
texForTF.addressMode[0] = cudaAddressModeWrap;
|
||||
texForTF.addressMode[1] = cudaAddressModeWrap;
|
||||
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
|
||||
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
|
||||
|
||||
dim3 threads(128, 1, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(input.cols, threads.x);
|
||||
grid.y = divUp(input.rows, RpT);
|
||||
|
||||
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
|
||||
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
cudaSafeCall( cudaUnbindTexture (texForTF) );
|
||||
}
|
||||
|
||||
} // namespace stereobm
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
} // namespace stereobm
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
@ -44,489 +44,487 @@
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace stereobp {
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////////// load constants ////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
__constant__ int cndisp;
|
||||
__constant__ float cmax_data_term;
|
||||
__constant__ float cdata_weight;
|
||||
__constant__ float cmax_disc_term;
|
||||
__constant__ float cdisc_single_jump;
|
||||
|
||||
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
////////////////////////// comp data //////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <int cn> struct PixDiff;
|
||||
template <> struct PixDiff<1>
|
||||
{
|
||||
__device__ __forceinline__ PixDiff(const uchar* ls)
|
||||
namespace stereobp
|
||||
{
|
||||
l = *ls;
|
||||
}
|
||||
__device__ __forceinline__ float operator()(const uchar* rs) const
|
||||
{
|
||||
return ::abs((int)l - *rs);
|
||||
}
|
||||
uchar l;
|
||||
};
|
||||
template <> struct PixDiff<3>
|
||||
{
|
||||
__device__ __forceinline__ PixDiff(const uchar* ls)
|
||||
{
|
||||
l = *((uchar3*)ls);
|
||||
}
|
||||
__device__ __forceinline__ float operator()(const uchar* rs) const
|
||||
{
|
||||
const float tr = 0.299f;
|
||||
const float tg = 0.587f;
|
||||
const float tb = 0.114f;
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////////// load constants ////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
float val = tb * ::abs((int)l.x - rs[0]);
|
||||
val += tg * ::abs((int)l.y - rs[1]);
|
||||
val += tr * ::abs((int)l.z - rs[2]);
|
||||
__constant__ int cndisp;
|
||||
__constant__ float cmax_data_term;
|
||||
__constant__ float cdata_weight;
|
||||
__constant__ float cmax_disc_term;
|
||||
__constant__ float cdisc_single_jump;
|
||||
|
||||
return val;
|
||||
}
|
||||
uchar3 l;
|
||||
};
|
||||
template <> struct PixDiff<4>
|
||||
{
|
||||
__device__ __forceinline__ PixDiff(const uchar* ls)
|
||||
{
|
||||
l = *((uchar4*)ls);
|
||||
}
|
||||
__device__ __forceinline__ float operator()(const uchar* rs) const
|
||||
{
|
||||
const float tr = 0.299f;
|
||||
const float tg = 0.587f;
|
||||
const float tb = 0.114f;
|
||||
|
||||
uchar4 r = *((uchar4*)rs);
|
||||
|
||||
float val = tb * ::abs((int)l.x - r.x);
|
||||
val += tg * ::abs((int)l.y - r.y);
|
||||
val += tr * ::abs((int)l.z - r.z);
|
||||
|
||||
return val;
|
||||
}
|
||||
uchar4 l;
|
||||
};
|
||||
|
||||
template <int cn, typename D>
|
||||
__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
|
||||
{
|
||||
const uchar* ls = left.ptr(y) + x * cn;
|
||||
const PixDiff<cn> pixDiff(ls);
|
||||
const uchar* rs = right.ptr(y) + x * cn;
|
||||
|
||||
D* ds = data.ptr(y) + x;
|
||||
const size_t disp_step = data.step * left.rows;
|
||||
|
||||
for (int disp = 0; disp < cndisp; disp++)
|
||||
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
|
||||
{
|
||||
if (x - disp >= 1)
|
||||
{
|
||||
float val = pixDiff(rs - disp * cn);
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
|
||||
}
|
||||
|
||||
ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
|
||||
///////////////////////////////////////////////////////////////
|
||||
////////////////////////// comp data //////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <int cn> struct PixDiff;
|
||||
template <> struct PixDiff<1>
|
||||
{
|
||||
__device__ __forceinline__ PixDiff(const uchar* ls)
|
||||
{
|
||||
l = *ls;
|
||||
}
|
||||
else
|
||||
__device__ __forceinline__ float operator()(const uchar* rs) const
|
||||
{
|
||||
ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
|
||||
return ::abs((int)l - *rs);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename D>
|
||||
void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
|
||||
|
||||
template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////////// data step down ///////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
{
|
||||
for (int d = 0; d < cndisp; ++d)
|
||||
uchar l;
|
||||
};
|
||||
template <> struct PixDiff<3>
|
||||
{
|
||||
float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
|
||||
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
|
||||
dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
|
||||
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];
|
||||
|
||||
dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(dst_cols, threads.x);
|
||||
grid.y = divUp(dst_rows, threads.y);
|
||||
|
||||
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////// level up messages ////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
{
|
||||
const size_t dst_disp_step = dst.step * dst_rows;
|
||||
const size_t src_disp_step = src.step * src_rows;
|
||||
|
||||
T* dstr = dst.ptr(y ) + x;
|
||||
const T* srcr = src.ptr(y/2) + x/2;
|
||||
|
||||
for (int d = 0; d < cndisp; ++d)
|
||||
dstr[d * dst_disp_step] = srcr[d * src_disp_step];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(dst_cols, threads.x);
|
||||
grid.y = divUp(dst_rows, threads.y);
|
||||
|
||||
int src_idx = (dst_idx + 1) & 1;
|
||||
|
||||
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
|
||||
template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////// calc all iterations /////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
__device__ void calc_min_linear_penalty(T* dst, size_t step)
|
||||
{
|
||||
float prev = dst[0];
|
||||
float cur;
|
||||
for (int disp = 1; disp < cndisp; ++disp)
|
||||
{
|
||||
prev += cdisc_single_jump;
|
||||
cur = dst[step * disp];
|
||||
if (prev < cur)
|
||||
{
|
||||
cur = prev;
|
||||
dst[step * disp] = saturate_cast<T>(prev);
|
||||
}
|
||||
prev = cur;
|
||||
}
|
||||
|
||||
prev = dst[(cndisp - 1) * step];
|
||||
for (int disp = cndisp - 2; disp >= 0; disp--)
|
||||
{
|
||||
prev += cdisc_single_jump;
|
||||
cur = dst[step * disp];
|
||||
if (prev < cur)
|
||||
{
|
||||
cur = prev;
|
||||
dst[step * disp] = saturate_cast<T>(prev);
|
||||
}
|
||||
prev = cur;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
|
||||
{
|
||||
float minimum = device::numeric_limits<float>::max();
|
||||
|
||||
for(int i = 0; i < cndisp; ++i)
|
||||
{
|
||||
float dst_reg = msg1[msg_disp_step * i];
|
||||
dst_reg += msg2[msg_disp_step * i];
|
||||
dst_reg += msg3[msg_disp_step * i];
|
||||
dst_reg += data[data_disp_step * i];
|
||||
|
||||
if (dst_reg < minimum)
|
||||
minimum = dst_reg;
|
||||
|
||||
dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
|
||||
}
|
||||
|
||||
calc_min_linear_penalty(dst, msg_disp_step);
|
||||
|
||||
minimum += cmax_disc_term;
|
||||
|
||||
float sum = 0;
|
||||
for(int i = 0; i < cndisp; ++i)
|
||||
{
|
||||
float dst_reg = dst[msg_disp_step * i];
|
||||
if (dst_reg > minimum)
|
||||
{
|
||||
dst_reg = minimum;
|
||||
dst[msg_disp_step * i] = saturate_cast<T>(minimum);
|
||||
}
|
||||
sum += dst_reg;
|
||||
}
|
||||
sum /= cndisp;
|
||||
|
||||
for(int i = 0; i < cndisp; ++i)
|
||||
dst[msg_disp_step * i] -= sum;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
|
||||
{
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
|
||||
|
||||
if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
|
||||
{
|
||||
T* us = u.ptr(y) + x;
|
||||
T* ds = d + y * u.step + x;
|
||||
T* ls = l + y * u.step + x;
|
||||
T* rs = r + y * u.step + x;
|
||||
const T* dt = data.ptr(y) + x;
|
||||
|
||||
size_t msg_disp_step = u.step * rows;
|
||||
size_t data_disp_step = data.step * rows;
|
||||
|
||||
message(us + u.step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
|
||||
message(ds - u.step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
|
||||
message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
|
||||
message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
|
||||
const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(cols, threads.x << 1);
|
||||
grid.y = divUp(rows, threads.y);
|
||||
|
||||
for(int t = 0; t < iters; ++t)
|
||||
{
|
||||
one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
|
||||
template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////////////// output ////////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
|
||||
DevMem2D_<short> disp)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
|
||||
{
|
||||
const T* us = u.ptr(y + 1) + x;
|
||||
const T* ds = d + (y - 1) * u.step + x;
|
||||
const T* ls = l + y * u.step + (x + 1);
|
||||
const T* rs = r + y * u.step + (x - 1);
|
||||
const T* dt = data + y * u.step + x;
|
||||
|
||||
size_t disp_step = disp.rows * u.step;
|
||||
|
||||
int best = 0;
|
||||
float best_val = numeric_limits<float>::max();
|
||||
for (int d = 0; d < cndisp; ++d)
|
||||
{
|
||||
float val = us[d * disp_step];
|
||||
val += ds[d * disp_step];
|
||||
val += ls[d * disp_step];
|
||||
val += rs[d * disp_step];
|
||||
val += dt[d * disp_step];
|
||||
|
||||
if (val < best_val)
|
||||
__device__ __forceinline__ PixDiff(const uchar* ls)
|
||||
{
|
||||
best_val = val;
|
||||
best = d;
|
||||
l = *((uchar3*)ls);
|
||||
}
|
||||
__device__ __forceinline__ float operator()(const uchar* rs) const
|
||||
{
|
||||
const float tr = 0.299f;
|
||||
const float tg = 0.587f;
|
||||
const float tb = 0.114f;
|
||||
|
||||
float val = tb * ::abs((int)l.x - rs[0]);
|
||||
val += tg * ::abs((int)l.y - rs[1]);
|
||||
val += tr * ::abs((int)l.z - rs[2]);
|
||||
|
||||
return val;
|
||||
}
|
||||
uchar3 l;
|
||||
};
|
||||
template <> struct PixDiff<4>
|
||||
{
|
||||
__device__ __forceinline__ PixDiff(const uchar* ls)
|
||||
{
|
||||
l = *((uchar4*)ls);
|
||||
}
|
||||
__device__ __forceinline__ float operator()(const uchar* rs) const
|
||||
{
|
||||
const float tr = 0.299f;
|
||||
const float tg = 0.587f;
|
||||
const float tb = 0.114f;
|
||||
|
||||
uchar4 r = *((uchar4*)rs);
|
||||
|
||||
float val = tb * ::abs((int)l.x - r.x);
|
||||
val += tg * ::abs((int)l.y - r.y);
|
||||
val += tr * ::abs((int)l.z - r.z);
|
||||
|
||||
return val;
|
||||
}
|
||||
uchar4 l;
|
||||
};
|
||||
|
||||
template <int cn, typename D>
|
||||
__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
|
||||
{
|
||||
const uchar* ls = left.ptr(y) + x * cn;
|
||||
const PixDiff<cn> pixDiff(ls);
|
||||
const uchar* rs = right.ptr(y) + x * cn;
|
||||
|
||||
D* ds = data.ptr(y) + x;
|
||||
const size_t disp_step = data.step * left.rows;
|
||||
|
||||
for (int disp = 0; disp < cndisp; disp++)
|
||||
{
|
||||
if (x - disp >= 1)
|
||||
{
|
||||
float val = pixDiff(rs - disp * cn);
|
||||
|
||||
ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
|
||||
}
|
||||
else
|
||||
{
|
||||
ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
disp.ptr(y)[x] = saturate_cast<short>(best);
|
||||
}
|
||||
}
|
||||
template<typename T, typename D>
|
||||
void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
|
||||
const DevMem2D_<short>& disp, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(disp.cols, threads.x);
|
||||
grid.y = divUp(disp.rows, threads.y);
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
|
||||
template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
} // namespace stereobp
|
||||
comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(left.cols, threads.x);
|
||||
grid.y = divUp(left.rows, threads.y);
|
||||
|
||||
comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////////// data step down ///////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
{
|
||||
for (int d = 0; d < cndisp; ++d)
|
||||
{
|
||||
float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
|
||||
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
|
||||
dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
|
||||
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];
|
||||
|
||||
dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(dst_cols, threads.x);
|
||||
grid.y = divUp(dst_rows, threads.y);
|
||||
|
||||
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////// level up messages ////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
{
|
||||
const size_t dst_disp_step = dst.step * dst_rows;
|
||||
const size_t src_disp_step = src.step * src_rows;
|
||||
|
||||
T* dstr = dst.ptr(y ) + x;
|
||||
const T* srcr = src.ptr(y/2) + x/2;
|
||||
|
||||
for (int d = 0; d < cndisp; ++d)
|
||||
dstr[d * dst_disp_step] = srcr[d * src_disp_step];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(dst_cols, threads.x);
|
||||
grid.y = divUp(dst_rows, threads.y);
|
||||
|
||||
int src_idx = (dst_idx + 1) & 1;
|
||||
|
||||
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
|
||||
template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////// calc all iterations /////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
__device__ void calc_min_linear_penalty(T* dst, size_t step)
|
||||
{
|
||||
float prev = dst[0];
|
||||
float cur;
|
||||
for (int disp = 1; disp < cndisp; ++disp)
|
||||
{
|
||||
prev += cdisc_single_jump;
|
||||
cur = dst[step * disp];
|
||||
if (prev < cur)
|
||||
{
|
||||
cur = prev;
|
||||
dst[step * disp] = saturate_cast<T>(prev);
|
||||
}
|
||||
prev = cur;
|
||||
}
|
||||
|
||||
prev = dst[(cndisp - 1) * step];
|
||||
for (int disp = cndisp - 2; disp >= 0; disp--)
|
||||
{
|
||||
prev += cdisc_single_jump;
|
||||
cur = dst[step * disp];
|
||||
if (prev < cur)
|
||||
{
|
||||
cur = prev;
|
||||
dst[step * disp] = saturate_cast<T>(prev);
|
||||
}
|
||||
prev = cur;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
|
||||
{
|
||||
float minimum = device::numeric_limits<float>::max();
|
||||
|
||||
for(int i = 0; i < cndisp; ++i)
|
||||
{
|
||||
float dst_reg = msg1[msg_disp_step * i];
|
||||
dst_reg += msg2[msg_disp_step * i];
|
||||
dst_reg += msg3[msg_disp_step * i];
|
||||
dst_reg += data[data_disp_step * i];
|
||||
|
||||
if (dst_reg < minimum)
|
||||
minimum = dst_reg;
|
||||
|
||||
dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
|
||||
}
|
||||
|
||||
calc_min_linear_penalty(dst, msg_disp_step);
|
||||
|
||||
minimum += cmax_disc_term;
|
||||
|
||||
float sum = 0;
|
||||
for(int i = 0; i < cndisp; ++i)
|
||||
{
|
||||
float dst_reg = dst[msg_disp_step * i];
|
||||
if (dst_reg > minimum)
|
||||
{
|
||||
dst_reg = minimum;
|
||||
dst[msg_disp_step * i] = saturate_cast<T>(minimum);
|
||||
}
|
||||
sum += dst_reg;
|
||||
}
|
||||
sum /= cndisp;
|
||||
|
||||
for(int i = 0; i < cndisp; ++i)
|
||||
dst[msg_disp_step * i] -= sum;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
|
||||
{
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
|
||||
|
||||
if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
|
||||
{
|
||||
T* us = u.ptr(y) + x;
|
||||
T* ds = d + y * u.step + x;
|
||||
T* ls = l + y * u.step + x;
|
||||
T* rs = r + y * u.step + x;
|
||||
const T* dt = data.ptr(y) + x;
|
||||
|
||||
size_t msg_disp_step = u.step * rows;
|
||||
size_t data_disp_step = data.step * rows;
|
||||
|
||||
message(us + u.step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
|
||||
message(ds - u.step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
|
||||
message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
|
||||
message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
|
||||
const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(cols, threads.x << 1);
|
||||
grid.y = divUp(rows, threads.y);
|
||||
|
||||
for(int t = 0; t < iters; ++t)
|
||||
{
|
||||
one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}
|
||||
|
||||
template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
|
||||
template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////////////// output ////////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
|
||||
DevMem2D_<short> disp)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
|
||||
{
|
||||
const T* us = u.ptr(y + 1) + x;
|
||||
const T* ds = d + (y - 1) * u.step + x;
|
||||
const T* ls = l + y * u.step + (x + 1);
|
||||
const T* rs = r + y * u.step + (x - 1);
|
||||
const T* dt = data + y * u.step + x;
|
||||
|
||||
size_t disp_step = disp.rows * u.step;
|
||||
|
||||
int best = 0;
|
||||
float best_val = numeric_limits<float>::max();
|
||||
for (int d = 0; d < cndisp; ++d)
|
||||
{
|
||||
float val = us[d * disp_step];
|
||||
val += ds[d * disp_step];
|
||||
val += ls[d * disp_step];
|
||||
val += rs[d * disp_step];
|
||||
val += dt[d * disp_step];
|
||||
|
||||
if (val < best_val)
|
||||
{
|
||||
best_val = val;
|
||||
best = d;
|
||||
}
|
||||
}
|
||||
|
||||
disp.ptr(y)[x] = saturate_cast<short>(best);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
|
||||
const DevMem2D_<short>& disp, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(32, 8, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
|
||||
grid.x = divUp(disp.cols, threads.x);
|
||||
grid.y = divUp(disp.rows, threads.y);
|
||||
|
||||
output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
|
||||
template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
|
||||
} // namespace stereobp
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -71,20 +71,19 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
|
||||
|
||||
#include "opencv2/gpu/stream_accessor.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);
|
||||
|
||||
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
|
||||
}}}
|
||||
|
||||
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
struct Stream::Impl
|
||||
{
|
||||
|
@ -123,19 +123,18 @@ namespace
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// add
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T, typename D>
|
||||
void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
@ -174,7 +173,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
|
||||
|
||||
void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
@ -236,19 +235,18 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// subtract
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T, typename D>
|
||||
void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
@ -287,7 +285,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
|
||||
|
||||
void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
|
||||
|
||||
@ -349,22 +347,21 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// multiply
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
|
||||
void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
|
||||
|
||||
void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
|
||||
void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
|
||||
template <typename T, typename D>
|
||||
void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T, typename D>
|
||||
void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
@ -422,7 +419,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
|
||||
|
||||
void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
@ -472,25 +469,24 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// divide
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
|
||||
void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
|
||||
|
||||
void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
|
||||
void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
|
||||
template <typename T, typename D>
|
||||
void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
template <typename T, typename D>
|
||||
void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename D>
|
||||
void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T, typename D>
|
||||
void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
@ -548,7 +544,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
|
||||
|
||||
void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
|
||||
|
||||
@ -597,7 +593,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
|
||||
|
||||
void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
@ -630,19 +626,18 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// absdiff
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T>
|
||||
void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
@ -714,7 +709,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
|
||||
|
||||
void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
@ -758,18 +753,17 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Comparison of two matrixes
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
@ -835,14 +829,13 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Unary bitwise logical operations
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T>
|
||||
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -850,13 +843,13 @@ namespace
|
||||
{
|
||||
dst.create(src.size(), src.type());
|
||||
|
||||
OPENCV_DEVICE_NAMESPACE_ bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
|
||||
::cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
|
||||
}
|
||||
|
||||
|
||||
void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
|
||||
|
||||
@ -893,24 +886,23 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Binary bitwise logical operations
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
|
||||
void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
|
||||
void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T>
|
||||
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -919,12 +911,12 @@ namespace
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
OPENCV_DEVICE_NAMESPACE_ bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
|
||||
::cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
|
||||
}
|
||||
|
||||
void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
|
||||
|
||||
@ -952,13 +944,13 @@ namespace
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
OPENCV_DEVICE_NAMESPACE_ bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
|
||||
::cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
|
||||
}
|
||||
|
||||
|
||||
void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
|
||||
|
||||
@ -986,13 +978,13 @@ namespace
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
OPENCV_DEVICE_NAMESPACE_ bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
|
||||
::cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
|
||||
}
|
||||
|
||||
|
||||
void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
|
||||
|
||||
@ -1046,21 +1038,20 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Minimum and maximum operations
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T>
|
||||
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -1069,14 +1060,14 @@ namespace
|
||||
{
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
|
||||
::cv::gpu::device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
|
||||
{
|
||||
dst.create(src1.size(), src1.type());
|
||||
OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
|
||||
::cv::gpu::device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -1084,14 +1075,14 @@ namespace
|
||||
{
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
|
||||
::cv::gpu::device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
|
||||
{
|
||||
dst.create(src1.size(), src1.type());
|
||||
OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
|
||||
::cv::gpu::device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1155,18 +1146,17 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// threshold
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <typename T>
|
||||
void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
|
||||
{
|
||||
OPENCV_DEVICE_NAMESPACE_ threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
|
||||
::cv::gpu::device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1223,16 +1213,15 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// pow
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template<typename T>
|
||||
void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template<typename T>
|
||||
void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
CV_Assert(src.depth() != CV_64F);
|
||||
dst.create(src.size(), src.type());
|
||||
@ -1252,16 +1241,15 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// addWeighted
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <typename T1, typename T2, typename D>
|
||||
void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T1, typename T2, typename D>
|
||||
void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
CV_Assert(src1.size() == src2.size());
|
||||
CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));
|
||||
|
@ -735,21 +735,20 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Separable Linear Filter
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace row_filter
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
}
|
||||
namespace row_filter
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
}
|
||||
|
||||
namespace column_filter
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace column_filter
|
||||
{
|
||||
template <typename T, typename D>
|
||||
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -803,7 +802,7 @@ namespace
|
||||
|
||||
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ row_filter;
|
||||
using namespace ::cv::gpu::device::row_filter;
|
||||
|
||||
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
|
||||
|
||||
@ -918,7 +917,7 @@ namespace
|
||||
|
||||
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ column_filter;
|
||||
using namespace ::cv::gpu::device::column_filter;
|
||||
|
||||
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};
|
||||
|
||||
|
@ -60,44 +60,43 @@ std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector64x128() { throw_nog
|
||||
|
||||
#else
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace hog
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
|
||||
int nblocks_win_x, int nblocks_win_y);
|
||||
namespace hog
|
||||
{
|
||||
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
|
||||
int nblocks_win_x, int nblocks_win_y);
|
||||
|
||||
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
|
||||
int height, int width, const cv::gpu::DevMem2Df& grad,
|
||||
const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);
|
||||
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
|
||||
int height, int width, const cv::gpu::DevMem2Df& grad,
|
||||
const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);
|
||||
|
||||
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
|
||||
int height, int width, float* block_hists, float threshold);
|
||||
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
|
||||
int height, int width, float* block_hists, float threshold);
|
||||
|
||||
void classify_hists(int win_height, int win_width, int block_stride_y,
|
||||
int block_stride_x, int win_stride_y, int win_stride_x, int height,
|
||||
int width, float* block_hists, float* coefs, float free_coef,
|
||||
float threshold, unsigned char* labels);
|
||||
void classify_hists(int win_height, int win_width, int block_stride_y,
|
||||
int block_stride_x, int win_stride_y, int win_stride_x, int height,
|
||||
int width, float* block_hists, float* coefs, float free_coef,
|
||||
float threshold, unsigned char* labels);
|
||||
|
||||
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
|
||||
cv::gpu::DevMem2Df descriptors);
|
||||
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
|
||||
cv::gpu::DevMem2Df descriptors);
|
||||
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
|
||||
cv::gpu::DevMem2Df descriptors);
|
||||
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
|
||||
cv::gpu::DevMem2Df descriptors);
|
||||
|
||||
void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
|
||||
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
|
||||
void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
|
||||
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
|
||||
void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
|
||||
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
|
||||
void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
|
||||
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
|
||||
|
||||
void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
|
||||
void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
|
||||
}
|
||||
void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
|
||||
void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
|
||||
}
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
using namespace ::cv::gpu::device;
|
||||
|
||||
cv::gpu::HOGDescriptor::HOGDescriptor(Size win_size, Size block_size, Size block_stride, Size cell_size,
|
||||
int nbins, double win_sigma, double threshold_L2hys, bool gamma_correction, int nlevels)
|
||||
|
@ -107,20 +107,19 @@ void cv::gpu::CannyBuf::release() { throw_nogpu(); }
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// remap
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst,
|
||||
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
template <typename T>
|
||||
void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst,
|
||||
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
|
||||
int borderMode, const float* borderValue, cudaStream_t stream, int cc);
|
||||
@ -160,18 +159,17 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// meanShiftFiltering_GPU
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
if( src.empty() )
|
||||
CV_Error( CV_StsBadArg, "The input image is empty" );
|
||||
@ -197,18 +195,17 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// meanShiftProc_GPU
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
if( src.empty() )
|
||||
CV_Error( CV_StsBadArg, "The input image is empty" );
|
||||
@ -235,22 +232,21 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// drawColorDisp
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
|
||||
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
|
||||
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T>
|
||||
void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
dst.create(src.size(), CV_8UC4);
|
||||
|
||||
@ -272,22 +268,21 @@ void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& s
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// reprojectImageTo3D
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
|
||||
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
|
||||
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T>
|
||||
void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
xyzw.create(disp.rows, disp.cols, CV_32FC4);
|
||||
|
||||
@ -309,14 +304,13 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q,
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// resize
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
|
||||
{
|
||||
@ -380,7 +374,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
|
||||
}
|
||||
else
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
static const caller_t callers[6][4] =
|
||||
@ -400,20 +394,19 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// copyMakeBorder
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T, int cn> void copyMakeBorder_caller(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
|
||||
|
||||
@ -666,21 +659,20 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// buildWarpPlaneMaps
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
|
||||
const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
|
||||
const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
|
||||
cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T,
|
||||
float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
|
||||
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
|
||||
@ -700,21 +692,20 @@ void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, cons
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// buildWarpCylyndricalMaps
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
|
||||
const float k_rinv[9], const float r_kinv[9], float scale,
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
|
||||
const float k_rinv[9], const float r_kinv[9], float scale,
|
||||
cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
|
||||
GpuMat& map_x, GpuMat& map_y, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
|
||||
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
|
||||
@ -733,21 +724,20 @@ void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// buildWarpSphericalMaps
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
|
||||
const float k_rinv[9], const float r_kinv[9], float scale,
|
||||
cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
|
||||
const float k_rinv[9], const float r_kinv[9], float scale,
|
||||
cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
|
||||
GpuMat& map_x, GpuMat& map_y, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
|
||||
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
|
||||
@ -899,18 +889,17 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// columnSum
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
CV_Assert(src.type() == CV_32F);
|
||||
|
||||
@ -1245,19 +1234,18 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
|
||||
hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace hist
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);
|
||||
namespace hist
|
||||
{
|
||||
void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);
|
||||
|
||||
const int PARTIAL_HISTOGRAM256_COUNT = 240;
|
||||
const int HISTOGRAM256_BIN_COUNT = 256;
|
||||
const int PARTIAL_HISTOGRAM256_COUNT = 240;
|
||||
const int HISTOGRAM256_BIN_COUNT = 256;
|
||||
|
||||
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
|
||||
{
|
||||
@ -1267,7 +1255,7 @@ void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
|
||||
|
||||
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ hist;
|
||||
using namespace ::cv::gpu::device::hist;
|
||||
|
||||
CV_Assert(src.type() == CV_8UC1);
|
||||
|
||||
@ -1293,7 +1281,7 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream&
|
||||
|
||||
void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ hist;
|
||||
using namespace ::cv::gpu::device::hist;
|
||||
|
||||
CV_Assert(src.type() == CV_8UC1);
|
||||
|
||||
@ -1327,16 +1315,15 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// cornerHarris & minEgenVal
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);
|
||||
void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
|
||||
void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);
|
||||
void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
|
||||
void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -1421,7 +1408,7 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D
|
||||
|
||||
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
CV_Assert(borderType == cv::BORDER_REFLECT101 ||
|
||||
borderType == cv::BORDER_REPLICATE);
|
||||
@ -1448,7 +1435,7 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
|
||||
|
||||
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
CV_Assert(borderType == cv::BORDER_REFLECT101 ||
|
||||
borderType == cv::BORDER_REPLICATE);
|
||||
@ -1464,20 +1451,19 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// mulSpectrums
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
|
||||
namespace imgproc
|
||||
{
|
||||
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
|
||||
|
||||
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);
|
||||
|
||||
@ -1495,20 +1481,19 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flag
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// mulAndScaleSpectrums
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
|
||||
namespace imgproc
|
||||
{
|
||||
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
|
||||
|
||||
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);
|
||||
static Caller callers[] = { mulAndScaleSpectrums, mulAndScaleSpectrums_CONJ };
|
||||
@ -1673,18 +1658,17 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
|
||||
convolve(image, templ, result, ccorr, buf);
|
||||
}
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
#ifndef HAVE_CUFFT
|
||||
|
||||
@ -1811,18 +1795,17 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// pyrDown
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
@ -1851,18 +1834,17 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& st
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// pyrUp
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace imgproc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace imgproc
|
||||
{
|
||||
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
|
||||
|
||||
@ -1933,31 +1915,30 @@ void cv::gpu::CannyBuf::release()
|
||||
trackBuf2.release();
|
||||
}
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace canny
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
|
||||
namespace canny
|
||||
{
|
||||
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
|
||||
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
|
||||
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
|
||||
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
|
||||
|
||||
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
|
||||
|
||||
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
|
||||
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
|
||||
|
||||
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
|
||||
|
||||
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
|
||||
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
|
||||
|
||||
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ canny;
|
||||
using namespace ::cv::gpu::device::canny;
|
||||
|
||||
calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
|
||||
|
||||
@ -1977,7 +1958,7 @@ void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double hi
|
||||
|
||||
void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ canny;
|
||||
using namespace ::cv::gpu::device::canny;
|
||||
|
||||
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
|
||||
CV_Assert(src.type() == CV_8UC1);
|
||||
@ -2016,7 +1997,7 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_
|
||||
|
||||
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ canny;
|
||||
using namespace ::cv::gpu::device::canny;
|
||||
|
||||
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
|
||||
CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
|
||||
|
@ -274,18 +274,17 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// GpuFuncTable
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);
|
||||
|
||||
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
|
||||
|
||||
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
@ -345,7 +344,7 @@ namespace
|
||||
|
||||
void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
|
||||
{
|
||||
OPENCV_DEVICE_NAMESPACE_ convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
|
||||
::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@ -403,7 +402,7 @@ namespace
|
||||
void kernelSet(GpuMat& src, Scalar s)
|
||||
{
|
||||
Scalar_<T> sf = s;
|
||||
OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, sf.val, src.channels(), 0);
|
||||
::cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), 0);
|
||||
}
|
||||
|
||||
template<int SDEPTH, int SCN> struct NppSetMaskFunc
|
||||
@ -458,7 +457,7 @@ namespace
|
||||
void kernelSetMask(GpuMat& src, Scalar s, const GpuMat& mask)
|
||||
{
|
||||
Scalar_<T> sf = s;
|
||||
OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, sf.val, mask, src.channels(), 0);
|
||||
::cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
|
||||
}
|
||||
|
||||
class CudaFuncTable : public GpuFuncTable
|
||||
@ -479,7 +478,7 @@ namespace
|
||||
|
||||
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
|
||||
{
|
||||
OPENCV_DEVICE_NAMESPACE_ copy_to_with_mask(src, dst, src.depth(), mask, src.channels());
|
||||
::cv::gpu::device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels());
|
||||
}
|
||||
|
||||
void convert(const GpuMat& src, GpuMat& dst) const
|
||||
@ -560,7 +559,7 @@ namespace
|
||||
|
||||
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
|
||||
{
|
||||
device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);
|
||||
::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);
|
||||
}
|
||||
|
||||
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
|
||||
|
@ -52,94 +52,93 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&)
|
||||
|
||||
#else
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace match_template
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
namespace match_template
|
||||
{
|
||||
void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
|
||||
void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
|
||||
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result,
|
||||
int cn, cudaStream_t stream);
|
||||
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result,
|
||||
int cn, cudaStream_t stream);
|
||||
|
||||
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result,
|
||||
int cn, cudaStream_t stream);
|
||||
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result,
|
||||
int cn, cudaStream_t stream);
|
||||
|
||||
void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_8UC2(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g,
|
||||
unsigned int templ_sum_r,
|
||||
unsigned int templ_sum_g,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_8UC3(
|
||||
int w, int h,
|
||||
void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_8UC2(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g,
|
||||
const DevMem2D_<unsigned int> image_sum_b,
|
||||
unsigned int templ_sum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g,
|
||||
unsigned int templ_sum_r,
|
||||
unsigned int templ_sum_g,
|
||||
unsigned int templ_sum_b,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_8UC4(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g,
|
||||
const DevMem2D_<unsigned int> image_sum_b,
|
||||
const DevMem2D_<unsigned int> image_sum_a,
|
||||
unsigned int templ_sum_r,
|
||||
unsigned int templ_sum_g,
|
||||
unsigned int templ_sum_b,
|
||||
unsigned int templ_sum_a,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_8UC3(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g,
|
||||
const DevMem2D_<unsigned int> image_sum_b,
|
||||
unsigned int templ_sum_r,
|
||||
unsigned int templ_sum_g,
|
||||
unsigned int templ_sum_b,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_8UC4(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g,
|
||||
const DevMem2D_<unsigned int> image_sum_b,
|
||||
const DevMem2D_<unsigned int> image_sum_a,
|
||||
unsigned int templ_sum_r,
|
||||
unsigned int templ_sum_g,
|
||||
unsigned int templ_sum_b,
|
||||
unsigned int templ_sum_a,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
|
||||
|
||||
void matchTemplatePrepared_CCOFF_NORMED_8U(
|
||||
int w, int h, const DevMem2D_<unsigned int> image_sum,
|
||||
const DevMem2D_<unsigned long long> image_sqsum,
|
||||
unsigned int templ_sum, unsigned int templ_sqsum,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
|
||||
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
|
||||
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
|
||||
const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
|
||||
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
|
||||
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
|
||||
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
|
||||
const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
|
||||
const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,
|
||||
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
|
||||
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
|
||||
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
|
||||
unsigned int templ_sum_a, unsigned int templ_sqsum_a,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_NORMED_8U(
|
||||
int w, int h, const DevMem2D_<unsigned int> image_sum,
|
||||
const DevMem2D_<unsigned long long> image_sqsum,
|
||||
unsigned int templ_sum, unsigned int templ_sqsum,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
|
||||
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
|
||||
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
|
||||
const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
|
||||
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
|
||||
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
|
||||
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
|
||||
int w, int h,
|
||||
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
|
||||
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
|
||||
const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
|
||||
const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,
|
||||
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
|
||||
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
|
||||
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
|
||||
unsigned int templ_sum_a, unsigned int templ_sqsum_a,
|
||||
DevMem2Df result, cudaStream_t stream);
|
||||
|
||||
void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
|
||||
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
|
||||
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
|
||||
void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
}
|
||||
void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ match_template;
|
||||
using namespace ::cv::gpu::device::match_template;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
@ -190,35 +190,34 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Sum
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace matrix_reductions
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace sum
|
||||
namespace matrix_reductions
|
||||
{
|
||||
template <typename T>
|
||||
void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
namespace sum
|
||||
{
|
||||
template <typename T>
|
||||
void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
|
||||
template <typename T>
|
||||
void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
template <typename T>
|
||||
void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
|
||||
template <typename T>
|
||||
void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
template <typename T>
|
||||
void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
|
||||
template <typename T>
|
||||
void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
template <typename T>
|
||||
void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
|
||||
template <typename T>
|
||||
void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
template <typename T>
|
||||
void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
|
||||
template <typename T>
|
||||
void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
template <typename T>
|
||||
void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
|
||||
|
||||
void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
|
||||
void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
}}}
|
||||
|
||||
Scalar cv::gpu::sum(const GpuMat& src)
|
||||
{
|
||||
@ -229,7 +228,7 @@ Scalar cv::gpu::sum(const GpuMat& src)
|
||||
|
||||
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;
|
||||
using namespace ::cv::gpu::device::matrix_reductions::sum;
|
||||
|
||||
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
|
||||
|
||||
@ -272,7 +271,7 @@ Scalar cv::gpu::absSum(const GpuMat& src)
|
||||
|
||||
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;
|
||||
using namespace ::cv::gpu::device::matrix_reductions::sum;
|
||||
|
||||
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
|
||||
|
||||
@ -316,7 +315,7 @@ Scalar cv::gpu::sqrSum(const GpuMat& src)
|
||||
|
||||
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;
|
||||
using namespace ::cv::gpu::device::matrix_reductions::sum;
|
||||
|
||||
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
|
||||
|
||||
@ -353,29 +352,28 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Find min or max
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace matrix_reductions
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace minmax
|
||||
namespace matrix_reductions
|
||||
{
|
||||
void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
|
||||
|
||||
template <typename T>
|
||||
void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);
|
||||
namespace minmax
|
||||
{
|
||||
void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
|
||||
|
||||
template <typename T>
|
||||
void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);
|
||||
|
||||
template <typename T>
|
||||
void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
|
||||
template <typename T>
|
||||
void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
|
||||
|
||||
template <typename T>
|
||||
void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);
|
||||
template <typename T>
|
||||
void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);
|
||||
|
||||
template <typename T>
|
||||
void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
|
||||
template <typename T>
|
||||
void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
}}}
|
||||
|
||||
|
||||
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
|
||||
@ -387,7 +385,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
|
||||
|
||||
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmax;
|
||||
using namespace ::cv::gpu::device::matrix_reductions::minmax;
|
||||
|
||||
typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);
|
||||
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
|
||||
@ -457,34 +455,33 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Locate min and max
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace matrix_reductions
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace minmaxloc
|
||||
namespace matrix_reductions
|
||||
{
|
||||
void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
|
||||
int& b1rows, int& b2cols, int& b2rows);
|
||||
namespace minmaxloc
|
||||
{
|
||||
void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
|
||||
int& b1rows, int& b2cols, int& b2rows);
|
||||
|
||||
template <typename T>
|
||||
void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
|
||||
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
|
||||
|
||||
template <typename T>
|
||||
void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
|
||||
template <typename T>
|
||||
void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
|
||||
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
|
||||
|
||||
template <typename T>
|
||||
void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
|
||||
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
|
||||
template <typename T>
|
||||
void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
|
||||
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
|
||||
|
||||
template <typename T>
|
||||
void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
|
||||
template <typename T>
|
||||
void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
|
||||
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
|
||||
}
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T>
|
||||
void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
|
||||
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
|
||||
}
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
|
||||
{
|
||||
@ -495,7 +492,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
|
||||
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
|
||||
const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmaxloc;
|
||||
using namespace ::cv::gpu::device::matrix_reductions::minmaxloc;
|
||||
|
||||
typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
|
||||
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
|
||||
@ -571,23 +568,22 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Count non-zero elements
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace matrix_reductions
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace countnonzero
|
||||
namespace matrix_reductions
|
||||
{
|
||||
void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
|
||||
namespace countnonzero
|
||||
{
|
||||
void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
|
||||
|
||||
template <typename T>
|
||||
int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);
|
||||
template <typename T>
|
||||
int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);
|
||||
|
||||
template <typename T>
|
||||
int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);
|
||||
template <typename T>
|
||||
int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
}}}
|
||||
|
||||
int cv::gpu::countNonZero(const GpuMat& src)
|
||||
{
|
||||
@ -598,7 +594,7 @@ int cv::gpu::countNonZero(const GpuMat& src)
|
||||
|
||||
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::countnonzero;
|
||||
using namespace ::cv::gpu::device::matrix_reductions::countnonzero;
|
||||
|
||||
typedef int (*Caller)(const DevMem2Db src, PtrStepb buf);
|
||||
|
||||
@ -632,19 +628,19 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// reduce
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace matrix_reductions
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
|
||||
template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace matrix_reductions
|
||||
{
|
||||
template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
|
||||
template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions;
|
||||
using namespace ::cv::gpu::device::matrix_reductions;
|
||||
|
||||
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
|
||||
CV_Assert(dim == 0 || dim == 1);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -46,178 +46,177 @@
|
||||
#include "internal_shared.hpp"
|
||||
#include "detail/color_detail.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
// All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements
|
||||
// template <typename T> class ColorSpace1_to_ColorSpace2_traits
|
||||
// {
|
||||
// typedef ... functor_type;
|
||||
// static __host__ __device__ functor_type create_functor();
|
||||
// };
|
||||
|
||||
// All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements
|
||||
// template <typename T> class ColorSpace1_to_ColorSpace2_traits
|
||||
// {
|
||||
// typedef ... functor_type;
|
||||
// static __host__ __device__ functor_type create_functor();
|
||||
// };
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
|
||||
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
|
||||
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
|
||||
#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
|
||||
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
|
||||
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
|
||||
#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
|
||||
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)
|
||||
#undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
|
||||
#undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
|
||||
#undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
|
||||
#undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
|
||||
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
|
||||
|
@ -45,16 +45,8 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
#if defined(_WIN64) || defined(__LP64__)
|
||||
// 64-bit register modifier for inlined asm
|
||||
#define OPENCV_GPU_ASM_PTR "l"
|
||||
#else
|
||||
// 32-bit register modifier for inlined asm
|
||||
#define OPENCV_GPU_ASM_PTR "r"
|
||||
#endif
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 200
|
||||
|
||||
// for Fermi memory space is detected automatically
|
||||
@ -65,6 +57,14 @@ BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
#else // __CUDA_ARCH__ >= 200
|
||||
|
||||
#if defined(_WIN64) || defined(__LP64__)
|
||||
// 64-bit register modifier for inlined asm
|
||||
#define OPENCV_GPU_ASM_PTR "l"
|
||||
#else
|
||||
// 32-bit register modifier for inlined asm
|
||||
#define OPENCV_GPU_ASM_PTR "r"
|
||||
#endif
|
||||
|
||||
template<class T> struct ForceGlob;
|
||||
|
||||
#define OPENCV_GPU_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
|
||||
@ -85,21 +85,21 @@ BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
|
||||
OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)
|
||||
|
||||
#undef OPENCV_GPU_DEFINE_FORCE_GLOB
|
||||
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
|
||||
#undef OPENCV_GPU_DEFINE_FORCE_GLOB
|
||||
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
|
||||
#undef OPENCV_GPU_ASM_PTR
|
||||
|
||||
#endif // __CUDA_ARCH__ >= 200
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -47,365 +47,348 @@
|
||||
#include "../vec_traits.hpp"
|
||||
#include "../functional.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace detail
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
//! Mask accessor
|
||||
|
||||
struct MaskReader
|
||||
namespace transform_detail
|
||||
{
|
||||
explicit MaskReader(const PtrStepb& mask_): mask(mask_) {}
|
||||
//! Read Write Traits
|
||||
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }
|
||||
|
||||
const PtrStepb mask;
|
||||
};
|
||||
|
||||
struct NoMask
|
||||
{
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const { return true; }
|
||||
};
|
||||
|
||||
//! Read Write Traits
|
||||
|
||||
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
|
||||
{
|
||||
typedef typename TypeVec<T, shift>::vec_type read_type;
|
||||
typedef typename TypeVec<D, shift>::vec_type write_type;
|
||||
};
|
||||
|
||||
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
|
||||
{
|
||||
typedef typename TypeVec<T1, shift>::vec_type read_type1;
|
||||
typedef typename TypeVec<T2, shift>::vec_type read_type2;
|
||||
typedef typename TypeVec<D, shift>::vec_type write_type;
|
||||
};
|
||||
|
||||
//! Transform kernels
|
||||
|
||||
template <int shift> struct OpUnroller;
|
||||
template <> struct OpUnroller<1>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
|
||||
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
}
|
||||
typedef typename TypeVec<T, shift>::vec_type read_type;
|
||||
typedef typename TypeVec<D, shift>::vec_type write_type;
|
||||
};
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
|
||||
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<2>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
}
|
||||
typedef typename TypeVec<T1, shift>::vec_type read_type1;
|
||||
typedef typename TypeVec<T2, shift>::vec_type read_type2;
|
||||
typedef typename TypeVec<D, shift>::vec_type write_type;
|
||||
};
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<3>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src.z);
|
||||
}
|
||||
//! Transform kernels
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
template <int shift> struct OpUnroller;
|
||||
template <> struct OpUnroller<1>
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src1.z, src2.z);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<4>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src.z);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.w = op(src.w);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src1.z, src2.z);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.w = op(src1.w, src2.w);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<8>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src.a7);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src1.a0, src2.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src1.a1, src2.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src1.a2, src2.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src1.a3, src2.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src1.a4, src2.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src1.a5, src2.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src1.a6, src2.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src1.a7, src2.a7);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
__global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src_.rows)
|
||||
{
|
||||
const T* src = src_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + ft::smart_shift - 1 < src_.cols)
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
const read_type src_n_el = ((const read_type*)src)[x];
|
||||
write_type dst_n_el;
|
||||
|
||||
OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
|
||||
((write_type*)dst)[x] = dst_n_el;
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
}
|
||||
else
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<2>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<3>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src.z);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src1.z, src2.z);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<4>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src.z);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.w = op(src.w);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src1.z, src2.z);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.w = op(src1.w, src2.w);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<8>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src.a7);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src1.a0, src2.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src1.a1, src2.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src1.a2, src2.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src1.a3, src2.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src1.a4, src2.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src1.a5, src2.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src1.a6, src2.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src1.a7, src2.a7);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
__global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src_.rows)
|
||||
{
|
||||
const T* src = src_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + ft::smart_shift - 1 < src_.cols)
|
||||
{
|
||||
if (mask(y, real_x))
|
||||
dst[real_x] = op(src[real_x]);
|
||||
const read_type src_n_el = ((const read_type*)src)[x];
|
||||
write_type dst_n_el;
|
||||
|
||||
OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
|
||||
((write_type*)dst)[x] = dst_n_el;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
|
||||
{
|
||||
if (mask(y, real_x))
|
||||
dst[real_x] = op(src[real_x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < src.cols && y < src.rows && mask(y, x))
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
|
||||
{
|
||||
dst.ptr(y)[x] = op(src.ptr(y)[x]);
|
||||
}
|
||||
}
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
__global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src1_.rows)
|
||||
{
|
||||
const T1* src1 = src1_.ptr(y);
|
||||
const T2* src2 = src2_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + ft::smart_shift - 1 < src1_.cols)
|
||||
if (x < src.cols && y < src.rows && mask(y, x))
|
||||
{
|
||||
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
|
||||
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
|
||||
write_type dst_n_el;
|
||||
|
||||
OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
|
||||
((write_type*)dst)[x] = dst_n_el;
|
||||
dst.ptr(y)[x] = op(src.ptr(y)[x]);
|
||||
}
|
||||
else
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
__global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src1_.rows)
|
||||
{
|
||||
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
|
||||
const T1* src1 = src1_.ptr(y);
|
||||
const T2* src2 = src2_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + ft::smart_shift - 1 < src1_.cols)
|
||||
{
|
||||
if (mask(y, real_x))
|
||||
dst[real_x] = op(src1[real_x], src2[real_x]);
|
||||
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
|
||||
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
|
||||
write_type dst_n_el;
|
||||
|
||||
OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
|
||||
((write_type*)dst)[x] = dst_n_el;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
|
||||
{
|
||||
if (mask(y, real_x))
|
||||
dst[real_x] = op(src1[real_x], src2[real_x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < src1.cols && y < src1.rows && mask(y, x))
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
const T1 src1_data = src1.ptr(y)[x];
|
||||
const T2 src2_data = src2.ptr(y)[x];
|
||||
dst.ptr(y)[x] = op(src1_data, src2_data);
|
||||
}
|
||||
}
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < src1.cols && y < src1.rows && mask(y, x))
|
||||
{
|
||||
const T1 src1_data = src1.ptr(y)[x];
|
||||
const T2 src2_data = src2.ptr(y)[x];
|
||||
dst.ptr(y)[x] = op(src1_data, src2_data);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool UseSmart> struct TransformDispatcher;
|
||||
template<> struct TransformDispatcher<false>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
template<> struct TransformDispatcher<true>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
template <bool UseSmart> struct TransformDispatcher;
|
||||
template<> struct TransformDispatcher<false>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
|
||||
}
|
||||
};
|
||||
template<> struct TransformDispatcher<true>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
|
||||
}
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
} // namespace transform_detail
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
|
||||
|
@ -46,143 +46,142 @@
|
||||
#include "internal_shared.hpp"
|
||||
#include "../vec_traits.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace detail
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
|
||||
template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
|
||||
namespace type_traits_detail
|
||||
{
|
||||
template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
|
||||
template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
|
||||
|
||||
template <typename T> struct IsSignedIntergral { enum {value = 0}; };
|
||||
template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
|
||||
template <typename T> struct IsSignedIntergral { enum {value = 0}; };
|
||||
template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
|
||||
template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
|
||||
template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
|
||||
template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };
|
||||
template <> struct IsIntegral<char> { enum {value = 1}; };
|
||||
template <> struct IsIntegral<bool> { enum {value = 1}; };
|
||||
template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };
|
||||
template <> struct IsIntegral<char> { enum {value = 1}; };
|
||||
template <> struct IsIntegral<bool> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsFloat { enum {value = 0}; };
|
||||
template <> struct IsFloat<float> { enum {value = 1}; };
|
||||
template <> struct IsFloat<double> { enum {value = 1}; };
|
||||
template <typename T> struct IsFloat { enum {value = 0}; };
|
||||
template <> struct IsFloat<float> { enum {value = 1}; };
|
||||
template <> struct IsFloat<double> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsVec { enum {value = 0}; };
|
||||
template <> struct IsVec<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar2> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar3> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar4> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar8> { enum {value = 1}; };
|
||||
template <> struct IsVec<char1> { enum {value = 1}; };
|
||||
template <> struct IsVec<char2> { enum {value = 1}; };
|
||||
template <> struct IsVec<char3> { enum {value = 1}; };
|
||||
template <> struct IsVec<char4> { enum {value = 1}; };
|
||||
template <> struct IsVec<char8> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort2> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort3> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort4> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort8> { enum {value = 1}; };
|
||||
template <> struct IsVec<short1> { enum {value = 1}; };
|
||||
template <> struct IsVec<short2> { enum {value = 1}; };
|
||||
template <> struct IsVec<short3> { enum {value = 1}; };
|
||||
template <> struct IsVec<short4> { enum {value = 1}; };
|
||||
template <> struct IsVec<short8> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint1> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint2> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint3> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint4> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint8> { enum {value = 1}; };
|
||||
template <> struct IsVec<int1> { enum {value = 1}; };
|
||||
template <> struct IsVec<int2> { enum {value = 1}; };
|
||||
template <> struct IsVec<int3> { enum {value = 1}; };
|
||||
template <> struct IsVec<int4> { enum {value = 1}; };
|
||||
template <> struct IsVec<int8> { enum {value = 1}; };
|
||||
template <> struct IsVec<float1> { enum {value = 1}; };
|
||||
template <> struct IsVec<float2> { enum {value = 1}; };
|
||||
template <> struct IsVec<float3> { enum {value = 1}; };
|
||||
template <> struct IsVec<float4> { enum {value = 1}; };
|
||||
template <> struct IsVec<float8> { enum {value = 1}; };
|
||||
template <> struct IsVec<double1> { enum {value = 1}; };
|
||||
template <> struct IsVec<double2> { enum {value = 1}; };
|
||||
template <> struct IsVec<double3> { enum {value = 1}; };
|
||||
template <> struct IsVec<double4> { enum {value = 1}; };
|
||||
template <> struct IsVec<double8> { enum {value = 1}; };
|
||||
template <typename T> struct IsVec { enum {value = 0}; };
|
||||
template <> struct IsVec<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar2> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar3> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar4> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar8> { enum {value = 1}; };
|
||||
template <> struct IsVec<char1> { enum {value = 1}; };
|
||||
template <> struct IsVec<char2> { enum {value = 1}; };
|
||||
template <> struct IsVec<char3> { enum {value = 1}; };
|
||||
template <> struct IsVec<char4> { enum {value = 1}; };
|
||||
template <> struct IsVec<char8> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort2> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort3> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort4> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort8> { enum {value = 1}; };
|
||||
template <> struct IsVec<short1> { enum {value = 1}; };
|
||||
template <> struct IsVec<short2> { enum {value = 1}; };
|
||||
template <> struct IsVec<short3> { enum {value = 1}; };
|
||||
template <> struct IsVec<short4> { enum {value = 1}; };
|
||||
template <> struct IsVec<short8> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint1> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint2> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint3> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint4> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint8> { enum {value = 1}; };
|
||||
template <> struct IsVec<int1> { enum {value = 1}; };
|
||||
template <> struct IsVec<int2> { enum {value = 1}; };
|
||||
template <> struct IsVec<int3> { enum {value = 1}; };
|
||||
template <> struct IsVec<int4> { enum {value = 1}; };
|
||||
template <> struct IsVec<int8> { enum {value = 1}; };
|
||||
template <> struct IsVec<float1> { enum {value = 1}; };
|
||||
template <> struct IsVec<float2> { enum {value = 1}; };
|
||||
template <> struct IsVec<float3> { enum {value = 1}; };
|
||||
template <> struct IsVec<float4> { enum {value = 1}; };
|
||||
template <> struct IsVec<float8> { enum {value = 1}; };
|
||||
template <> struct IsVec<double1> { enum {value = 1}; };
|
||||
template <> struct IsVec<double2> { enum {value = 1}; };
|
||||
template <> struct IsVec<double3> { enum {value = 1}; };
|
||||
template <> struct IsVec<double4> { enum {value = 1}; };
|
||||
template <> struct IsVec<double8> { enum {value = 1}; };
|
||||
|
||||
template <class U> struct AddParameterType { typedef const U& type; };
|
||||
template <class U> struct AddParameterType<U&> { typedef U& type; };
|
||||
template <> struct AddParameterType<void> { typedef void type; };
|
||||
template <class U> struct AddParameterType { typedef const U& type; };
|
||||
template <class U> struct AddParameterType<U&> { typedef U& type; };
|
||||
template <> struct AddParameterType<void> { typedef void type; };
|
||||
|
||||
template <class U> struct ReferenceTraits
|
||||
{
|
||||
enum { value = false };
|
||||
typedef U type;
|
||||
};
|
||||
template <class U> struct ReferenceTraits<U&>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
|
||||
template <class U> struct PointerTraits
|
||||
{
|
||||
enum { value = false };
|
||||
typedef void type;
|
||||
};
|
||||
template <class U> struct PointerTraits<U*>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
template <class U> struct PointerTraits<U*&>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
|
||||
template <class U> struct UnConst
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 0 };
|
||||
};
|
||||
template <class U> struct UnConst<const U>
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <class U> struct UnConst<const U&>
|
||||
{
|
||||
typedef U& type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <class U> struct ReferenceTraits
|
||||
{
|
||||
enum { value = false };
|
||||
typedef U type;
|
||||
};
|
||||
template <class U> struct ReferenceTraits<U&>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
|
||||
template <class U> struct PointerTraits
|
||||
{
|
||||
enum { value = false };
|
||||
typedef void type;
|
||||
};
|
||||
template <class U> struct PointerTraits<U*>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
template <class U> struct PointerTraits<U*&>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
|
||||
template <class U> struct UnConst
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 0 };
|
||||
};
|
||||
template <class U> struct UnConst<const U>
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <class U> struct UnConst<const U&>
|
||||
{
|
||||
typedef U& type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
|
||||
template <class U> struct UnVolatile
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 0 };
|
||||
};
|
||||
template <class U> struct UnVolatile<volatile U>
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <class U> struct UnVolatile<volatile U&>
|
||||
{
|
||||
typedef U& type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <class U> struct UnVolatile
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 0 };
|
||||
};
|
||||
template <class U> struct UnVolatile<volatile U>
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <class U> struct UnVolatile<volatile U&>
|
||||
{
|
||||
typedef U& type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
} // namespace type_traits_detail
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -46,74 +46,73 @@
|
||||
#include "internal_shared.hpp"
|
||||
#include "../datamov_utils.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace detail
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
|
||||
namespace vec_distance_detail
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
|
||||
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
|
||||
{
|
||||
if (ind < len)
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
|
||||
{
|
||||
if (ind < len)
|
||||
{
|
||||
T1 val1 = *vecCached++;
|
||||
|
||||
T2 val2;
|
||||
ForceGlob<T2>::Load(vecGlob, ind, val2);
|
||||
|
||||
dist.reduceIter(val1, val2);
|
||||
|
||||
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
|
||||
{
|
||||
T1 val1 = *vecCached++;
|
||||
|
||||
T2 val2;
|
||||
ForceGlob<T2>::Load(vecGlob, ind, val2);
|
||||
ForceGlob<T2>::Load(vecGlob, 0, val2);
|
||||
vecGlob += THREAD_DIM;
|
||||
|
||||
dist.reduceIter(val1, val2);
|
||||
|
||||
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
|
||||
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
|
||||
};
|
||||
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
|
||||
{
|
||||
T1 val1 = *vecCached++;
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
|
||||
{
|
||||
}
|
||||
|
||||
T2 val2;
|
||||
ForceGlob<T2>::Load(vecGlob, 0, val2);
|
||||
vecGlob += THREAD_DIM;
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
dist.reduceIter(val1, val2);
|
||||
|
||||
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
|
||||
}
|
||||
};
|
||||
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
|
||||
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
|
||||
{
|
||||
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
|
||||
}
|
||||
};
|
||||
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
|
||||
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
|
||||
{
|
||||
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
|
||||
}
|
||||
};
|
||||
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
|
||||
{
|
||||
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
|
||||
{
|
||||
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
|
||||
}
|
||||
};
|
||||
} // namespace vec_distance_detail
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
|
||||
|
@ -46,23 +46,22 @@
|
||||
#include "internal_shared.hpp"
|
||||
#include "warp_reduce.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
struct Emulation
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 200
|
||||
(void)cta_buffer;
|
||||
return __ballot(predicate);
|
||||
#else
|
||||
int tid = threadIdx.x;
|
||||
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
|
||||
return warp_reduce(cta_buffer);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
struct Emulation
|
||||
{
|
||||
static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 200
|
||||
(void)cta_buffer;
|
||||
return __ballot(predicate);
|
||||
#else
|
||||
int tid = threadIdx.x;
|
||||
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
|
||||
return warp_reduce(cta_buffer);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif /* OPENCV_GPU_EMULATION_HPP_ */
|
@ -48,90 +48,89 @@
|
||||
#include "vec_traits.hpp"
|
||||
#include "vec_math.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <typename Ptr2D> struct PointFilter
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef typename Ptr2D::elem_type elem_type;
|
||||
typedef float index_type;
|
||||
|
||||
explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}
|
||||
|
||||
__device__ __forceinline__ elem_type operator ()(float y, float x) const
|
||||
template <typename Ptr2D> struct PointFilter
|
||||
{
|
||||
return src(__float2int_rn(y), __float2int_rn(x));
|
||||
}
|
||||
typedef typename Ptr2D::elem_type elem_type;
|
||||
typedef float index_type;
|
||||
|
||||
const Ptr2D src;
|
||||
};
|
||||
explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}
|
||||
|
||||
__device__ __forceinline__ elem_type operator ()(float y, float x) const
|
||||
{
|
||||
return src(__float2int_rn(y), __float2int_rn(x));
|
||||
}
|
||||
|
||||
template <typename Ptr2D> struct LinearFilter
|
||||
{
|
||||
typedef typename Ptr2D::elem_type elem_type;
|
||||
typedef float index_type;
|
||||
const Ptr2D src;
|
||||
};
|
||||
|
||||
explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}
|
||||
|
||||
__device__ __forceinline__ elem_type operator ()(float y, float x) const
|
||||
template <typename Ptr2D> struct LinearFilter
|
||||
{
|
||||
typedef typename Ptr2D::elem_type elem_type;
|
||||
typedef float index_type;
|
||||
|
||||
explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}
|
||||
|
||||
__device__ __forceinline__ elem_type operator ()(float y, float x) const
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
|
||||
|
||||
work_type out = VecTraits<work_type>::all(0);
|
||||
|
||||
const int x1 = __float2int_rd(x);
|
||||
const int y1 = __float2int_rd(y);
|
||||
const int x2 = x1 + 1;
|
||||
const int y2 = y1 + 1;
|
||||
|
||||
elem_type src_reg = src(y1, x1);
|
||||
out = out + src_reg * ((x2 - x) * (y2 - y));
|
||||
|
||||
src_reg = src(y1, x2);
|
||||
out = out + src_reg * ((x - x1) * (y2 - y));
|
||||
|
||||
src_reg = src(y2, x1);
|
||||
out = out + src_reg * ((x2 - x) * (y - y1));
|
||||
|
||||
src_reg = src(y2, x2);
|
||||
out = out + src_reg * ((x - x1) * (y - y1));
|
||||
|
||||
return saturate_cast<elem_type>(out);
|
||||
}
|
||||
|
||||
const Ptr2D src;
|
||||
};
|
||||
|
||||
template <typename Ptr2D> struct CubicFilter
|
||||
{
|
||||
typedef typename Ptr2D::elem_type elem_type;
|
||||
typedef float index_type;
|
||||
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
|
||||
|
||||
work_type out = VecTraits<work_type>::all(0);
|
||||
|
||||
const int x1 = __float2int_rd(x);
|
||||
const int y1 = __float2int_rd(y);
|
||||
const int x2 = x1 + 1;
|
||||
const int y2 = y1 + 1;
|
||||
|
||||
elem_type src_reg = src(y1, x1);
|
||||
out = out + src_reg * ((x2 - x) * (y2 - y));
|
||||
|
||||
src_reg = src(y1, x2);
|
||||
out = out + src_reg * ((x - x1) * (y2 - y));
|
||||
|
||||
src_reg = src(y2, x1);
|
||||
out = out + src_reg * ((x2 - x) * (y - y1));
|
||||
|
||||
src_reg = src(y2, x2);
|
||||
out = out + src_reg * ((x - x1) * (y - y1));
|
||||
|
||||
return saturate_cast<elem_type>(out);
|
||||
}
|
||||
|
||||
const Ptr2D src;
|
||||
};
|
||||
|
||||
template <typename Ptr2D> struct CubicFilter
|
||||
{
|
||||
typedef typename Ptr2D::elem_type elem_type;
|
||||
typedef float index_type;
|
||||
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
|
||||
|
||||
explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}
|
||||
|
||||
static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x)
|
||||
{
|
||||
return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));
|
||||
}
|
||||
|
||||
__device__ elem_type operator ()(float y, float x) const
|
||||
{
|
||||
const int xi = __float2int_rn(x);
|
||||
const int yi = __float2int_rn(y);
|
||||
explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}
|
||||
|
||||
work_type arr[4];
|
||||
|
||||
arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), x - xi);
|
||||
arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi , xi - 1)), saturate_cast<work_type>(src(yi , xi)), saturate_cast<work_type>(src(yi , xi + 1)), saturate_cast<work_type>(src(yi , xi + 2)), x - xi);
|
||||
arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), x - xi);
|
||||
arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), x - xi);
|
||||
|
||||
return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], y - yi));
|
||||
}
|
||||
static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x)
|
||||
{
|
||||
return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));
|
||||
}
|
||||
|
||||
const Ptr2D src;
|
||||
};
|
||||
__device__ elem_type operator ()(float y, float x) const
|
||||
{
|
||||
const int xi = __float2int_rn(x);
|
||||
const int yi = __float2int_rn(y);
|
||||
|
||||
work_type arr[4];
|
||||
|
||||
arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), x - xi);
|
||||
arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi , xi - 1)), saturate_cast<work_type>(src(yi , xi)), saturate_cast<work_type>(src(yi , xi + 1)), saturate_cast<work_type>(src(yi , xi + 2)), x - xi);
|
||||
arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), x - xi);
|
||||
arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), x - xi);
|
||||
|
||||
return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], y - yi));
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
const Ptr2D src;
|
||||
};
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_FILTERS_HPP__
|
||||
|
@ -47,28 +47,27 @@
|
||||
#include <cstdio>
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template<class Func>
|
||||
void printFuncAttrib(Func& func)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template<class Func>
|
||||
void printFuncAttrib(Func& func)
|
||||
{
|
||||
|
||||
cudaFuncAttributes attrs;
|
||||
cudaFuncGetAttributes(&attrs, func);
|
||||
cudaFuncAttributes attrs;
|
||||
cudaFuncGetAttributes(&attrs, func);
|
||||
|
||||
printf("=== Function stats ===\n");
|
||||
printf("Name: \n");
|
||||
printf("sharedSizeBytes = %d\n", attrs.sharedSizeBytes);
|
||||
printf("constSizeBytes = %d\n", attrs.constSizeBytes);
|
||||
printf("localSizeBytes = %d\n", attrs.localSizeBytes);
|
||||
printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);
|
||||
printf("numRegs = %d\n", attrs.numRegs);
|
||||
printf("ptxVersion = %d\n", attrs.ptxVersion);
|
||||
printf("binaryVersion = %d\n", attrs.binaryVersion);
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
printf("=== Function stats ===\n");
|
||||
printf("Name: \n");
|
||||
printf("sharedSizeBytes = %d\n", attrs.sharedSizeBytes);
|
||||
printf("constSizeBytes = %d\n", attrs.constSizeBytes);
|
||||
printf("localSizeBytes = %d\n", attrs.localSizeBytes);
|
||||
printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);
|
||||
printf("numRegs = %d\n", attrs.numRegs);
|
||||
printf("ptxVersion = %d\n", attrs.ptxVersion);
|
||||
printf("binaryVersion = %d\n", attrs.binaryVersion);
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */
|
@ -49,182 +49,182 @@
|
||||
#include "vec_traits.hpp"
|
||||
#include "type_traits.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
// Function Objects
|
||||
|
||||
// Function Objects
|
||||
using thrust::unary_function;
|
||||
using thrust::binary_function;
|
||||
|
||||
using thrust::unary_function;
|
||||
using thrust::binary_function;
|
||||
// Arithmetic Operations
|
||||
|
||||
// Arithmetic Operations
|
||||
template <typename T> struct plus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct minus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a - b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct multiplies : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct divides : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct modulus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a % b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct negate : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||
{
|
||||
return -a;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct plus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct minus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a - b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct multiplies : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct divides : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct modulus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a % b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct negate : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||
{
|
||||
return -a;
|
||||
}
|
||||
};
|
||||
// Comparison Operations
|
||||
|
||||
// Comparison Operations
|
||||
template <typename T> struct equal_to : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a == b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct not_equal_to : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a != b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct greater : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a > b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct less : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a < b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct greater_equal : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a >= b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct less_equal : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a <= b;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct equal_to : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a == b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct not_equal_to : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a != b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct greater : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a > b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct less : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a < b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct greater_equal : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a >= b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct less_equal : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a <= b;
|
||||
}
|
||||
};
|
||||
// Logical Operations
|
||||
|
||||
// Logical Operations
|
||||
template <typename T> struct logical_and : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a && b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct logical_or : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a || b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct logical_not : unary_function<T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||
{
|
||||
return !a;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct logical_and : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a && b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct logical_or : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a || b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct logical_not : unary_function<T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||
{
|
||||
return !a;
|
||||
}
|
||||
};
|
||||
// Bitwise Operations
|
||||
|
||||
// Bitwise Operations
|
||||
template <typename T> struct bit_and : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a & b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_or : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a | b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_xor : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a ^ b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_not : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
|
||||
{
|
||||
return ~v;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct bit_and : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a & b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_or : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a | b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_xor : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a ^ b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_not : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
|
||||
{
|
||||
return ~v;
|
||||
}
|
||||
};
|
||||
// Generalized Identity Operations
|
||||
|
||||
// Generalized Identity Operations
|
||||
template <typename T> struct identity : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct identity : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
|
||||
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
|
||||
{
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
|
||||
{
|
||||
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
|
||||
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
|
||||
{
|
||||
return lhs;
|
||||
}
|
||||
};
|
||||
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
|
||||
{
|
||||
return lhs;
|
||||
}
|
||||
};
|
||||
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
|
||||
{
|
||||
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
|
||||
{
|
||||
return rhs;
|
||||
}
|
||||
};
|
||||
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
|
||||
{
|
||||
return rhs;
|
||||
}
|
||||
};
|
||||
|
||||
// Min/Max Operations
|
||||
|
||||
@ -234,39 +234,41 @@ template <typename T1, typename T2> struct project2nd : binary_function<T1, T2,
|
||||
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
|
||||
};
|
||||
|
||||
template <typename T> struct maximum : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
template <typename T> struct maximum : binary_function<T, T, T>
|
||||
{
|
||||
return lhs < rhs ? rhs : lhs;
|
||||
}
|
||||
};
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, ::fmax)
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
{
|
||||
return lhs < rhs ? rhs : lhs;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct minimum : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, ::max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, ::fmax)
|
||||
|
||||
template <typename T> struct minimum : binary_function<T, T, T>
|
||||
{
|
||||
return lhs < rhs ? lhs : rhs;
|
||||
}
|
||||
};
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, ::fmin)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
{
|
||||
return lhs < rhs ? lhs : rhs;
|
||||
}
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, ::min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, ::fmin)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_MINMAX
|
||||
|
||||
@ -287,6 +289,7 @@ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)
|
||||
return func(v); \
|
||||
} \
|
||||
};
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
|
||||
template <typename T> struct name ## _func : binary_function<T, T, float> \
|
||||
{ \
|
||||
@ -303,259 +306,258 @@ OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)
|
||||
} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log, ::log)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2, ::log2)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10, ::log10)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin, ::sin)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos, ::cos)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan, ::tan)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin, ::asin)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos, ::acos)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan, ::atan)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log, ::log)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2, ::log2)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10, ::log10)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin, ::sin)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos, ::cos)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan, ::tan)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin, ::asin)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos, ::acos)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan, ::atan)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)
|
||||
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot)
|
||||
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)
|
||||
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)
|
||||
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot)
|
||||
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)
|
||||
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
|
||||
#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
|
||||
#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
|
||||
#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
|
||||
|
||||
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
|
||||
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
|
||||
{
|
||||
return src1 * src1 + src2 * src2;
|
||||
}
|
||||
};
|
||||
|
||||
// Saturate Cast Functor
|
||||
|
||||
template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
|
||||
{
|
||||
__device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const
|
||||
{
|
||||
return saturate_cast<D>(v);
|
||||
}
|
||||
};
|
||||
|
||||
// Threshold Functors
|
||||
|
||||
template <typename T> struct thresh_binary_func : unary_function<T, T>
|
||||
{
|
||||
__host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return (src > thresh) * maxVal;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
const T maxVal;
|
||||
};
|
||||
template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
|
||||
{
|
||||
__host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return (src <= thresh) * maxVal;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
const T maxVal;
|
||||
};
|
||||
template <typename T> struct thresh_trunc_func : unary_function<T, T>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return minimum<T>()(src, thresh);
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
};
|
||||
template <typename T> struct thresh_to_zero_func : unary_function<T, T>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return (src > thresh) * src;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
};
|
||||
template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return (src <= thresh) * src;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
};
|
||||
|
||||
// Function Object Adaptors
|
||||
|
||||
template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
|
||||
{
|
||||
return !pred(x);
|
||||
}
|
||||
|
||||
const Predicate pred;
|
||||
};
|
||||
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
|
||||
{
|
||||
return unary_negate<Predicate>(pred);
|
||||
}
|
||||
|
||||
template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
|
||||
{
|
||||
return !pred(x,y);
|
||||
}
|
||||
|
||||
const Predicate pred;
|
||||
};
|
||||
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
|
||||
{
|
||||
return binary_negate<BinaryPredicate>(pred);
|
||||
}
|
||||
|
||||
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
|
||||
{
|
||||
__host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
|
||||
|
||||
__device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const
|
||||
{
|
||||
return op(arg1, a);
|
||||
}
|
||||
|
||||
const Op op;
|
||||
const typename Op::first_argument_type arg1;
|
||||
};
|
||||
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
|
||||
{
|
||||
return binder1st<Op>(op, typename Op::first_argument_type(x));
|
||||
}
|
||||
|
||||
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
|
||||
{
|
||||
__host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
|
||||
|
||||
__forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const
|
||||
{
|
||||
return op(a, arg2);
|
||||
}
|
||||
|
||||
const Op op;
|
||||
const typename Op::second_argument_type arg2;
|
||||
};
|
||||
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
|
||||
{
|
||||
return binder2nd<Op>(op, typename Op::second_argument_type(x));
|
||||
}
|
||||
|
||||
// Functor Traits
|
||||
|
||||
template <typename F> struct IsUnaryFunction
|
||||
{
|
||||
typedef char Yes;
|
||||
struct No {Yes a[2];};
|
||||
|
||||
template <typename T, typename D> static Yes check(unary_function<T, D>);
|
||||
static No check(...);
|
||||
|
||||
static F makeF();
|
||||
|
||||
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
|
||||
};
|
||||
|
||||
template <typename F> struct IsBinaryFunction
|
||||
{
|
||||
typedef char Yes;
|
||||
struct No {Yes a[2];};
|
||||
|
||||
template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);
|
||||
static No check(...);
|
||||
|
||||
static F makeF();
|
||||
|
||||
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
|
||||
};
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
|
||||
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
|
||||
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };
|
||||
|
||||
template <typename T, typename D> struct DefaultUnaryShift
|
||||
{
|
||||
enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };
|
||||
};
|
||||
|
||||
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
|
||||
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
|
||||
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
|
||||
|
||||
template <typename T1, typename T2, typename D> struct DefaultBinaryShift
|
||||
{
|
||||
enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
|
||||
{
|
||||
return src1 * src1 + src2 * src2;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
|
||||
template <typename Func> struct ShiftDispatcher<Func, true>
|
||||
// Saturate Cast Functor
|
||||
|
||||
template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
|
||||
{
|
||||
enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
|
||||
__device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const
|
||||
{
|
||||
return saturate_cast<D>(v);
|
||||
}
|
||||
};
|
||||
template <typename Func> struct ShiftDispatcher<Func, false>
|
||||
|
||||
// Threshold Functors
|
||||
|
||||
template <typename T> struct thresh_binary_func : unary_function<T, T>
|
||||
{
|
||||
enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
|
||||
__host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return (src > thresh) * maxVal;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
const T maxVal;
|
||||
};
|
||||
}
|
||||
template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
|
||||
{
|
||||
__host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
|
||||
template <typename Func> struct DefaultTransformShift
|
||||
{
|
||||
enum { shift = detail::ShiftDispatcher<Func>::shift };
|
||||
};
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return (src <= thresh) * maxVal;
|
||||
}
|
||||
|
||||
template <typename Func> struct DefaultTransformFunctorTraits
|
||||
{
|
||||
enum { simple_block_dim_x = 16 };
|
||||
enum { simple_block_dim_y = 16 };
|
||||
const T thresh;
|
||||
const T maxVal;
|
||||
};
|
||||
template <typename T> struct thresh_trunc_func : unary_function<T, T>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
enum { smart_block_dim_x = 16 };
|
||||
enum { smart_block_dim_y = 16 };
|
||||
enum { smart_shift = DefaultTransformShift<Func>::shift };
|
||||
};
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return minimum<T>()(src, thresh);
|
||||
}
|
||||
|
||||
template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};
|
||||
const T thresh;
|
||||
};
|
||||
template <typename T> struct thresh_to_zero_func : unary_function<T, T>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
#define DEFINE_TRANSFORM_FUNCTOR_TRAITS(type) \
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return (src > thresh) * src;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
};
|
||||
template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return (src <= thresh) * src;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
};
|
||||
|
||||
// Function Object Adaptors
|
||||
|
||||
template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
|
||||
{
|
||||
return !pred(x);
|
||||
}
|
||||
|
||||
const Predicate pred;
|
||||
};
|
||||
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
|
||||
{
|
||||
return unary_negate<Predicate>(pred);
|
||||
}
|
||||
|
||||
template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
|
||||
{
|
||||
return !pred(x,y);
|
||||
}
|
||||
|
||||
const Predicate pred;
|
||||
};
|
||||
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
|
||||
{
|
||||
return binary_negate<BinaryPredicate>(pred);
|
||||
}
|
||||
|
||||
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
|
||||
{
|
||||
__host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
|
||||
|
||||
__device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const
|
||||
{
|
||||
return op(arg1, a);
|
||||
}
|
||||
|
||||
const Op op;
|
||||
const typename Op::first_argument_type arg1;
|
||||
};
|
||||
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
|
||||
{
|
||||
return binder1st<Op>(op, typename Op::first_argument_type(x));
|
||||
}
|
||||
|
||||
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
|
||||
{
|
||||
__host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
|
||||
|
||||
__forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const
|
||||
{
|
||||
return op(a, arg2);
|
||||
}
|
||||
|
||||
const Op op;
|
||||
const typename Op::second_argument_type arg2;
|
||||
};
|
||||
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
|
||||
{
|
||||
return binder2nd<Op>(op, typename Op::second_argument_type(x));
|
||||
}
|
||||
|
||||
// Functor Traits
|
||||
|
||||
template <typename F> struct IsUnaryFunction
|
||||
{
|
||||
typedef char Yes;
|
||||
struct No {Yes a[2];};
|
||||
|
||||
template <typename T, typename D> static Yes check(unary_function<T, D>);
|
||||
static No check(...);
|
||||
|
||||
static F makeF();
|
||||
|
||||
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
|
||||
};
|
||||
|
||||
template <typename F> struct IsBinaryFunction
|
||||
{
|
||||
typedef char Yes;
|
||||
struct No {Yes a[2];};
|
||||
|
||||
template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);
|
||||
static No check(...);
|
||||
|
||||
static F makeF();
|
||||
|
||||
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
|
||||
};
|
||||
|
||||
namespace functional_detail
|
||||
{
|
||||
template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
|
||||
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
|
||||
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };
|
||||
|
||||
template <typename T, typename D> struct DefaultUnaryShift
|
||||
{
|
||||
enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
|
||||
};
|
||||
|
||||
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
|
||||
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
|
||||
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
|
||||
|
||||
template <typename T1, typename T2, typename D> struct DefaultBinaryShift
|
||||
{
|
||||
enum { shift = BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
|
||||
};
|
||||
|
||||
template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
|
||||
template <typename Func> struct ShiftDispatcher<Func, true>
|
||||
{
|
||||
enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
|
||||
};
|
||||
template <typename Func> struct ShiftDispatcher<Func, false>
|
||||
{
|
||||
enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
|
||||
};
|
||||
}
|
||||
|
||||
template <typename Func> struct DefaultTransformShift
|
||||
{
|
||||
enum { shift = functional_detail::ShiftDispatcher<Func>::shift };
|
||||
};
|
||||
|
||||
template <typename Func> struct DefaultTransformFunctorTraits
|
||||
{
|
||||
enum { simple_block_dim_x = 16 };
|
||||
enum { simple_block_dim_y = 16 };
|
||||
|
||||
enum { smart_block_dim_x = 16 };
|
||||
enum { smart_block_dim_y = 16 };
|
||||
enum { smart_shift = DefaultTransformShift<Func>::shift };
|
||||
};
|
||||
|
||||
template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};
|
||||
|
||||
#define OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(type) \
|
||||
template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_FUNCTIONAL_HPP__
|
||||
|
@ -43,193 +43,193 @@
|
||||
#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
|
||||
#define __OPENCV_GPU_LIMITS_GPU_HPP__
|
||||
|
||||
#include <limits>
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template<class T> struct numeric_limits
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef T type;
|
||||
__device__ __forceinline__ static type min() { return type(); };
|
||||
__device__ __forceinline__ static type max() { return type(); };
|
||||
__device__ __forceinline__ static type epsilon() { return type(); }
|
||||
__device__ __forceinline__ static type round_error() { return type(); }
|
||||
__device__ __forceinline__ static type denorm_min() { return type(); }
|
||||
__device__ __forceinline__ static type infinity() { return type(); }
|
||||
__device__ __forceinline__ static type quiet_NaN() { return type(); }
|
||||
__device__ __forceinline__ static type signaling_NaN() { return T(); }
|
||||
static const bool is_signed;
|
||||
};
|
||||
template<class T> struct numeric_limits
|
||||
{
|
||||
typedef T type;
|
||||
__device__ __forceinline__ static type min() { return type(); };
|
||||
__device__ __forceinline__ static type max() { return type(); };
|
||||
__device__ __forceinline__ static type epsilon() { return type(); }
|
||||
__device__ __forceinline__ static type round_error() { return type(); }
|
||||
__device__ __forceinline__ static type denorm_min() { return type(); }
|
||||
__device__ __forceinline__ static type infinity() { return type(); }
|
||||
__device__ __forceinline__ static type quiet_NaN() { return type(); }
|
||||
__device__ __forceinline__ static type signaling_NaN() { return T(); }
|
||||
static const bool is_signed;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<bool>
|
||||
{
|
||||
typedef bool type;
|
||||
__device__ __forceinline__ static type min() { return false; };
|
||||
__device__ __forceinline__ static type max() { return true; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
template<> struct numeric_limits<bool>
|
||||
{
|
||||
typedef bool type;
|
||||
__device__ __forceinline__ static type min() { return false; };
|
||||
__device__ __forceinline__ static type max() { return true; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<char>
|
||||
{
|
||||
typedef char type;
|
||||
__device__ __forceinline__ static type min() { return CHAR_MIN; };
|
||||
__device__ __forceinline__ static type max() { return CHAR_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = (char)-1 == -1;
|
||||
};
|
||||
template<> struct numeric_limits<char>
|
||||
{
|
||||
typedef char type;
|
||||
__device__ __forceinline__ static type min() { return CHAR_MIN; };
|
||||
__device__ __forceinline__ static type max() { return CHAR_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = (char)-1 == -1;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<signed char>
|
||||
{
|
||||
typedef char type;
|
||||
__device__ __forceinline__ static type min() { return CHAR_MIN; };
|
||||
__device__ __forceinline__ static type max() { return CHAR_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = (signed char)-1 == -1;
|
||||
};
|
||||
template<> struct numeric_limits<signed char>
|
||||
{
|
||||
typedef char type;
|
||||
__device__ __forceinline__ static type min() { return SCHAR_MIN; };
|
||||
__device__ __forceinline__ static type max() { return SCHAR_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = (signed char)-1 == -1;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<unsigned char>
|
||||
{
|
||||
typedef unsigned char type;
|
||||
__device__ __forceinline__ static type min() { return 0; };
|
||||
__device__ __forceinline__ static type max() { return UCHAR_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
template<> struct numeric_limits<unsigned char>
|
||||
{
|
||||
typedef unsigned char type;
|
||||
__device__ __forceinline__ static type min() { return 0; };
|
||||
__device__ __forceinline__ static type max() { return UCHAR_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<short>
|
||||
{
|
||||
typedef short type;
|
||||
__device__ __forceinline__ static type min() { return SHRT_MIN; };
|
||||
__device__ __forceinline__ static type max() { return SHRT_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
template<> struct numeric_limits<short>
|
||||
{
|
||||
typedef short type;
|
||||
__device__ __forceinline__ static type min() { return SHRT_MIN; };
|
||||
__device__ __forceinline__ static type max() { return SHRT_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<unsigned short>
|
||||
{
|
||||
typedef unsigned short type;
|
||||
__device__ __forceinline__ static type min() { return 0; };
|
||||
__device__ __forceinline__ static type max() { return USHRT_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
template<> struct numeric_limits<unsigned short>
|
||||
{
|
||||
typedef unsigned short type;
|
||||
__device__ __forceinline__ static type min() { return 0; };
|
||||
__device__ __forceinline__ static type max() { return USHRT_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<int>
|
||||
{
|
||||
typedef int type;
|
||||
__device__ __forceinline__ static type min() { return INT_MIN; };
|
||||
__device__ __forceinline__ static type max() { return INT_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
template<> struct numeric_limits<int>
|
||||
{
|
||||
typedef int type;
|
||||
__device__ __forceinline__ static type min() { return INT_MIN; };
|
||||
__device__ __forceinline__ static type max() { return INT_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
|
||||
|
||||
template<> struct numeric_limits<unsigned int>
|
||||
{
|
||||
typedef unsigned int type;
|
||||
__device__ __forceinline__ static type min() { return 0; };
|
||||
__device__ __forceinline__ static type max() { return UINT_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
template<> struct numeric_limits<unsigned int>
|
||||
{
|
||||
typedef unsigned int type;
|
||||
__device__ __forceinline__ static type min() { return 0; };
|
||||
__device__ __forceinline__ static type max() { return UINT_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<long>
|
||||
{
|
||||
typedef long type;
|
||||
__device__ __forceinline__ static type min() { return LONG_MIN; };
|
||||
__device__ __forceinline__ static type max() { return LONG_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
template<> struct numeric_limits<long>
|
||||
{
|
||||
typedef long type;
|
||||
__device__ __forceinline__ static type min() { return LONG_MIN; };
|
||||
__device__ __forceinline__ static type max() { return LONG_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<unsigned long>
|
||||
{
|
||||
typedef unsigned long type;
|
||||
__device__ __forceinline__ static type min() { return 0; };
|
||||
__device__ __forceinline__ static type max() { return ULONG_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
template<> struct numeric_limits<unsigned long>
|
||||
{
|
||||
typedef unsigned long type;
|
||||
__device__ __forceinline__ static type min() { return 0; };
|
||||
__device__ __forceinline__ static type max() { return ULONG_MAX; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = false;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<float>
|
||||
{
|
||||
typedef float type;
|
||||
__device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
|
||||
__device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
|
||||
__device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
template<> struct numeric_limits<float>
|
||||
{
|
||||
typedef float type;
|
||||
__device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
|
||||
__device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
|
||||
__device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
|
||||
template<> struct numeric_limits<double>
|
||||
{
|
||||
typedef double type;
|
||||
__device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
|
||||
__device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template<> struct numeric_limits<double>
|
||||
{
|
||||
typedef double type;
|
||||
__device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
|
||||
__device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
|
||||
__device__ __forceinline__ static type epsilon();
|
||||
__device__ __forceinline__ static type round_error();
|
||||
__device__ __forceinline__ static type denorm_min();
|
||||
__device__ __forceinline__ static type infinity();
|
||||
__device__ __forceinline__ static type quiet_NaN();
|
||||
__device__ __forceinline__ static type signaling_NaN();
|
||||
static const bool is_signed = true;
|
||||
};
|
||||
}}} // namespace cv { namespace gpu { namespace device {
|
||||
|
||||
#endif // __OPENCV_GPU_LIMITS_GPU_HPP__
|
||||
|
@ -45,173 +45,172 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
|
||||
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
|
||||
{
|
||||
return (uchar) ::max((int)v, 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
|
||||
{
|
||||
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
|
||||
{
|
||||
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
|
||||
{
|
||||
return (uchar) ::min(v, (uint)UCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
|
||||
{
|
||||
return saturate_cast<uchar>((uint)v);
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<uchar>(iv);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<uchar>(iv);
|
||||
#else
|
||||
return saturate_cast<uchar>((float)v);
|
||||
#endif
|
||||
}
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
|
||||
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
|
||||
{
|
||||
return (schar) ::min((int)v, SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
|
||||
{
|
||||
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
|
||||
{
|
||||
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
|
||||
{
|
||||
return saturate_cast<schar>((int)v);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
|
||||
{
|
||||
return (schar) ::min(v, (uint)SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
|
||||
{
|
||||
return (uchar) ::max((int)v, 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
|
||||
{
|
||||
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
|
||||
{
|
||||
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
|
||||
{
|
||||
return (uchar) ::min(v, (uint)UCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
|
||||
{
|
||||
return saturate_cast<uchar>((uint)v);
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<schar>(iv);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<schar>(iv);
|
||||
#else
|
||||
return saturate_cast<schar>((float)v);
|
||||
#endif
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<uchar>(iv);
|
||||
}
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<uchar>(iv);
|
||||
#else
|
||||
return saturate_cast<uchar>((float)v);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
|
||||
{
|
||||
return (ushort) ::max((int)v, 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
|
||||
{
|
||||
return (ushort) ::max((int)v, 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
|
||||
{
|
||||
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
|
||||
{
|
||||
return (ushort) ::min(v, (uint)USHRT_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<ushort>(iv);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<ushort>(iv);
|
||||
#else
|
||||
return saturate_cast<ushort>((float)v);
|
||||
#endif
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
|
||||
{
|
||||
return (schar) ::min((int)v, SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
|
||||
{
|
||||
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
|
||||
{
|
||||
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
|
||||
{
|
||||
return saturate_cast<schar>((int)v);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
|
||||
{
|
||||
return (schar) ::min(v, (uint)SCHAR_MAX);
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
|
||||
{
|
||||
return (short) ::min((int)v, SHRT_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
|
||||
{
|
||||
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
|
||||
{
|
||||
return (short) ::min(v, (uint)SHRT_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<short>(iv);
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<short>(iv);
|
||||
#else
|
||||
return saturate_cast<short>((float)v);
|
||||
#endif
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<schar>(iv);
|
||||
}
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<schar>(iv);
|
||||
#else
|
||||
return saturate_cast<schar>((float)v);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
|
||||
{
|
||||
return __float2int_rn(v);
|
||||
}
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return __double2int_rn(v);
|
||||
#else
|
||||
return saturate_cast<int>((float)v);
|
||||
#endif
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
|
||||
{
|
||||
return (ushort) ::max((int)v, 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
|
||||
{
|
||||
return (ushort) ::max((int)v, 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
|
||||
{
|
||||
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
|
||||
{
|
||||
return (ushort) ::min(v, (uint)USHRT_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<ushort>(iv);
|
||||
}
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<ushort>(iv);
|
||||
#else
|
||||
return saturate_cast<ushort>((float)v);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
|
||||
{
|
||||
return __float2uint_rn(v);
|
||||
}
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return __double2uint_rn(v);
|
||||
#else
|
||||
return saturate_cast<uint>((float)v);
|
||||
#endif
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
|
||||
{
|
||||
return (short) ::min((int)v, SHRT_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
|
||||
{
|
||||
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
|
||||
{
|
||||
return (short) ::min(v, (uint)SHRT_MAX);
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
|
||||
{
|
||||
int iv = __float2int_rn(v);
|
||||
return saturate_cast<short>(iv);
|
||||
}
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v);
|
||||
return saturate_cast<short>(iv);
|
||||
#else
|
||||
return saturate_cast<short>((float)v);
|
||||
#endif
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
|
||||
{
|
||||
return __float2int_rn(v);
|
||||
}
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return __double2int_rn(v);
|
||||
#else
|
||||
return saturate_cast<int>((float)v);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
|
||||
{
|
||||
return __float2uint_rn(v);
|
||||
}
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return __double2uint_rn(v);
|
||||
#else
|
||||
return saturate_cast<uint>((float)v);
|
||||
#endif
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
|
@ -49,24 +49,21 @@
|
||||
#define __OPENCV_GPU_HOST_DEVICE__
|
||||
#endif
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace gpu
|
||||
namespace cv { namespace gpu
|
||||
{
|
||||
namespace device
|
||||
{
|
||||
namespace device
|
||||
{
|
||||
template<bool expr> struct Static {};
|
||||
|
||||
template<> struct Static<true>
|
||||
{
|
||||
__OPENCV_GPU_HOST_DEVICE__ static void check() {};
|
||||
};
|
||||
}
|
||||
template<bool expr> struct Static {};
|
||||
|
||||
template<> struct Static<true>
|
||||
{
|
||||
__OPENCV_GPU_HOST_DEVICE__ static void check() {};
|
||||
};
|
||||
}
|
||||
|
||||
using cv::gpu::device::Static;
|
||||
}
|
||||
}
|
||||
using ::cv::gpu::device::Static;
|
||||
}}
|
||||
|
||||
#undef __PCL_GPU_HOST_DEVICE__
|
||||
#undef __OPENCV_GPU_HOST_DEVICE__
|
||||
|
||||
#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
|
@ -47,30 +47,31 @@
|
||||
#include "utility.hpp"
|
||||
#include "detail/transform_detail.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, typename D, typename UnOp>
|
||||
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
transform_detail::transform_caller(src, dst, op, WithOutMask(), stream);
|
||||
}
|
||||
|
||||
template <typename T, typename D, typename UnOp>
|
||||
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
detail::transform_caller(src, dst, op, WithOutMask(), stream);
|
||||
}
|
||||
template <typename T, typename D, typename UnOp>
|
||||
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
detail::transform_caller(src, dst, op, SingleMask(mask), stream);
|
||||
}
|
||||
template <typename T, typename D, typename UnOp>
|
||||
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
transform_detail::transform_caller(src, dst, op, SingleMask(mask), stream);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp>
|
||||
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);
|
||||
}
|
||||
template <typename T1, typename T2, typename D, typename BinOp>
|
||||
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);
|
||||
}
|
||||
template <typename T1, typename T2, typename D, typename BinOp>
|
||||
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
transform_detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template <typename T1, typename T2, typename D, typename BinOp>
|
||||
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
transform_detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_TRANSFORM_HPP__
|
||||
|
@ -46,37 +46,38 @@
|
||||
#include "internal_shared.hpp"
|
||||
#include "detail/type_traits_detail.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <typename T> struct IsSimpleParameter
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
enum {value = detail::IsIntegral<T>::value || detail::IsFloat<T>::value || detail::PointerTraits<typename detail::ReferenceTraits<T>::type>::value};
|
||||
};
|
||||
template <typename T> struct IsSimpleParameter
|
||||
{
|
||||
enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
|
||||
type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
|
||||
};
|
||||
|
||||
template <typename T> struct TypeTraits
|
||||
{
|
||||
typedef typename detail::UnConst<T>::type NonConstType;
|
||||
typedef typename detail::UnVolatile<T>::type NonVolatileType;
|
||||
typedef typename detail::UnVolatile<typename detail::UnConst<T>::type>::type UnqualifiedType;
|
||||
typedef typename detail::PointerTraits<UnqualifiedType>::type PointeeType;
|
||||
typedef typename detail::ReferenceTraits<T>::type ReferredType;
|
||||
template <typename T> struct TypeTraits
|
||||
{
|
||||
typedef typename type_traits_detail::UnConst<T>::type NonConstType;
|
||||
typedef typename type_traits_detail::UnVolatile<T>::type NonVolatileType;
|
||||
typedef typename type_traits_detail::UnVolatile<typename type_traits_detail::UnConst<T>::type>::type UnqualifiedType;
|
||||
typedef typename type_traits_detail::PointerTraits<UnqualifiedType>::type PointeeType;
|
||||
typedef typename type_traits_detail::ReferenceTraits<T>::type ReferredType;
|
||||
|
||||
enum { isConst = detail::UnConst<T>::value };
|
||||
enum { isVolatile = detail::UnVolatile<T>::value };
|
||||
enum { isConst = type_traits_detail::UnConst<T>::value };
|
||||
enum { isVolatile = type_traits_detail::UnVolatile<T>::value };
|
||||
|
||||
enum { isReference = detail::ReferenceTraits<UnqualifiedType>::value };
|
||||
enum { isPointer = detail::PointerTraits<typename detail::ReferenceTraits<UnqualifiedType>::type>::value };
|
||||
enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
|
||||
enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
|
||||
|
||||
enum { isUnsignedInt = detail::IsUnsignedIntegral<UnqualifiedType>::value };
|
||||
enum { isSignedInt = detail::IsSignedIntergral<UnqualifiedType>::value };
|
||||
enum { isIntegral = detail::IsIntegral<UnqualifiedType>::value };
|
||||
enum { isFloat = detail::IsFloat<UnqualifiedType>::value };
|
||||
enum { isArith = isIntegral || isFloat };
|
||||
enum { isVec = detail::IsVec<UnqualifiedType>::value };
|
||||
|
||||
typedef typename detail::Select<IsSimpleParameter<UnqualifiedType>::value, T, typename detail::AddParameterType<T>::type>::type ParameterType;
|
||||
};
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
|
||||
enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
|
||||
enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
|
||||
enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
|
||||
enum { isArith = isIntegral || isFloat };
|
||||
enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };
|
||||
|
||||
typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
|
||||
T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
|
||||
};
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__
|
||||
|
@ -48,168 +48,167 @@
|
||||
#include "datamov_utils.hpp"
|
||||
#include "detail/utility_detail.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
#define OPENCV_GPU_LOG_WARP_SIZE (5)
|
||||
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
|
||||
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
|
||||
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// swap
|
||||
|
||||
template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
const T temp = a;
|
||||
a = b;
|
||||
b = temp;
|
||||
}
|
||||
#define OPENCV_GPU_LOG_WARP_SIZE (5)
|
||||
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
|
||||
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
|
||||
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Mask Reader
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// swap
|
||||
|
||||
struct SingleMask
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ SingleMask(const PtrStepb& mask_) : mask(mask_) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const
|
||||
{
|
||||
return mask.ptr(y)[x] != 0;
|
||||
template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
|
||||
{
|
||||
const T temp = a;
|
||||
a = b;
|
||||
b = temp;
|
||||
}
|
||||
|
||||
const PtrStepb mask;
|
||||
};
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Mask Reader
|
||||
|
||||
struct MaskCollection
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_) : maskCollection(maskCollection_) {}
|
||||
|
||||
__device__ __forceinline__ void next()
|
||||
struct SingleMask
|
||||
{
|
||||
curMask = *maskCollection++;
|
||||
explicit __host__ __device__ __forceinline__ SingleMask(const PtrStepb& mask_) : mask(mask_) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const
|
||||
{
|
||||
return mask.ptr(y)[x] != 0;
|
||||
}
|
||||
|
||||
const PtrStepb mask;
|
||||
};
|
||||
|
||||
struct MaskCollection
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_) : maskCollection(maskCollection_) {}
|
||||
|
||||
__device__ __forceinline__ void next()
|
||||
{
|
||||
curMask = *maskCollection++;
|
||||
}
|
||||
__device__ __forceinline__ void setMask(int z)
|
||||
{
|
||||
curMask = maskCollection[z];
|
||||
}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const
|
||||
{
|
||||
uchar val;
|
||||
return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));
|
||||
}
|
||||
|
||||
const PtrStepb* maskCollection;
|
||||
PtrStepb curMask;
|
||||
};
|
||||
|
||||
struct WithOutMask
|
||||
{
|
||||
__device__ __forceinline__ void next() const
|
||||
{
|
||||
}
|
||||
__device__ __forceinline__ void setMask(int) const
|
||||
{
|
||||
}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int, int) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int, int, int) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ bool check(int, int)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ bool check(int, int, int)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reduction
|
||||
|
||||
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
|
||||
}
|
||||
__device__ __forceinline__ void setMask(int z)
|
||||
|
||||
template <int n, typename T, typename V, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
|
||||
{
|
||||
curMask = maskCollection[z];
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
|
||||
}
|
||||
|
||||
template <int n, typename T, typename V1, typename V2, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Solve linear system
|
||||
|
||||
// solve 2x2 linear system Ax=b
|
||||
template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
|
||||
{
|
||||
uchar val;
|
||||
return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));
|
||||
T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];
|
||||
|
||||
if (det != 0)
|
||||
{
|
||||
double invdet = 1.0 / det;
|
||||
|
||||
x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));
|
||||
|
||||
x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
const PtrStepb* maskCollection;
|
||||
PtrStepb curMask;
|
||||
};
|
||||
|
||||
struct WithOutMask
|
||||
{
|
||||
__device__ __forceinline__ void next() const
|
||||
// solve 3x3 linear system Ax=b
|
||||
template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
|
||||
{
|
||||
T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
|
||||
- A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
|
||||
+ A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
|
||||
|
||||
if (det != 0)
|
||||
{
|
||||
double invdet = 1.0 / det;
|
||||
|
||||
x[0] = saturate_cast<T>(invdet *
|
||||
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
|
||||
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
|
||||
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));
|
||||
|
||||
x[1] = saturate_cast<T>(invdet *
|
||||
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
|
||||
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
|
||||
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));
|
||||
|
||||
x[2] = saturate_cast<T>(invdet *
|
||||
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
|
||||
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
|
||||
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
__device__ __forceinline__ void setMask(int) const
|
||||
{
|
||||
}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int, int) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int, int, int) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ bool check(int, int)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ bool check(int, int, int)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reduction
|
||||
|
||||
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
|
||||
}
|
||||
|
||||
template <int n, typename T, typename V, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
|
||||
}
|
||||
|
||||
template <int n, typename T, typename V1, typename V2, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
|
||||
{
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Solve linear system
|
||||
|
||||
// solve 2x2 linear system Ax=b
|
||||
template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
|
||||
{
|
||||
T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];
|
||||
|
||||
if (det != 0)
|
||||
{
|
||||
double invdet = 1.0 / det;
|
||||
|
||||
x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));
|
||||
|
||||
x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// solve 3x3 linear system Ax=b
|
||||
template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
|
||||
{
|
||||
T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
|
||||
- A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
|
||||
+ A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
|
||||
|
||||
if (det != 0)
|
||||
{
|
||||
double invdet = 1.0 / det;
|
||||
|
||||
x[0] = saturate_cast<T>(invdet *
|
||||
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
|
||||
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
|
||||
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));
|
||||
|
||||
x[1] = saturate_cast<T>(invdet *
|
||||
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
|
||||
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
|
||||
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));
|
||||
|
||||
x[2] = saturate_cast<T>(invdet *
|
||||
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
|
||||
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
|
||||
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_UTILITY_HPP__
|
||||
|
@ -48,179 +48,178 @@
|
||||
#include "functional.hpp"
|
||||
#include "detail/vec_distance_detail.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <typename T> struct L1Dist
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
|
||||
__device__ __forceinline__ L1Dist() : mySum(0) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(int val1, int val2)
|
||||
template <typename T> struct L1Dist
|
||||
{
|
||||
mySum = __sad(val1, val2, mySum);
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
|
||||
__device__ __forceinline__ L1Dist() : mySum(0) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(int val1, int val2)
|
||||
{
|
||||
mySum = __sad(val1, val2, mySum);
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
|
||||
int mySum;
|
||||
};
|
||||
template <> struct L1Dist<float>
|
||||
{
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
|
||||
__device__ __forceinline__ L1Dist() : mySum(0.0f) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(float val1, float val2)
|
||||
{
|
||||
mySum += ::fabs(val1 - val2);
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
|
||||
float mySum;
|
||||
};
|
||||
|
||||
struct L2Dist
|
||||
{
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
|
||||
__device__ __forceinline__ L2Dist() : mySum(0.0f) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(float val1, float val2)
|
||||
{
|
||||
float reg = val1 - val2;
|
||||
mySum += reg * reg;
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
{
|
||||
return sqrtf(mySum);
|
||||
}
|
||||
|
||||
float mySum;
|
||||
};
|
||||
|
||||
struct HammingDist
|
||||
{
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
|
||||
__device__ __forceinline__ HammingDist() : mySum(0) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(int val1, int val2)
|
||||
{
|
||||
mySum += __popc(val1 ^ val2);
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
|
||||
int mySum;
|
||||
};
|
||||
|
||||
// calc distance between two vectors in global memory
|
||||
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
|
||||
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
|
||||
{
|
||||
for (int i = tid; i < len; i += THREAD_DIM)
|
||||
{
|
||||
T1 val1;
|
||||
ForceGlob<T1>::Load(vec1, i, val1);
|
||||
|
||||
T2 val2;
|
||||
ForceGlob<T2>::Load(vec2, i, val2);
|
||||
|
||||
dist.reduceIter(val1, val2);
|
||||
}
|
||||
|
||||
dist.reduceAll<THREAD_DIM>(smem, tid);
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
|
||||
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
|
||||
{
|
||||
vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
|
||||
|
||||
dist.reduceAll<THREAD_DIM>(smem, tid);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
// calc distance between two vectors in global memory
|
||||
template <int THREAD_DIM, typename T1> struct VecDiffGlobal
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
|
||||
{
|
||||
vec1 = vec1_;
|
||||
}
|
||||
|
||||
int mySum;
|
||||
};
|
||||
template <> struct L1Dist<float>
|
||||
{
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
template <typename T2, typename Dist>
|
||||
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
|
||||
{
|
||||
calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ L1Dist() : mySum(0.0f) {}
|
||||
const T1* vec1;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void reduceIter(float val1, float val2)
|
||||
// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
|
||||
{
|
||||
mySum += ::fabs(val1 - val2);
|
||||
}
|
||||
template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
|
||||
{
|
||||
if (glob_tid < len)
|
||||
smem[glob_tid] = vec1[glob_tid];
|
||||
__syncthreads();
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
}
|
||||
U* vec1ValsPtr = vec1Vals;
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
|
||||
*vec1ValsPtr++ = smem[i];
|
||||
|
||||
float mySum;
|
||||
};
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
struct L2Dist
|
||||
{
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
template <typename T2, typename Dist>
|
||||
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
|
||||
{
|
||||
calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ L2Dist() : mySum(0.0f) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(float val1, float val2)
|
||||
{
|
||||
float reg = val1 - val2;
|
||||
mySum += reg * reg;
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
{
|
||||
return sqrtf(mySum);
|
||||
}
|
||||
|
||||
float mySum;
|
||||
};
|
||||
|
||||
struct HammingDist
|
||||
{
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
|
||||
__device__ __forceinline__ HammingDist() : mySum(0) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(int val1, int val2)
|
||||
{
|
||||
mySum += __popc(val1 ^ val2);
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
|
||||
int mySum;
|
||||
};
|
||||
|
||||
// calc distance between two vectors in global memory
|
||||
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
|
||||
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
|
||||
{
|
||||
for (int i = tid; i < len; i += THREAD_DIM)
|
||||
{
|
||||
T1 val1;
|
||||
ForceGlob<T1>::Load(vec1, i, val1);
|
||||
|
||||
T2 val2;
|
||||
ForceGlob<T2>::Load(vec2, i, val2);
|
||||
|
||||
dist.reduceIter(val1, val2);
|
||||
}
|
||||
|
||||
dist.reduceAll<THREAD_DIM>(smem, tid);
|
||||
}
|
||||
|
||||
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
|
||||
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
|
||||
{
|
||||
detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
|
||||
|
||||
dist.reduceAll<THREAD_DIM>(smem, tid);
|
||||
}
|
||||
|
||||
// calc distance between two vectors in global memory
|
||||
template <int THREAD_DIM, typename T1> struct VecDiffGlobal
|
||||
{
|
||||
explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
|
||||
{
|
||||
vec1 = vec1_;
|
||||
}
|
||||
|
||||
template <typename T2, typename Dist>
|
||||
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
|
||||
{
|
||||
calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
|
||||
}
|
||||
|
||||
const T1* vec1;
|
||||
};
|
||||
|
||||
// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
|
||||
{
|
||||
template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
|
||||
{
|
||||
if (glob_tid < len)
|
||||
smem[glob_tid] = vec1[glob_tid];
|
||||
__syncthreads();
|
||||
|
||||
U* vec1ValsPtr = vec1Vals;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
|
||||
*vec1ValsPtr++ = smem[i];
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T2, typename Dist>
|
||||
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
|
||||
{
|
||||
calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
|
||||
}
|
||||
|
||||
U vec1Vals[MAX_LEN / THREAD_DIM];
|
||||
};
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
U vec1Vals[MAX_LEN / THREAD_DIM];
|
||||
};
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_VEC_DISTANCE_HPP__
|
||||
|
@ -48,85 +48,85 @@
|
||||
#include "vec_traits.hpp"
|
||||
#include "functional.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace detail
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <int cn, typename VecD> struct SatCastHelper;
|
||||
template <typename VecD> struct SatCastHelper<1, VecD>
|
||||
namespace vec_math_detail
|
||||
{
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
template <int cn, typename VecD> struct SatCastHelper;
|
||||
template <typename VecD> struct SatCastHelper<1, VecD>
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x));
|
||||
}
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<2, VecD>
|
||||
{
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x));
|
||||
}
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<2, VecD>
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
|
||||
}
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<3, VecD>
|
||||
{
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
|
||||
}
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<3, VecD>
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
|
||||
}
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<4, VecD>
|
||||
{
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
|
||||
}
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<4, VecD>
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
|
||||
}
|
||||
};
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
|
||||
{
|
||||
return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
|
||||
template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
|
||||
{
|
||||
return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
|
||||
@ -150,49 +150,49 @@ template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const
|
||||
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
|
||||
}
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <typename T1, typename T2> struct BinOpTraits
|
||||
{
|
||||
typedef int argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, T>
|
||||
{
|
||||
typedef T argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<double, T>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<double, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, float>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<float, T>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<float, float>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<double, float>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<float, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
}
|
||||
namespace vec_math_detail
|
||||
{
|
||||
template <typename T1, typename T2> struct BinOpTraits
|
||||
{
|
||||
typedef int argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, T>
|
||||
{
|
||||
typedef T argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<double, T>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<double, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, float>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<float, T>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<float, float>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<double, float>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<float, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
|
||||
@ -201,16 +201,16 @@ namespace detail
|
||||
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
|
||||
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
|
||||
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
|
||||
} \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
|
||||
{ \
|
||||
@ -218,16 +218,16 @@ namespace detail
|
||||
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
|
||||
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
|
||||
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
|
||||
} \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
|
||||
{ \
|
||||
@ -235,16 +235,16 @@ namespace detail
|
||||
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
|
||||
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
|
||||
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
|
||||
} \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
|
||||
{ \
|
||||
@ -252,16 +252,16 @@ namespace detail
|
||||
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
|
||||
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<T, type>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
|
||||
func<typename vec_math_detail::BinOpTraits<T, type>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
|
||||
@ -313,20 +313,19 @@ namespace detail
|
||||
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \
|
||||
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_OP(float)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_OP(double)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_OP(float)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_OP(double)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_VECMATH_HPP__
|
@ -45,82 +45,82 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template<typename T, int N> struct TypeVec;
|
||||
|
||||
template<typename T, int N> struct TypeVec;
|
||||
|
||||
struct __align__(8) uchar8
|
||||
{
|
||||
uchar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
|
||||
{
|
||||
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(8) char8
|
||||
{
|
||||
schar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
|
||||
{
|
||||
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) ushort8
|
||||
{
|
||||
ushort a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
|
||||
{
|
||||
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) short8
|
||||
{
|
||||
short a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
|
||||
{
|
||||
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) uint8
|
||||
{
|
||||
uint a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
|
||||
{
|
||||
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) int8
|
||||
{
|
||||
int a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
|
||||
{
|
||||
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) float8
|
||||
{
|
||||
float a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
|
||||
{
|
||||
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct double8
|
||||
{
|
||||
double a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
|
||||
{
|
||||
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(8) uchar8
|
||||
{
|
||||
uchar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
|
||||
{
|
||||
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(8) char8
|
||||
{
|
||||
schar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
|
||||
{
|
||||
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) ushort8
|
||||
{
|
||||
ushort a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
|
||||
{
|
||||
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) short8
|
||||
{
|
||||
short a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
|
||||
{
|
||||
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) uint8
|
||||
{
|
||||
uint a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
|
||||
{
|
||||
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) int8
|
||||
{
|
||||
int a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
|
||||
{
|
||||
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) float8
|
||||
{
|
||||
float a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
|
||||
{
|
||||
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct double8
|
||||
{
|
||||
double a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
|
||||
{
|
||||
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
|
||||
template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
|
||||
@ -134,28 +134,28 @@ static __host__ __device__ __forceinline__ double8 make_double8(double a0, doubl
|
||||
template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
|
||||
template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC
|
||||
#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC
|
||||
|
||||
template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
|
||||
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
|
||||
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
|
||||
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
|
||||
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
|
||||
template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
|
||||
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
|
||||
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
|
||||
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
|
||||
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
|
||||
|
||||
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
|
||||
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
|
||||
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
|
||||
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
|
||||
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
|
||||
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
|
||||
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
|
||||
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
|
||||
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
|
||||
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
|
||||
|
||||
template<typename T> struct VecTraits;
|
||||
|
||||
@ -209,73 +209,72 @@ template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
|
||||
static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
|
||||
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
|
||||
|
||||
template<> struct VecTraits<char>
|
||||
{
|
||||
typedef char elem_type;
|
||||
enum {cn=1};
|
||||
static __device__ __host__ __forceinline__ char all(char v) {return v;}
|
||||
static __device__ __host__ __forceinline__ char make(char x) {return x;}
|
||||
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
|
||||
};
|
||||
template<> struct VecTraits<schar>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=1};
|
||||
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
|
||||
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
|
||||
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
|
||||
};
|
||||
template<> struct VecTraits<char1>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=1};
|
||||
static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
|
||||
static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
|
||||
static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
|
||||
};
|
||||
template<> struct VecTraits<char2>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=2};
|
||||
static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
|
||||
static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
|
||||
static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
|
||||
};
|
||||
template<> struct VecTraits<char3>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=3};
|
||||
static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
|
||||
static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
|
||||
static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
|
||||
};
|
||||
template<> struct VecTraits<char4>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=4};
|
||||
static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
|
||||
static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
|
||||
static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
|
||||
};
|
||||
template<> struct VecTraits<char8>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=8};
|
||||
static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
|
||||
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
|
||||
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
|
||||
};
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
template<> struct VecTraits<char>
|
||||
{
|
||||
typedef char elem_type;
|
||||
enum {cn=1};
|
||||
static __device__ __host__ __forceinline__ char all(char v) {return v;}
|
||||
static __device__ __host__ __forceinline__ char make(char x) {return x;}
|
||||
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
|
||||
};
|
||||
template<> struct VecTraits<schar>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=1};
|
||||
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
|
||||
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
|
||||
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
|
||||
};
|
||||
template<> struct VecTraits<char1>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=1};
|
||||
static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
|
||||
static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
|
||||
static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
|
||||
};
|
||||
template<> struct VecTraits<char2>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=2};
|
||||
static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
|
||||
static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
|
||||
static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
|
||||
};
|
||||
template<> struct VecTraits<char3>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=3};
|
||||
static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
|
||||
static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
|
||||
static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
|
||||
};
|
||||
template<> struct VecTraits<char4>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=4};
|
||||
static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
|
||||
static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
|
||||
static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
|
||||
};
|
||||
template<> struct VecTraits<char8>
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=8};
|
||||
static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
|
||||
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
|
||||
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
|
||||
};
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif // __OPENCV_GPU_VEC_TRAITS_HPP__
|
||||
|
@ -45,71 +45,70 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
struct Warp
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
enum
|
||||
struct Warp
|
||||
{
|
||||
LOG_WARP_SIZE = 5,
|
||||
WARP_SIZE = 1 << LOG_WARP_SIZE,
|
||||
STRIDE = WARP_SIZE
|
||||
enum
|
||||
{
|
||||
LOG_WARP_SIZE = 5,
|
||||
WARP_SIZE = 1 << LOG_WARP_SIZE,
|
||||
STRIDE = WARP_SIZE
|
||||
};
|
||||
|
||||
/** \brief Returns the warp lane ID of the calling thread. */
|
||||
static __device__ __forceinline__ unsigned int laneId()
|
||||
{
|
||||
unsigned int ret;
|
||||
asm("mov.u32 %0, %laneid;" : "=r"(ret) );
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<typename It, typename T>
|
||||
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
|
||||
{
|
||||
for(It t = beg + laneId(); t < end; t += STRIDE)
|
||||
*t = value;
|
||||
}
|
||||
|
||||
template<typename InIt, typename OutIt>
|
||||
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
|
||||
{
|
||||
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
|
||||
*out = *t;
|
||||
return out;
|
||||
}
|
||||
|
||||
template<typename InIt, typename OutIt, class UnOp>
|
||||
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
|
||||
{
|
||||
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
|
||||
*out = op(*t);
|
||||
return out;
|
||||
}
|
||||
|
||||
template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
|
||||
static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
|
||||
{
|
||||
unsigned int lane = laneId();
|
||||
|
||||
InIt1 t1 = beg1 + lane;
|
||||
InIt2 t2 = beg2 + lane;
|
||||
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
|
||||
*out = op(*t1, *t2);
|
||||
return out;
|
||||
}
|
||||
|
||||
template<typename OutIt, typename T>
|
||||
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
|
||||
{
|
||||
unsigned int lane = laneId();
|
||||
value += lane;
|
||||
|
||||
for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
|
||||
*t = value;
|
||||
}
|
||||
};
|
||||
|
||||
/** \brief Returns the warp lane ID of the calling thread. */
|
||||
static __device__ __forceinline__ unsigned int laneId()
|
||||
{
|
||||
unsigned int ret;
|
||||
asm("mov.u32 %0, %laneid;" : "=r"(ret) );
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<typename It, typename T>
|
||||
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
|
||||
{
|
||||
for(It t = beg + laneId(); t < end; t += STRIDE)
|
||||
*t = value;
|
||||
}
|
||||
|
||||
template<typename InIt, typename OutIt>
|
||||
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
|
||||
{
|
||||
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
|
||||
*out = *t;
|
||||
return out;
|
||||
}
|
||||
|
||||
template<typename InIt, typename OutIt, class UnOp>
|
||||
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
|
||||
{
|
||||
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
|
||||
*out = op(*t);
|
||||
return out;
|
||||
}
|
||||
|
||||
template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
|
||||
static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
|
||||
{
|
||||
unsigned int lane = laneId();
|
||||
|
||||
InIt1 t1 = beg1 + lane;
|
||||
InIt2 t2 = beg2 + lane;
|
||||
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
|
||||
*out = op(*t1, *t2);
|
||||
return out;
|
||||
}
|
||||
|
||||
template<typename OutIt, typename T>
|
||||
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
|
||||
{
|
||||
unsigned int lane = laneId();
|
||||
value += lane;
|
||||
|
||||
for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
|
||||
*t = value;
|
||||
}
|
||||
};
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
|
@ -46,27 +46,26 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <class T>
|
||||
__device__ __forceinline__ T warp_reduce ( volatile T *ptr , const unsigned int tid = threadIdx.x )
|
||||
{
|
||||
const unsigned int lane = tid & 31; // index of thread in warp (0..31)
|
||||
|
||||
if (lane < 16)
|
||||
{
|
||||
T partial = ptr[tid];
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <class T>
|
||||
__device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
|
||||
{
|
||||
const unsigned int lane = tid & 31; // index of thread in warp (0..31)
|
||||
|
||||
if (lane < 16)
|
||||
{
|
||||
T partial = ptr[tid];
|
||||
|
||||
ptr[tid] = partial = partial + ptr[tid + 16];
|
||||
ptr[tid] = partial = partial + ptr[tid + 8];
|
||||
ptr[tid] = partial = partial + ptr[tid + 4];
|
||||
ptr[tid] = partial = partial + ptr[tid + 2];
|
||||
ptr[tid] = partial = partial + ptr[tid + 1];
|
||||
}
|
||||
ptr[tid] = partial = partial + ptr[tid + 16];
|
||||
ptr[tid] = partial = partial + ptr[tid + 8];
|
||||
ptr[tid] = partial = partial + ptr[tid + 4];
|
||||
ptr[tid] = partial = partial + ptr[tid + 2];
|
||||
ptr[tid] = partial = partial + ptr[tid + 1];
|
||||
}
|
||||
|
||||
return ptr[tid - lane];
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
return ptr[tid - lane];
|
||||
}
|
||||
}}} // namespace cv { namespace gpu { namespace device {
|
||||
|
||||
#endif /* OPENCV_GPU_WARP_REDUCE_HPP__ */
|
@ -55,21 +55,20 @@ void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, Stream& /*st
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace split_merge
|
||||
{
|
||||
void merge_caller(const DevMem2Db* src, DevMem2Db& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
|
||||
void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace split_merge
|
||||
{
|
||||
void merge_caller(const DevMem2Db* src, DevMem2Db& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
|
||||
void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;
|
||||
using namespace ::cv::gpu::device::split_merge;
|
||||
|
||||
CV_Assert(src);
|
||||
CV_Assert(n > 0);
|
||||
@ -108,7 +107,7 @@ namespace
|
||||
|
||||
void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;
|
||||
using namespace ::cv::gpu::device::split_merge;
|
||||
|
||||
CV_Assert(dst);
|
||||
|
||||
|
@ -55,16 +55,15 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&,
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace stereobm
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t & stream);
|
||||
void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
|
||||
void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
namespace stereobm
|
||||
{
|
||||
void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t & stream);
|
||||
void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
|
||||
void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
const float defaultAvgTexThreshold = 3;
|
||||
|
||||
@ -99,7 +98,7 @@ namespace
|
||||
{
|
||||
void stereo_bm_gpu_operator( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
|
||||
{
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ stereobm;
|
||||
using namespace ::cv::gpu::device::stereobm;
|
||||
|
||||
CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);
|
||||
CV_DbgAssert(left.type() == CV_8UC1);
|
||||
|
@ -59,28 +59,27 @@ void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace stereobp
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
|
||||
template<typename T, typename D>
|
||||
void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
|
||||
template<typename T>
|
||||
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
|
||||
const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
|
||||
const DevMem2D_<short>& disp, cudaStream_t stream);
|
||||
}
|
||||
namespace stereobp
|
||||
{
|
||||
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
|
||||
template<typename T, typename D>
|
||||
void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
|
||||
template<typename T>
|
||||
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
|
||||
const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
|
||||
const DevMem2D_<short>& disp, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ stereobp;
|
||||
using namespace ::cv::gpu::device::stereobp;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
@ -57,40 +57,39 @@ void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, Gp
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace stereocsbp
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
|
||||
const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp);
|
||||
namespace stereocsbp
|
||||
{
|
||||
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
|
||||
const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp);
|
||||
|
||||
template<class T>
|
||||
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
|
||||
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
|
||||
template<class T>
|
||||
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
|
||||
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
|
||||
|
||||
template<class T>
|
||||
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
|
||||
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
|
||||
template<class T>
|
||||
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
|
||||
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
|
||||
|
||||
template<class T>
|
||||
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
|
||||
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
|
||||
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
|
||||
T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,
|
||||
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
|
||||
template<class T>
|
||||
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
|
||||
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
|
||||
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
|
||||
T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,
|
||||
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
|
||||
|
||||
template<class T>
|
||||
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
|
||||
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
|
||||
template<class T>
|
||||
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
|
||||
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
|
||||
|
||||
template<class T>
|
||||
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
|
||||
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
|
||||
}
|
||||
template<class T>
|
||||
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
|
||||
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ stereocsbp;
|
||||
using namespace ::cv::gpu::device::stereocsbp;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
@ -63,35 +63,34 @@ void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace surf
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
|
||||
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
|
||||
namespace surf
|
||||
{
|
||||
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
|
||||
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
|
||||
|
||||
void bindImgTex(DevMem2Db img);
|
||||
void bindSumTex(DevMem2D_<unsigned int> sum);
|
||||
void bindMaskSumTex(DevMem2D_<unsigned int> maskSum);
|
||||
void bindImgTex(DevMem2Db img);
|
||||
void bindSumTex(DevMem2D_<unsigned int> sum);
|
||||
void bindMaskSumTex(DevMem2D_<unsigned int> maskSum);
|
||||
|
||||
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);
|
||||
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);
|
||||
|
||||
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
|
||||
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
|
||||
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
|
||||
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
|
||||
|
||||
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
|
||||
float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
|
||||
unsigned int* featureCounter);
|
||||
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
|
||||
float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
|
||||
unsigned int* featureCounter);
|
||||
|
||||
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
|
||||
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
|
||||
|
||||
void compute_descriptors_gpu(const DevMem2Df& descriptors,
|
||||
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
|
||||
}
|
||||
void compute_descriptors_gpu(const DevMem2Df& descriptors,
|
||||
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
|
||||
}
|
||||
}}}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE_ surf;
|
||||
using namespace ::cv::gpu::device::surf;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
@ -225,7 +225,7 @@ TEST_P(InterpolateFrames, Regression)
|
||||
|
||||
#ifndef DUMP
|
||||
|
||||
EXPECT_MAT_NEAR(newFrame_gold, newFrame, 1e-4);
|
||||
EXPECT_MAT_NEAR(newFrame_gold, newFrame, 1e-3);
|
||||
|
||||
#else
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user