added TransformFunctorTraits, optimized some functions that use transform
This commit is contained in:
parent
6ce2277cc7
commit
5e9ae6b19f
@ -47,37 +47,33 @@
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Compare
|
||||
|
||||
template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
|
||||
template <typename T> struct NotEqual : binary_function<T, T, uchar>
|
||||
{
|
||||
__device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
|
||||
__device__ __forceinline__ uchar operator()(T src1, T src2) const
|
||||
{
|
||||
return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T1, typename T2>
|
||||
template <typename T>
|
||||
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
|
||||
{
|
||||
NotEqual<T1, T2> op;
|
||||
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
|
||||
NotEqual<T> op;
|
||||
transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);
|
||||
}
|
||||
|
||||
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
|
||||
{
|
||||
compare_ne<uint, uint>(src1, src2, dst, stream);
|
||||
compare_ne<uint>(src1, src2, dst, stream);
|
||||
}
|
||||
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
|
||||
{
|
||||
compare_ne<float, float>(src1, src2, dst, stream);
|
||||
compare_ne<float>(src1, src2, dst, stream);
|
||||
}
|
||||
|
||||
|
||||
@ -354,6 +350,35 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// min/max
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
};
|
||||
template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 4 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >
|
||||
{
|
||||
};
|
||||
template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >
|
||||
{
|
||||
};
|
||||
template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >
|
||||
{
|
||||
};
|
||||
template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >
|
||||
{
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
@ -413,7 +438,39 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// threshold
|
||||
// threshold
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
};
|
||||
template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 4 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >
|
||||
{
|
||||
};
|
||||
template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >
|
||||
{
|
||||
};
|
||||
template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >
|
||||
{
|
||||
};
|
||||
template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >
|
||||
{
|
||||
};
|
||||
template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >
|
||||
{
|
||||
};
|
||||
|
||||
template <template <typename> class Op, typename T>
|
||||
void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal,
|
||||
@ -454,8 +511,13 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// subtract
|
||||
|
||||
template <typename T>
|
||||
void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
|
||||
template <> struct TransformFunctorTraits< minus<short> > : DefaultTransformFunctorTraits< minus<short> >
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <typename T> void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
|
||||
{
|
||||
transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minus<T>(), stream);
|
||||
}
|
||||
@ -499,10 +561,35 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
|
||||
__device__ __forceinline__ float operator()(const float& e) const
|
||||
{
|
||||
return __powf(fabs(e), power);
|
||||
return __powf(::fabs(e), power);
|
||||
}
|
||||
};
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >
|
||||
{
|
||||
};
|
||||
template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 8 };
|
||||
};
|
||||
template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >
|
||||
{
|
||||
enum { smart_block_dim_y = 4 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>
|
||||
{
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream)
|
||||
{
|
||||
@ -514,6 +601,5 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
template void pow_caller<short>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
|
||||
template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
|
||||
template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
|
||||
template void pow_caller<uint>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
|
||||
template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
@ -40,14 +40,9 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
#ifndef CV_PI
|
||||
#define CV_PI 3.1415926535897932384626433832795f
|
||||
|
@ -45,9 +45,7 @@
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
#include "opencv2/gpu/device/functional.hpp"
|
||||
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace cv { namespace gpu { namespace matrix_operations {
|
||||
namespace cv { namespace gpu { namespace device {
|
||||
|
||||
template <typename T> struct shift_and_sizeof;
|
||||
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
|
||||
@ -249,7 +247,55 @@ namespace cv { namespace gpu { namespace matrix_operations {
|
||||
|
||||
const double alpha, beta;
|
||||
};
|
||||
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 8 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_shift = 2 };
|
||||
};
|
||||
|
||||
template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 2 };
|
||||
};
|
||||
|
||||
template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
|
||||
{
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
|
||||
{
|
||||
};
|
||||
|
||||
template<typename T, typename D>
|
||||
void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
|
||||
{
|
||||
|
@ -71,23 +71,16 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
|
||||
|
||||
#include "opencv2/gpu/stream_accessor.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace gpu
|
||||
{
|
||||
namespace matrix_operations
|
||||
{
|
||||
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
|
||||
namespace cv { namespace gpu { namespace device {
|
||||
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
|
||||
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
|
||||
|
||||
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
|
||||
}}}
|
||||
|
||||
struct Stream::Impl
|
||||
{
|
||||
@ -108,14 +101,14 @@ namespace
|
||||
void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
|
||||
{
|
||||
Scalar_<T> sf = s;
|
||||
matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
|
||||
device::set_to_gpu(src, sf.val, src.channels(), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
|
||||
{
|
||||
Scalar_<T> sf = s;
|
||||
matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
|
||||
device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -262,7 +255,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
|
||||
psrc = &(temp = src);
|
||||
|
||||
dst.create( src.size(), rtype );
|
||||
matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
|
||||
device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
|
||||
}
|
||||
|
||||
cv::gpu::Stream::operator bool() const
|
||||
|
@ -67,7 +67,6 @@ void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
|
||||
double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) {throw_nogpu(); return 0.0;}
|
||||
|
||||
void cv::gpu::pow(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
|
||||
|
||||
#else
|
||||
@ -180,7 +179,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
|
||||
nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
|
||||
@ -192,7 +191,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
|
||||
{
|
||||
CV_Assert(src1.size() == src2.size());
|
||||
dst.create(src1.size(), src1.type());
|
||||
mathfunc::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
|
||||
device::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
|
||||
}
|
||||
else
|
||||
nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));
|
||||
@ -338,7 +337,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Comparison of two matrixes
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
|
||||
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
|
||||
@ -375,7 +374,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
|
||||
}
|
||||
else
|
||||
{
|
||||
mathfunc::compare_ne_8uc4(src1, src2, dst, stream);
|
||||
device::compare_ne_8uc4(src1, src2, dst, stream);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -393,7 +392,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
|
||||
}
|
||||
else
|
||||
{
|
||||
mathfunc::compare_ne_32f(src1, src2, dst, stream);
|
||||
device::compare_ne_32f(src1, src2, dst, stream);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -402,7 +401,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Unary bitwise logical operations
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream);
|
||||
|
||||
@ -416,7 +415,7 @@ namespace
|
||||
{
|
||||
dst.create(src.size(), src.type());
|
||||
|
||||
cv::gpu::mathfunc::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(),
|
||||
cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(),
|
||||
dst.channels(), src, dst, stream);
|
||||
}
|
||||
|
||||
@ -426,10 +425,10 @@ namespace
|
||||
using namespace cv::gpu;
|
||||
|
||||
typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
|
||||
static Caller callers[] = {mathfunc::bitwiseMaskNotCaller<unsigned char>, mathfunc::bitwiseMaskNotCaller<unsigned char>,
|
||||
mathfunc::bitwiseMaskNotCaller<unsigned short>, mathfunc::bitwiseMaskNotCaller<unsigned short>,
|
||||
mathfunc::bitwiseMaskNotCaller<unsigned int>, mathfunc::bitwiseMaskNotCaller<unsigned int>,
|
||||
mathfunc::bitwiseMaskNotCaller<unsigned int>};
|
||||
static Caller callers[] = {device::bitwiseMaskNotCaller<unsigned char>, device::bitwiseMaskNotCaller<unsigned char>,
|
||||
device::bitwiseMaskNotCaller<unsigned short>, device::bitwiseMaskNotCaller<unsigned short>,
|
||||
device::bitwiseMaskNotCaller<unsigned int>, device::bitwiseMaskNotCaller<unsigned int>,
|
||||
device::bitwiseMaskNotCaller<unsigned int>};
|
||||
|
||||
CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
|
||||
dst.create(src.size(), src.type());
|
||||
@ -456,7 +455,7 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Binary bitwise logical operations
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
|
||||
|
||||
@ -482,7 +481,7 @@ namespace
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
cv::gpu::mathfunc::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(),
|
||||
cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(),
|
||||
dst.channels(), src1, src2, dst, stream);
|
||||
}
|
||||
|
||||
@ -492,10 +491,10 @@ namespace
|
||||
using namespace cv::gpu;
|
||||
|
||||
typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
|
||||
static Caller callers[] = {mathfunc::bitwiseMaskOrCaller<unsigned char>, mathfunc::bitwiseMaskOrCaller<unsigned char>,
|
||||
mathfunc::bitwiseMaskOrCaller<unsigned short>, mathfunc::bitwiseMaskOrCaller<unsigned short>,
|
||||
mathfunc::bitwiseMaskOrCaller<unsigned int>, mathfunc::bitwiseMaskOrCaller<unsigned int>,
|
||||
mathfunc::bitwiseMaskOrCaller<unsigned int>};
|
||||
static Caller callers[] = {device::bitwiseMaskOrCaller<unsigned char>, device::bitwiseMaskOrCaller<unsigned char>,
|
||||
device::bitwiseMaskOrCaller<unsigned short>, device::bitwiseMaskOrCaller<unsigned short>,
|
||||
device::bitwiseMaskOrCaller<unsigned int>, device::bitwiseMaskOrCaller<unsigned int>,
|
||||
device::bitwiseMaskOrCaller<unsigned int>};
|
||||
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
@ -513,7 +512,7 @@ namespace
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
cv::gpu::mathfunc::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(),
|
||||
cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(),
|
||||
dst.channels(), src1, src2, dst, stream);
|
||||
}
|
||||
|
||||
@ -523,10 +522,10 @@ namespace
|
||||
using namespace cv::gpu;
|
||||
|
||||
typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
|
||||
static Caller callers[] = {mathfunc::bitwiseMaskAndCaller<unsigned char>, mathfunc::bitwiseMaskAndCaller<unsigned char>,
|
||||
mathfunc::bitwiseMaskAndCaller<unsigned short>, mathfunc::bitwiseMaskAndCaller<unsigned short>,
|
||||
mathfunc::bitwiseMaskAndCaller<unsigned int>, mathfunc::bitwiseMaskAndCaller<unsigned int>,
|
||||
mathfunc::bitwiseMaskAndCaller<unsigned int>};
|
||||
static Caller callers[] = {device::bitwiseMaskAndCaller<unsigned char>, device::bitwiseMaskAndCaller<unsigned char>,
|
||||
device::bitwiseMaskAndCaller<unsigned short>, device::bitwiseMaskAndCaller<unsigned short>,
|
||||
device::bitwiseMaskAndCaller<unsigned int>, device::bitwiseMaskAndCaller<unsigned int>,
|
||||
device::bitwiseMaskAndCaller<unsigned int>};
|
||||
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
@ -544,7 +543,7 @@ namespace
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
cv::gpu::mathfunc::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(),
|
||||
cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(),
|
||||
dst.channels(), src1, src2, dst, stream);
|
||||
}
|
||||
|
||||
@ -554,10 +553,10 @@ namespace
|
||||
using namespace cv::gpu;
|
||||
|
||||
typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
|
||||
static Caller callers[] = {mathfunc::bitwiseMaskXorCaller<unsigned char>, mathfunc::bitwiseMaskXorCaller<unsigned char>,
|
||||
mathfunc::bitwiseMaskXorCaller<unsigned short>, mathfunc::bitwiseMaskXorCaller<unsigned short>,
|
||||
mathfunc::bitwiseMaskXorCaller<unsigned int>, mathfunc::bitwiseMaskXorCaller<unsigned int>,
|
||||
mathfunc::bitwiseMaskXorCaller<unsigned int>};
|
||||
static Caller callers[] = {device::bitwiseMaskXorCaller<unsigned char>, device::bitwiseMaskXorCaller<unsigned char>,
|
||||
device::bitwiseMaskXorCaller<unsigned short>, device::bitwiseMaskXorCaller<unsigned short>,
|
||||
device::bitwiseMaskXorCaller<unsigned int>, device::bitwiseMaskXorCaller<unsigned int>,
|
||||
device::bitwiseMaskXorCaller<unsigned int>};
|
||||
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
@ -601,7 +600,7 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Minimum and maximum operations
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
|
||||
@ -623,14 +622,14 @@ namespace
|
||||
{
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
|
||||
device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
|
||||
{
|
||||
dst.create(src1.size(), src1.type());
|
||||
mathfunc::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
|
||||
device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -638,14 +637,14 @@ namespace
|
||||
{
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
dst.create(src1.size(), src1.type());
|
||||
mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
|
||||
device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
|
||||
{
|
||||
dst.create(src1.size(), src1.type());
|
||||
mathfunc::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
|
||||
device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -709,7 +708,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// threshold
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T>
|
||||
void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,
|
||||
@ -718,24 +717,25 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T>
|
||||
void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
|
||||
cudaStream_t stream)
|
||||
template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
|
||||
{
|
||||
mathfunc::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
|
||||
device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
|
||||
}
|
||||
}
|
||||
|
||||
double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s)
|
||||
{
|
||||
CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
|
||||
CV_Assert(type <= THRESH_TOZERO_INV);
|
||||
|
||||
dst.create(src.size(), src.type());
|
||||
|
||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
||||
|
||||
if (src.type() == CV_32FC1 && type == THRESH_TRUNC)
|
||||
{
|
||||
NppStreamHandler h(stream);
|
||||
|
||||
dst.create(src.size(), src.type());
|
||||
|
||||
NppiSize sz;
|
||||
sz.width = src.cols;
|
||||
sz.height = src.rows;
|
||||
@ -761,12 +761,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
|
||||
threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
|
||||
};
|
||||
|
||||
CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
|
||||
CV_Assert(type <= THRESH_TOZERO_INV);
|
||||
|
||||
dst.create(src.size(), src.type());
|
||||
|
||||
if (src.depth() != CV_32F)
|
||||
if (src.depth() != CV_32F && src.depth() != CV_64F)
|
||||
{
|
||||
thresh = cvFloor(thresh);
|
||||
maxVal = cvRound(maxVal);
|
||||
@ -781,17 +776,11 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// pow
|
||||
|
||||
namespace cv
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace gpu
|
||||
{
|
||||
namespace mathfunc
|
||||
{
|
||||
template<typename T>
|
||||
void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
|
||||
}
|
||||
}
|
||||
}
|
||||
template<typename T>
|
||||
void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
|
||||
{
|
||||
@ -802,9 +791,9 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
mathfunc::pow_caller<unsigned char>, mathfunc::pow_caller<signed char>,
|
||||
mathfunc::pow_caller<unsigned short>, mathfunc::pow_caller<short>,
|
||||
mathfunc::pow_caller<int>, mathfunc::pow_caller<float>
|
||||
device::pow_caller<unsigned char>, device::pow_caller<signed char>,
|
||||
device::pow_caller<unsigned short>, device::pow_caller<short>,
|
||||
device::pow_caller<int>, device::pow_caller<float>
|
||||
};
|
||||
|
||||
callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));
|
||||
|
@ -393,7 +393,7 @@ void cv::gpu::ensureSizeIsEnough(int, int, int, GpuMat&) { throw_nogpu(); }
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
namespace cv { namespace gpu { namespace matrix_operations
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
|
||||
|
||||
@ -449,7 +449,7 @@ void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
|
||||
else
|
||||
{
|
||||
mat.create(size(), type());
|
||||
cv::gpu::matrix_operations::copy_to_with_mask(*this, mat, depth(), mask, channels());
|
||||
device::copy_to_with_mask(*this, mat, depth(), mask, channels());
|
||||
}
|
||||
}
|
||||
|
||||
@ -508,7 +508,7 @@ namespace
|
||||
|
||||
void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
|
||||
{
|
||||
matrix_operations::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
|
||||
device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -540,7 +540,7 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
|
||||
dst.create( size(), rtype );
|
||||
|
||||
if (!noScale)
|
||||
matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
|
||||
device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
|
||||
else
|
||||
{
|
||||
typedef void (*convert_caller_t)(const GpuMat& src, GpuMat& dst);
|
||||
@ -681,7 +681,7 @@ namespace
|
||||
void kernelSet(GpuMat& src, const Scalar& s)
|
||||
{
|
||||
Scalar_<T> sf = s;
|
||||
matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
|
||||
device::set_to_gpu(src, sf.val, src.channels(), 0);
|
||||
}
|
||||
|
||||
template<int SDEPTH, int SCN> struct NppSetMaskFunc
|
||||
@ -732,7 +732,7 @@ namespace
|
||||
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
|
||||
{
|
||||
Scalar_<T> sf = s;
|
||||
matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
|
||||
device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -45,6 +45,7 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "../vec_traits.hpp"
|
||||
#include "../functional.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
@ -68,51 +69,17 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
//! Read Write Traits
|
||||
|
||||
template <size_t src_elem_size, size_t dst_elem_size>
|
||||
struct UnReadWriteTraits_
|
||||
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
|
||||
{
|
||||
enum { shift = 1 };
|
||||
};
|
||||
template <size_t src_elem_size>
|
||||
struct UnReadWriteTraits_<src_elem_size, 1>
|
||||
{
|
||||
enum { shift = 4 };
|
||||
};
|
||||
template <size_t src_elem_size>
|
||||
struct UnReadWriteTraits_<src_elem_size, 2>
|
||||
{
|
||||
enum { shift = 2 };
|
||||
};
|
||||
template <typename T, typename D> struct UnReadWriteTraits
|
||||
{
|
||||
enum { shift = UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift };
|
||||
|
||||
typedef typename TypeVec<T, shift>::vec_type read_type;
|
||||
typedef typename TypeVec<D, shift>::vec_type write_type;
|
||||
};
|
||||
|
||||
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
|
||||
struct BinReadWriteTraits_
|
||||
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
|
||||
{
|
||||
enum { shift = 1 };
|
||||
};
|
||||
template <size_t src_elem_size1, size_t src_elem_size2>
|
||||
struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
|
||||
{
|
||||
enum { shift = 4 };
|
||||
};
|
||||
template <size_t src_elem_size1, size_t src_elem_size2>
|
||||
struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
|
||||
{
|
||||
enum { shift = 2 };
|
||||
};
|
||||
template <typename T1, typename T2, typename D> struct BinReadWriteTraits
|
||||
{
|
||||
enum {shift = BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
|
||||
|
||||
typedef typename TypeVec<T1, shift>::vec_type read_type1;
|
||||
typedef typename TypeVec<T2, shift>::vec_type read_type2;
|
||||
typedef typename TypeVec<D , shift>::vec_type write_type;
|
||||
typedef typename TypeVec<D, shift>::vec_type write_type;
|
||||
};
|
||||
|
||||
//! Transform kernels
|
||||
@ -206,29 +173,73 @@ namespace cv { namespace gpu { namespace device
|
||||
dst.w = op(src1.w, src2.w);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<8>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src.a7);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src1.a0, src2.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src1.a1, src2.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src1.a2, src2.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src1.a3, src2.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src1.a4, src2.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src1.a5, src2.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src1.a6, src2.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src1.a7, src2.a7);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
__global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, const UnOp op)
|
||||
{
|
||||
typedef typename UnReadWriteTraits<T, D>::read_type read_type;
|
||||
typedef typename UnReadWriteTraits<T, D>::write_type write_type;
|
||||
const int shift = UnReadWriteTraits<T, D>::shift;
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * shift;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src_.rows)
|
||||
{
|
||||
const T* src = src_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + shift - 1 < src_.cols)
|
||||
if (x_shifted + ft::smart_shift - 1 < src_.cols)
|
||||
{
|
||||
const read_type src_n_el = ((const read_type*)src)[x];
|
||||
write_type dst_n_el;
|
||||
|
||||
OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
|
||||
((write_type*)dst)[x] = dst_n_el;
|
||||
}
|
||||
@ -259,14 +270,14 @@ namespace cv { namespace gpu { namespace device
|
||||
__global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
|
||||
typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
|
||||
typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
|
||||
const int shift = BinReadWriteTraits<T1, T2, D>::shift;
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * shift;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src1_.rows)
|
||||
{
|
||||
@ -274,13 +285,13 @@ namespace cv { namespace gpu { namespace device
|
||||
const T2* src2 = src2_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + shift - 1 < src1_.cols)
|
||||
if (x_shifted + ft::smart_shift - 1 < src1_.cols)
|
||||
{
|
||||
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
|
||||
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
|
||||
write_type dst_n_el;
|
||||
|
||||
OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
|
||||
((write_type*)dst)[x] = dst_n_el;
|
||||
}
|
||||
@ -308,7 +319,7 @@ namespace cv { namespace gpu { namespace device
|
||||
const T2 src2_data = src2.ptr(y)[x];
|
||||
dst.ptr(y)[x] = op(src1_data, src2_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <bool UseSmart> struct TransformDispatcher;
|
||||
template<> struct TransformDispatcher<false>
|
||||
@ -316,11 +327,10 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(16, 16, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
grid.x = divUp(src.cols, threads.x);
|
||||
grid.y = divUp(src.rows, threads.y);
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
@ -332,11 +342,10 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(16, 16, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
grid.x = divUp(src1.cols, threads.x);
|
||||
grid.y = divUp(src1.rows, threads.y);
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
@ -350,13 +359,12 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
const int shift = UnReadWriteTraits<T, D>::shift;
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
dim3 threads(16, 16, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
grid.x = divUp(src.cols, threads.x * shift);
|
||||
grid.y = divUp(src.rows, threads.y);
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
@ -368,13 +376,12 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
const int shift = BinReadWriteTraits<T1, T2, D>::shift;
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
dim3 threads(16, 16, 1);
|
||||
dim3 grid(1, 1, 1);
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
grid.x = divUp(src1.cols, threads.x * shift);
|
||||
grid.y = divUp(src1.rows, threads.y);
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
@ -382,44 +389,20 @@ namespace cv { namespace gpu { namespace device
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
|
||||
{
|
||||
static const bool value = false;
|
||||
};
|
||||
template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
|
||||
{
|
||||
static const bool value = UnReadWriteTraits<T, D>::shift != 1;
|
||||
};
|
||||
template <typename T, typename D> struct UseSmartUn
|
||||
{
|
||||
static const bool value = UseSmartUn_<T, D, VecTraits<T>::cn, VecTraits<D>::cn>::value;
|
||||
};
|
||||
|
||||
template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
|
||||
{
|
||||
static const bool value = false;
|
||||
};
|
||||
template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
|
||||
{
|
||||
static const bool value = BinReadWriteTraits<T1, T2, D>::shift != 1;
|
||||
};
|
||||
template <typename T1, typename T2, typename D> struct UseSmartBin
|
||||
{
|
||||
static const bool value = UseSmartBin_<T1, T2, D, VecTraits<T1>::cn, VecTraits<T2>::cn, VecTraits<D>::cn>::value;
|
||||
};
|
||||
};
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
|
||||
}
|
||||
}
|
||||
}}}
|
||||
|
@ -46,18 +46,25 @@
|
||||
#include <thrust/functional.h>
|
||||
#include "internal_shared.hpp"
|
||||
#include "saturate_cast.hpp"
|
||||
#include "vec_traits.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
// Function Objects
|
||||
|
||||
using thrust::unary_function;
|
||||
using thrust::binary_function;
|
||||
|
||||
// Arithmetic Operations
|
||||
|
||||
using thrust::plus;
|
||||
using thrust::minus;
|
||||
using thrust::multiplies;
|
||||
using thrust::divides;
|
||||
using thrust::modulus;
|
||||
using thrust::negate;
|
||||
|
||||
// Comparison Operations
|
||||
|
||||
using thrust::equal_to;
|
||||
using thrust::not_equal_to;
|
||||
@ -65,11 +72,15 @@ namespace cv { namespace gpu { namespace device
|
||||
using thrust::less;
|
||||
using thrust::greater_equal;
|
||||
using thrust::less_equal;
|
||||
|
||||
// Logical Operations
|
||||
|
||||
using thrust::logical_and;
|
||||
using thrust::logical_or;
|
||||
using thrust::logical_not;
|
||||
|
||||
// Bitwise Operations
|
||||
|
||||
using thrust::bit_and;
|
||||
using thrust::bit_or;
|
||||
using thrust::bit_xor;
|
||||
@ -78,7 +89,13 @@ namespace cv { namespace gpu { namespace device
|
||||
__forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
|
||||
};
|
||||
|
||||
using thrust::identity;
|
||||
// Generalized Identity Operations
|
||||
|
||||
using thrust::identity;
|
||||
using thrust::project1st;
|
||||
using thrust::project2nd;
|
||||
|
||||
// Min/Max Operations
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
|
||||
template <> struct name<type> : binary_function<type, type, type> \
|
||||
@ -115,15 +132,8 @@ namespace cv { namespace gpu { namespace device
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, fmin)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_MINMAX
|
||||
|
||||
using thrust::project1st;
|
||||
using thrust::project2nd;
|
||||
|
||||
using thrust::unary_negate;
|
||||
using thrust::not1;
|
||||
|
||||
using thrust::binary_negate;
|
||||
using thrust::not2;
|
||||
// Math functions
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
|
||||
template <typename T> struct func ## _func : unary_function<T, float> \
|
||||
@ -192,6 +202,8 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
};
|
||||
|
||||
// Saturate Cast Functor
|
||||
|
||||
template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
|
||||
{
|
||||
__forceinline__ __device__ D operator ()(const T& v) const
|
||||
@ -200,6 +212,8 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
};
|
||||
|
||||
// Threshold Functors
|
||||
|
||||
template <typename T> struct thresh_binary_func : unary_function<T, T>
|
||||
{
|
||||
__forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
@ -256,7 +270,15 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
};
|
||||
};
|
||||
|
||||
// Function Object Adaptors
|
||||
|
||||
using thrust::unary_negate;
|
||||
using thrust::not1;
|
||||
|
||||
using thrust::binary_negate;
|
||||
using thrust::not2;
|
||||
|
||||
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
|
||||
{
|
||||
@ -291,46 +313,77 @@ namespace cv { namespace gpu { namespace device
|
||||
return binder2nd<Op>(op, typename Op::second_argument_type(x));
|
||||
}
|
||||
|
||||
template <typename T1, typename T2> struct BinOpTraits
|
||||
// Functor Traits
|
||||
|
||||
template <typename F> struct IsUnaryFunction
|
||||
{
|
||||
typedef int argument_type;
|
||||
struct Yes {};
|
||||
struct No {Yes a[2];};
|
||||
|
||||
template <typename T, typename D> static Yes check(unary_function<T, D>*);
|
||||
static No check(...);
|
||||
|
||||
enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, T>
|
||||
|
||||
template <typename F> struct IsBinaryFunction
|
||||
{
|
||||
typedef T argument_type;
|
||||
struct Yes {};
|
||||
struct No {Yes a[2];};
|
||||
|
||||
template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>*);
|
||||
static No check(...);
|
||||
|
||||
enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, double>
|
||||
|
||||
namespace detail
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<double, T>
|
||||
template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
|
||||
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
|
||||
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };
|
||||
|
||||
template <typename T, typename D> struct DefaultUnaryShift
|
||||
{
|
||||
enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };
|
||||
};
|
||||
|
||||
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
|
||||
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
|
||||
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
|
||||
|
||||
template <typename T1, typename T2, typename D> struct DefaultBinaryShift
|
||||
{
|
||||
enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
|
||||
};
|
||||
|
||||
template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
|
||||
template <typename Func> struct ShiftDispatcher<Func, true>
|
||||
{
|
||||
enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
|
||||
};
|
||||
template <typename Func> struct ShiftDispatcher<Func, false>
|
||||
{
|
||||
enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
|
||||
};
|
||||
}
|
||||
|
||||
template <typename Func> struct DefaultTransformShift
|
||||
{
|
||||
typedef double argument_type;
|
||||
enum { shift = detail::ShiftDispatcher<Func>::shift };
|
||||
};
|
||||
template <> struct BinOpTraits<double, double>
|
||||
|
||||
template <typename Func> struct DefaultTransformFunctorTraits
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, float>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<float, T>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<float, float>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<double, float>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<float, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
enum { simple_block_dim_x = 16 };
|
||||
enum { simple_block_dim_y = 16 };
|
||||
|
||||
enum { smart_block_dim_x = 16 };
|
||||
enum { smart_block_dim_y = 16 };
|
||||
enum { smart_shift = DefaultTransformShift<Func>::shift };
|
||||
};
|
||||
|
||||
template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_FUNCTIONAL_HPP__
|
||||
|
@ -150,6 +150,50 @@ namespace cv { namespace gpu { namespace device
|
||||
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
|
||||
}
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <typename T1, typename T2> struct BinOpTraits
|
||||
{
|
||||
typedef int argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, T>
|
||||
{
|
||||
typedef T argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<double, T>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<double, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<T, float>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <typename T> struct BinOpTraits<float, T>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<float, float>
|
||||
{
|
||||
typedef float argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<double, float>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
template <> struct BinOpTraits<float, double>
|
||||
{
|
||||
typedef double argument_type;
|
||||
};
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
|
||||
static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
|
||||
{ \
|
||||
@ -157,16 +201,16 @@ namespace cv { namespace gpu { namespace device
|
||||
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
|
||||
{ \
|
||||
func<typename BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
|
||||
{ \
|
||||
func<typename BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
|
||||
{ \
|
||||
@ -174,16 +218,16 @@ namespace cv { namespace gpu { namespace device
|
||||
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
|
||||
{ \
|
||||
func<typename BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
|
||||
{ \
|
||||
func<typename BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
|
||||
{ \
|
||||
@ -191,16 +235,16 @@ namespace cv { namespace gpu { namespace device
|
||||
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
|
||||
{ \
|
||||
func<typename BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
|
||||
{ \
|
||||
func<typename BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
|
||||
{ \
|
||||
@ -208,16 +252,16 @@ namespace cv { namespace gpu { namespace device
|
||||
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
|
||||
{ \
|
||||
func<typename BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
|
||||
{ \
|
||||
func<typename BinOpTraits<T, type>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
|
||||
func<typename detail::BinOpTraits<T, type>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
|
||||
|
@ -49,6 +49,79 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template<typename T, int N> struct TypeVec;
|
||||
|
||||
struct __align__(8) uchar8
|
||||
{
|
||||
uchar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
|
||||
{
|
||||
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(8) char8
|
||||
{
|
||||
schar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
|
||||
{
|
||||
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) ushort8
|
||||
{
|
||||
ushort a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
|
||||
{
|
||||
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) short8
|
||||
{
|
||||
short a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
|
||||
{
|
||||
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) uint8
|
||||
{
|
||||
uint a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
|
||||
{
|
||||
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) int8
|
||||
{
|
||||
int a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
|
||||
{
|
||||
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) float8
|
||||
{
|
||||
float a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
|
||||
{
|
||||
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct double8
|
||||
{
|
||||
double a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
|
||||
{
|
||||
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
|
||||
template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
|
||||
template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
|
||||
@ -57,7 +130,9 @@ namespace cv { namespace gpu { namespace device
|
||||
template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
|
||||
template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
|
||||
template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
|
||||
template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; };
|
||||
template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
|
||||
template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
|
||||
template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
|
||||
@ -74,11 +149,13 @@ namespace cv { namespace gpu { namespace device
|
||||
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
|
||||
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
|
||||
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
|
||||
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
|
||||
|
||||
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
|
||||
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
|
||||
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
|
||||
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
|
||||
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
|
||||
|
||||
template<typename T> struct VecTraits;
|
||||
|
||||
@ -87,36 +164,43 @@ namespace cv { namespace gpu { namespace device
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
enum {cn=1}; \
|
||||
static __device__ __host__ type all(type v) {return v;} \
|
||||
static __device__ __host__ type make(type x) {return x;} \
|
||||
static __device__ __host__ __forceinline__ type all(type v) {return v;} \
|
||||
static __device__ __host__ __forceinline__ type make(type x) {return x;} \
|
||||
}; \
|
||||
template<> struct VecTraits<type ## 1> \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
enum {cn=1}; \
|
||||
static __device__ __host__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
|
||||
static __device__ __host__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
|
||||
static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
|
||||
static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
|
||||
}; \
|
||||
template<> struct VecTraits<type ## 2> \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
enum {cn=2}; \
|
||||
static __device__ __host__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
|
||||
static __device__ __host__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
|
||||
static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
|
||||
static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
|
||||
}; \
|
||||
template<> struct VecTraits<type ## 3> \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
enum {cn=3}; \
|
||||
static __device__ __host__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
|
||||
static __device__ __host__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
|
||||
static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
|
||||
static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
|
||||
}; \
|
||||
template<> struct VecTraits<type ## 4> \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
enum {cn=4}; \
|
||||
static __device__ __host__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
|
||||
static __device__ __host__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
|
||||
static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
|
||||
static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
|
||||
}; \
|
||||
template<> struct VecTraits<type ## 8> \
|
||||
{ \
|
||||
typedef type elem_type; \
|
||||
enum {cn=8}; \
|
||||
static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
|
||||
static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
|
||||
};
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
|
||||
@ -134,8 +218,8 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef schar elem_type;
|
||||
enum {cn=1};
|
||||
static __device__ __host__ schar all(schar v) {return v;}
|
||||
static __device__ __host__ schar make(schar x) {return x;}
|
||||
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
|
||||
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
|
||||
};
|
||||
}}}
|
||||
|
||||
|
@ -286,7 +286,7 @@ TEST(BruteForceMatcher)
|
||||
{
|
||||
// Init CPU matcher
|
||||
|
||||
int desc_len = 64;
|
||||
int desc_len = 128;
|
||||
|
||||
BruteForceMatcher< L2<float> > matcher;
|
||||
|
||||
@ -328,7 +328,7 @@ TEST(BruteForceMatcher)
|
||||
d_matcher.knnMatch(d_query, d_train, d_matches, knn);
|
||||
GPU_OFF;
|
||||
|
||||
/*SUBTEST << "radiusMatch";
|
||||
SUBTEST << "radiusMatch";
|
||||
float max_distance = 3.8f;
|
||||
|
||||
CPU_ON;
|
||||
@ -337,7 +337,7 @@ TEST(BruteForceMatcher)
|
||||
|
||||
GPU_ON;
|
||||
d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
|
||||
GPU_OFF;*/
|
||||
GPU_OFF;
|
||||
}
|
||||
|
||||
|
||||
@ -689,26 +689,7 @@ TEST(threshold)
|
||||
Mat src, dst;
|
||||
gpu::GpuMat d_src, d_dst;
|
||||
|
||||
for (int size = 2000; size <= 4000; size += 1000)
|
||||
{
|
||||
SUBTEST << "size " << size << ", 8U, THRESH_TRUNC";
|
||||
|
||||
gen(src, size, size, CV_8U, 0, 100);
|
||||
dst.create(size, size, CV_8U);
|
||||
|
||||
CPU_ON;
|
||||
threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
|
||||
CPU_OFF;
|
||||
|
||||
d_src = src;
|
||||
d_dst.create(size, size, CV_8U);
|
||||
|
||||
GPU_ON;
|
||||
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
|
||||
GPU_OFF;
|
||||
}
|
||||
|
||||
for (int size = 2000; size <= 4000; size += 1000)
|
||||
for (int size = 1000; size <= 4000; size += 1000)
|
||||
{
|
||||
SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
|
||||
|
||||
@ -727,22 +708,47 @@ TEST(threshold)
|
||||
GPU_OFF;
|
||||
}
|
||||
|
||||
for (int size = 2000; size <= 4000; size += 1000)
|
||||
for (int size = 1000; size <= 4000; size += 1000)
|
||||
{
|
||||
SUBTEST << "size " << size << ", 32F, THRESH_TRUNC";
|
||||
SUBTEST << "size " << size << ", 32F, THRESH_BINARY";
|
||||
|
||||
gen(src, size, size, CV_32F, 0, 100);
|
||||
dst.create(size, size, CV_32F);
|
||||
|
||||
CPU_ON;
|
||||
threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
|
||||
threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
|
||||
CPU_OFF;
|
||||
|
||||
d_src = src;
|
||||
d_dst.create(size, size, CV_32F);
|
||||
|
||||
GPU_ON;
|
||||
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
|
||||
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
|
||||
GPU_OFF;
|
||||
}
|
||||
}
|
||||
|
||||
TEST(pow)
|
||||
{
|
||||
Mat src, dst;
|
||||
gpu::GpuMat d_src, d_dst;
|
||||
|
||||
for (int size = 1000; size <= 4000; size += 1000)
|
||||
{
|
||||
SUBTEST << "size " << size << ", 32F";
|
||||
|
||||
gen(src, size, size, CV_32F, 0, 100);
|
||||
dst.create(size, size, CV_32F);
|
||||
|
||||
CPU_ON;
|
||||
pow(src, -2.0, dst);
|
||||
CPU_OFF;
|
||||
|
||||
d_src = src;
|
||||
d_dst.create(size, size, CV_32F);
|
||||
|
||||
GPU_ON;
|
||||
gpu::pow(d_src, -2.0, d_dst);
|
||||
GPU_OFF;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user