added TransformFunctorTraits, optimized some functions that use transform

This commit is contained in:
Vladislav Vinogradov 2011-08-17 11:32:24 +00:00
parent 6ce2277cc7
commit 5e9ae6b19f
11 changed files with 591 additions and 312 deletions

View File

@ -47,37 +47,33 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
namespace cv { namespace gpu { namespace device
{
//////////////////////////////////////////////////////////////////////////////////////
// Compare
template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
template <typename T> struct NotEqual : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
__device__ __forceinline__ uchar operator()(T src1, T src2) const
{
return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
}
};
template <typename T1, typename T2>
template <typename T>
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
{
NotEqual<T1, T2> op;
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
NotEqual<T> op;
transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);
}
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
{
compare_ne<uint, uint>(src1, src2, dst, stream);
compare_ne<uint>(src1, src2, dst, stream);
}
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
{
compare_ne<float, float>(src1, src2, dst, stream);
compare_ne<float>(src1, src2, dst, stream);
}
@ -354,6 +350,35 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////
// min/max
namespace detail
{
template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>
{
};
template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 4 };
};
template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 4 };
enum { smart_shift = 4 };
};
}
template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >
{
};
template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >
{
};
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
@ -413,7 +438,39 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////
// threshold
// threshold
namespace detail
{
template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>
{
};
template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 4 };
};
template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 4 };
enum { smart_shift = 4 };
};
}
template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >
{
};
template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >
{
};
template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >
{
};
template <template <typename> class Op, typename T>
void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal,
@ -454,8 +511,13 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////
// subtract
template <typename T>
void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
template <> struct TransformFunctorTraits< minus<short> > : DefaultTransformFunctorTraits< minus<short> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <typename T> void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
{
transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minus<T>(), stream);
}
@ -499,10 +561,35 @@ namespace cv { namespace gpu { namespace mathfunc
__device__ __forceinline__ float operator()(const float& e) const
{
return __powf(fabs(e), power);
return __powf(::fabs(e), power);
}
};
namespace detail
{
template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >
{
};
template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 8 };
};
template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >
{
enum { smart_shift = 4 };
};
template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >
{
enum { smart_block_dim_y = 4 };
enum { smart_shift = 4 };
};
}
template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>
{
};
template<typename T>
void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream)
{
@ -514,6 +601,5 @@ namespace cv { namespace gpu { namespace mathfunc
template void pow_caller<short>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
template void pow_caller<uint>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
}}}

View File

@ -40,14 +40,9 @@
//
//M*/
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795f

View File

@ -45,9 +45,7 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace matrix_operations {
namespace cv { namespace gpu { namespace device {
template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
@ -249,7 +247,55 @@ namespace cv { namespace gpu { namespace matrix_operations {
const double alpha, beta;
};
namespace detail
{
template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
{
};
template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 8 };
};
template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 2 };
};
template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 2 };
};
template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
{
};
}
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
{
};
template<typename T, typename D>
void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
{

View File

@ -71,23 +71,16 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
#include "opencv2/gpu/stream_accessor.hpp"
namespace cv
{
namespace gpu
{
namespace matrix_operations
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
namespace cv { namespace gpu { namespace device {
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}
}
}
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}}}
struct Stream::Impl
{
@ -108,14 +101,14 @@ namespace
void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
{
Scalar_<T> sf = s;
matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
device::set_to_gpu(src, sf.val, src.channels(), stream);
}
template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
{
Scalar_<T> sf = s;
matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
}
}
@ -262,7 +255,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
psrc = &(temp = src);
dst.create( src.size(), rtype );
matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
}
cv::gpu::Stream::operator bool() const

View File

@ -67,7 +67,6 @@ void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) {throw_nogpu(); return 0.0;}
void cv::gpu::pow(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
#else
@ -180,7 +179,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, StreamAccessor::getStream(stream));
}
namespace cv { namespace gpu { namespace mathfunc
namespace cv { namespace gpu { namespace device
{
template <typename T>
void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
@ -192,7 +191,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
{
CV_Assert(src1.size() == src2.size());
dst.create(src1.size(), src1.type());
mathfunc::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
device::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
}
else
nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));
@ -338,7 +337,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
//////////////////////////////////////////////////////////////////////////////
// Comparison of two matrixes
namespace cv { namespace gpu { namespace mathfunc
namespace cv { namespace gpu { namespace device
{
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
@ -375,7 +374,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
}
else
{
mathfunc::compare_ne_8uc4(src1, src2, dst, stream);
device::compare_ne_8uc4(src1, src2, dst, stream);
}
}
else
@ -393,7 +392,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
}
else
{
mathfunc::compare_ne_32f(src1, src2, dst, stream);
device::compare_ne_32f(src1, src2, dst, stream);
}
}
}
@ -402,7 +401,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
//////////////////////////////////////////////////////////////////////////////
// Unary bitwise logical operations
namespace cv { namespace gpu { namespace mathfunc
namespace cv { namespace gpu { namespace device
{
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream);
@ -416,7 +415,7 @@ namespace
{
dst.create(src.size(), src.type());
cv::gpu::mathfunc::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(),
cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(),
dst.channels(), src, dst, stream);
}
@ -426,10 +425,10 @@ namespace
using namespace cv::gpu;
typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
static Caller callers[] = {mathfunc::bitwiseMaskNotCaller<unsigned char>, mathfunc::bitwiseMaskNotCaller<unsigned char>,
mathfunc::bitwiseMaskNotCaller<unsigned short>, mathfunc::bitwiseMaskNotCaller<unsigned short>,
mathfunc::bitwiseMaskNotCaller<unsigned int>, mathfunc::bitwiseMaskNotCaller<unsigned int>,
mathfunc::bitwiseMaskNotCaller<unsigned int>};
static Caller callers[] = {device::bitwiseMaskNotCaller<unsigned char>, device::bitwiseMaskNotCaller<unsigned char>,
device::bitwiseMaskNotCaller<unsigned short>, device::bitwiseMaskNotCaller<unsigned short>,
device::bitwiseMaskNotCaller<unsigned int>, device::bitwiseMaskNotCaller<unsigned int>,
device::bitwiseMaskNotCaller<unsigned int>};
CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
dst.create(src.size(), src.type());
@ -456,7 +455,7 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
//////////////////////////////////////////////////////////////////////////////
// Binary bitwise logical operations
namespace cv { namespace gpu { namespace mathfunc
namespace cv { namespace gpu { namespace device
{
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
@ -482,7 +481,7 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
cv::gpu::mathfunc::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(),
cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(),
dst.channels(), src1, src2, dst, stream);
}
@ -492,10 +491,10 @@ namespace
using namespace cv::gpu;
typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
static Caller callers[] = {mathfunc::bitwiseMaskOrCaller<unsigned char>, mathfunc::bitwiseMaskOrCaller<unsigned char>,
mathfunc::bitwiseMaskOrCaller<unsigned short>, mathfunc::bitwiseMaskOrCaller<unsigned short>,
mathfunc::bitwiseMaskOrCaller<unsigned int>, mathfunc::bitwiseMaskOrCaller<unsigned int>,
mathfunc::bitwiseMaskOrCaller<unsigned int>};
static Caller callers[] = {device::bitwiseMaskOrCaller<unsigned char>, device::bitwiseMaskOrCaller<unsigned char>,
device::bitwiseMaskOrCaller<unsigned short>, device::bitwiseMaskOrCaller<unsigned short>,
device::bitwiseMaskOrCaller<unsigned int>, device::bitwiseMaskOrCaller<unsigned int>,
device::bitwiseMaskOrCaller<unsigned int>};
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
@ -513,7 +512,7 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
cv::gpu::mathfunc::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(),
cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(),
dst.channels(), src1, src2, dst, stream);
}
@ -523,10 +522,10 @@ namespace
using namespace cv::gpu;
typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
static Caller callers[] = {mathfunc::bitwiseMaskAndCaller<unsigned char>, mathfunc::bitwiseMaskAndCaller<unsigned char>,
mathfunc::bitwiseMaskAndCaller<unsigned short>, mathfunc::bitwiseMaskAndCaller<unsigned short>,
mathfunc::bitwiseMaskAndCaller<unsigned int>, mathfunc::bitwiseMaskAndCaller<unsigned int>,
mathfunc::bitwiseMaskAndCaller<unsigned int>};
static Caller callers[] = {device::bitwiseMaskAndCaller<unsigned char>, device::bitwiseMaskAndCaller<unsigned char>,
device::bitwiseMaskAndCaller<unsigned short>, device::bitwiseMaskAndCaller<unsigned short>,
device::bitwiseMaskAndCaller<unsigned int>, device::bitwiseMaskAndCaller<unsigned int>,
device::bitwiseMaskAndCaller<unsigned int>};
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
@ -544,7 +543,7 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
cv::gpu::mathfunc::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(),
cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(),
dst.channels(), src1, src2, dst, stream);
}
@ -554,10 +553,10 @@ namespace
using namespace cv::gpu;
typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
static Caller callers[] = {mathfunc::bitwiseMaskXorCaller<unsigned char>, mathfunc::bitwiseMaskXorCaller<unsigned char>,
mathfunc::bitwiseMaskXorCaller<unsigned short>, mathfunc::bitwiseMaskXorCaller<unsigned short>,
mathfunc::bitwiseMaskXorCaller<unsigned int>, mathfunc::bitwiseMaskXorCaller<unsigned int>,
mathfunc::bitwiseMaskXorCaller<unsigned int>};
static Caller callers[] = {device::bitwiseMaskXorCaller<unsigned char>, device::bitwiseMaskXorCaller<unsigned char>,
device::bitwiseMaskXorCaller<unsigned short>, device::bitwiseMaskXorCaller<unsigned short>,
device::bitwiseMaskXorCaller<unsigned int>, device::bitwiseMaskXorCaller<unsigned int>,
device::bitwiseMaskXorCaller<unsigned int>};
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
@ -601,7 +600,7 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
//////////////////////////////////////////////////////////////////////////////
// Minimum and maximum operations
namespace cv { namespace gpu { namespace mathfunc
namespace cv { namespace gpu { namespace device
{
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
@ -623,14 +622,14 @@ namespace
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
}
template <typename T>
@ -638,14 +637,14 @@ namespace
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}
template <typename T>
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
mathfunc::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
}
}
@ -709,7 +708,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
////////////////////////////////////////////////////////////////////////
// threshold
namespace cv { namespace gpu { namespace mathfunc
namespace cv { namespace gpu { namespace device
{
template <typename T>
void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,
@ -718,24 +717,25 @@ namespace cv { namespace gpu { namespace mathfunc
namespace
{
template <typename T>
void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
cudaStream_t stream)
template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
{
mathfunc::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
}
}
double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s)
{
CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
CV_Assert(type <= THRESH_TOZERO_INV);
dst.create(src.size(), src.type());
cudaStream_t stream = StreamAccessor::getStream(s);
if (src.type() == CV_32FC1 && type == THRESH_TRUNC)
{
NppStreamHandler h(stream);
dst.create(src.size(), src.type());
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
@ -761,12 +761,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
};
CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
CV_Assert(type <= THRESH_TOZERO_INV);
dst.create(src.size(), src.type());
if (src.depth() != CV_32F)
if (src.depth() != CV_32F && src.depth() != CV_64F)
{
thresh = cvFloor(thresh);
maxVal = cvRound(maxVal);
@ -781,17 +776,11 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
////////////////////////////////////////////////////////////////////////
// pow
namespace cv
namespace cv { namespace gpu { namespace device
{
namespace gpu
{
namespace mathfunc
{
template<typename T>
void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
}
}
}
template<typename T>
void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
}}}
void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
{
@ -802,9 +791,9 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
static const caller_t callers[] =
{
mathfunc::pow_caller<unsigned char>, mathfunc::pow_caller<signed char>,
mathfunc::pow_caller<unsigned short>, mathfunc::pow_caller<short>,
mathfunc::pow_caller<int>, mathfunc::pow_caller<float>
device::pow_caller<unsigned char>, device::pow_caller<signed char>,
device::pow_caller<unsigned short>, device::pow_caller<short>,
device::pow_caller<int>, device::pow_caller<float>
};
callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));

View File

@ -393,7 +393,7 @@ void cv::gpu::ensureSizeIsEnough(int, int, int, GpuMat&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace matrix_operations
namespace cv { namespace gpu { namespace device
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
@ -449,7 +449,7 @@ void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
else
{
mat.create(size(), type());
cv::gpu::matrix_operations::copy_to_with_mask(*this, mat, depth(), mask, channels());
device::copy_to_with_mask(*this, mat, depth(), mask, channels());
}
}
@ -508,7 +508,7 @@ namespace
void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
{
matrix_operations::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
}
}
@ -540,7 +540,7 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
dst.create( size(), rtype );
if (!noScale)
matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
else
{
typedef void (*convert_caller_t)(const GpuMat& src, GpuMat& dst);
@ -681,7 +681,7 @@ namespace
void kernelSet(GpuMat& src, const Scalar& s)
{
Scalar_<T> sf = s;
matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
device::set_to_gpu(src, sf.val, src.channels(), 0);
}
template<int SDEPTH, int SCN> struct NppSetMaskFunc
@ -732,7 +732,7 @@ namespace
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
Scalar_<T> sf = s;
matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
}
}

View File

@ -45,6 +45,7 @@
#include "internal_shared.hpp"
#include "../vec_traits.hpp"
#include "../functional.hpp"
namespace cv { namespace gpu { namespace device
{
@ -68,51 +69,17 @@ namespace cv { namespace gpu { namespace device
//! Read Write Traits
template <size_t src_elem_size, size_t dst_elem_size>
struct UnReadWriteTraits_
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
{
enum { shift = 1 };
};
template <size_t src_elem_size>
struct UnReadWriteTraits_<src_elem_size, 1>
{
enum { shift = 4 };
};
template <size_t src_elem_size>
struct UnReadWriteTraits_<src_elem_size, 2>
{
enum { shift = 2 };
};
template <typename T, typename D> struct UnReadWriteTraits
{
enum { shift = UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift };
typedef typename TypeVec<T, shift>::vec_type read_type;
typedef typename TypeVec<D, shift>::vec_type write_type;
};
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
struct BinReadWriteTraits_
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
{
enum { shift = 1 };
};
template <size_t src_elem_size1, size_t src_elem_size2>
struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
{
enum { shift = 4 };
};
template <size_t src_elem_size1, size_t src_elem_size2>
struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
{
enum { shift = 2 };
};
template <typename T1, typename T2, typename D> struct BinReadWriteTraits
{
enum {shift = BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
typedef typename TypeVec<T1, shift>::vec_type read_type1;
typedef typename TypeVec<T2, shift>::vec_type read_type2;
typedef typename TypeVec<D , shift>::vec_type write_type;
typedef typename TypeVec<D, shift>::vec_type write_type;
};
//! Transform kernels
@ -206,29 +173,73 @@ namespace cv { namespace gpu { namespace device
dst.w = op(src1.w, src2.w);
}
};
template <> struct OpUnroller<8>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.a0 = op(src.a0);
if (mask(y, x_shifted + 1))
dst.a1 = op(src.a1);
if (mask(y, x_shifted + 2))
dst.a2 = op(src.a2);
if (mask(y, x_shifted + 3))
dst.a3 = op(src.a3);
if (mask(y, x_shifted + 4))
dst.a4 = op(src.a4);
if (mask(y, x_shifted + 5))
dst.a5 = op(src.a5);
if (mask(y, x_shifted + 6))
dst.a6 = op(src.a6);
if (mask(y, x_shifted + 7))
dst.a7 = op(src.a7);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.a0 = op(src1.a0, src2.a0);
if (mask(y, x_shifted + 1))
dst.a1 = op(src1.a1, src2.a1);
if (mask(y, x_shifted + 2))
dst.a2 = op(src1.a2, src2.a2);
if (mask(y, x_shifted + 3))
dst.a3 = op(src1.a3, src2.a3);
if (mask(y, x_shifted + 4))
dst.a4 = op(src1.a4, src2.a4);
if (mask(y, x_shifted + 5))
dst.a5 = op(src1.a5, src2.a5);
if (mask(y, x_shifted + 6))
dst.a6 = op(src1.a6, src2.a6);
if (mask(y, x_shifted + 7))
dst.a7 = op(src1.a7, src2.a7);
}
};
template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, const UnOp op)
{
typedef typename UnReadWriteTraits<T, D>::read_type read_type;
typedef typename UnReadWriteTraits<T, D>::write_type write_type;
const int shift = UnReadWriteTraits<T, D>::shift;
typedef TransformFunctorTraits<UnOp> ft;
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * shift;
const int x_shifted = x * ft::smart_shift;
if (y < src_.rows)
{
const T* src = src_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + shift - 1 < src_.cols)
if (x_shifted + ft::smart_shift - 1 < src_.cols)
{
const read_type src_n_el = ((const read_type*)src)[x];
write_type dst_n_el;
OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
((write_type*)dst)[x] = dst_n_el;
}
@ -259,14 +270,14 @@ namespace cv { namespace gpu { namespace device
__global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_,
const Mask mask, const BinOp op)
{
typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
const int shift = BinReadWriteTraits<T1, T2, D>::shift;
typedef TransformFunctorTraits<BinOp> ft;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * shift;
const int x_shifted = x * ft::smart_shift;
if (y < src1_.rows)
{
@ -274,13 +285,13 @@ namespace cv { namespace gpu { namespace device
const T2* src2 = src2_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + shift - 1 < src1_.cols)
if (x_shifted + ft::smart_shift - 1 < src1_.cols)
{
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
write_type dst_n_el;
OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
((write_type*)dst)[x] = dst_n_el;
}
@ -308,7 +319,7 @@ namespace cv { namespace gpu { namespace device
const T2 src2_data = src2.ptr(y)[x];
dst.ptr(y)[x] = op(src1_data, src2_data);
}
}
}
template <bool UseSmart> struct TransformDispatcher;
template<> struct TransformDispatcher<false>
@ -316,11 +327,10 @@ namespace cv { namespace gpu { namespace device
template <typename T, typename D, typename UnOp, typename Mask>
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
typedef TransformFunctorTraits<UnOp> ft;
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
@ -332,11 +342,10 @@ namespace cv { namespace gpu { namespace device
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
typedef TransformFunctorTraits<BinOp> ft;
grid.x = divUp(src1.cols, threads.x);
grid.y = divUp(src1.rows, threads.y);
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
@ -350,13 +359,12 @@ namespace cv { namespace gpu { namespace device
template <typename T, typename D, typename UnOp, typename Mask>
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
{
const int shift = UnReadWriteTraits<T, D>::shift;
typedef TransformFunctorTraits<UnOp> ft;
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
StaticAssert<ft::smart_shift != 1>::check();
grid.x = divUp(src.cols, threads.x * shift);
grid.y = divUp(src.rows, threads.y);
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
@ -368,13 +376,12 @@ namespace cv { namespace gpu { namespace device
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
{
const int shift = BinReadWriteTraits<T1, T2, D>::shift;
typedef TransformFunctorTraits<BinOp> ft;
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
StaticAssert<ft::smart_shift != 1>::check();
grid.x = divUp(src1.cols, threads.x * shift);
grid.y = divUp(src1.rows, threads.y);
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
@ -382,44 +389,20 @@ namespace cv { namespace gpu { namespace device
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
{
static const bool value = false;
};
template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
{
static const bool value = UnReadWriteTraits<T, D>::shift != 1;
};
template <typename T, typename D> struct UseSmartUn
{
static const bool value = UseSmartUn_<T, D, VecTraits<T>::cn, VecTraits<D>::cn>::value;
};
template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
{
static const bool value = false;
};
template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
{
static const bool value = BinReadWriteTraits<T1, T2, D>::shift != 1;
};
template <typename T1, typename T2, typename D> struct UseSmartBin
{
static const bool value = UseSmartBin_<T1, T2, D, VecTraits<T1>::cn, VecTraits<T2>::cn, VecTraits<D>::cn>::value;
};
};
template <typename T, typename D, typename UnOp, typename Mask>
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
{
TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
typedef TransformFunctorTraits<UnOp> ft;
TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
{
TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
typedef TransformFunctorTraits<BinOp> ft;
TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
}
}
}}}

View File

@ -46,18 +46,25 @@
#include <thrust/functional.h>
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
namespace cv { namespace gpu { namespace device
{
// Function Objects
using thrust::unary_function;
using thrust::binary_function;
// Arithmetic Operations
using thrust::plus;
using thrust::minus;
using thrust::multiplies;
using thrust::divides;
using thrust::modulus;
using thrust::negate;
// Comparison Operations
using thrust::equal_to;
using thrust::not_equal_to;
@ -65,11 +72,15 @@ namespace cv { namespace gpu { namespace device
using thrust::less;
using thrust::greater_equal;
using thrust::less_equal;
// Logical Operations
using thrust::logical_and;
using thrust::logical_or;
using thrust::logical_not;
// Bitwise Operations
using thrust::bit_and;
using thrust::bit_or;
using thrust::bit_xor;
@ -78,7 +89,13 @@ namespace cv { namespace gpu { namespace device
__forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
};
using thrust::identity;
// Generalized Identity Operations
using thrust::identity;
using thrust::project1st;
using thrust::project2nd;
// Min/Max Operations
#define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
template <> struct name<type> : binary_function<type, type, type> \
@ -115,15 +132,8 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, fmin)
#undef OPENCV_GPU_IMPLEMENT_MINMAX
using thrust::project1st;
using thrust::project2nd;
using thrust::unary_negate;
using thrust::not1;
using thrust::binary_negate;
using thrust::not2;
// Math functions
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
template <typename T> struct func ## _func : unary_function<T, float> \
@ -192,6 +202,8 @@ namespace cv { namespace gpu { namespace device
}
};
// Saturate Cast Functor
template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
{
__forceinline__ __device__ D operator ()(const T& v) const
@ -200,6 +212,8 @@ namespace cv { namespace gpu { namespace device
}
};
// Threshold Functors
template <typename T> struct thresh_binary_func : unary_function<T, T>
{
__forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
@ -256,7 +270,15 @@ namespace cv { namespace gpu { namespace device
}
const T thresh;
};
};
// Function Object Adaptors
using thrust::unary_negate;
using thrust::not1;
using thrust::binary_negate;
using thrust::not2;
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
{
@ -291,46 +313,77 @@ namespace cv { namespace gpu { namespace device
return binder2nd<Op>(op, typename Op::second_argument_type(x));
}
template <typename T1, typename T2> struct BinOpTraits
// Functor Traits
template <typename F> struct IsUnaryFunction
{
typedef int argument_type;
struct Yes {};
struct No {Yes a[2];};
template <typename T, typename D> static Yes check(unary_function<T, D>*);
static No check(...);
enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
};
template <typename T> struct BinOpTraits<T, T>
template <typename F> struct IsBinaryFunction
{
typedef T argument_type;
struct Yes {};
struct No {Yes a[2];};
template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>*);
static No check(...);
enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
};
template <typename T> struct BinOpTraits<T, double>
namespace detail
{
typedef double argument_type;
};
template <typename T> struct BinOpTraits<double, T>
template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };
template <typename T, typename D> struct DefaultUnaryShift
{
enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };
};
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
template <typename T1, typename T2, typename D> struct DefaultBinaryShift
{
enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
};
template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
template <typename Func> struct ShiftDispatcher<Func, true>
{
enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
};
template <typename Func> struct ShiftDispatcher<Func, false>
{
enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
};
}
template <typename Func> struct DefaultTransformShift
{
typedef double argument_type;
enum { shift = detail::ShiftDispatcher<Func>::shift };
};
template <> struct BinOpTraits<double, double>
template <typename Func> struct DefaultTransformFunctorTraits
{
typedef double argument_type;
};
template <typename T> struct BinOpTraits<T, float>
{
typedef float argument_type;
};
template <typename T> struct BinOpTraits<float, T>
{
typedef float argument_type;
};
template <> struct BinOpTraits<float, float>
{
typedef float argument_type;
};
template <> struct BinOpTraits<double, float>
{
typedef double argument_type;
};
template <> struct BinOpTraits<float, double>
{
typedef double argument_type;
enum { simple_block_dim_x = 16 };
enum { simple_block_dim_y = 16 };
enum { smart_block_dim_x = 16 };
enum { smart_block_dim_y = 16 };
enum { smart_shift = DefaultTransformShift<Func>::shift };
};
template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};
}}}
#endif // __OPENCV_GPU_FUNCTIONAL_HPP__

View File

@ -150,6 +150,50 @@ namespace cv { namespace gpu { namespace device
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
}
namespace detail
{
template <typename T1, typename T2> struct BinOpTraits
{
typedef int argument_type;
};
template <typename T> struct BinOpTraits<T, T>
{
typedef T argument_type;
};
template <typename T> struct BinOpTraits<T, double>
{
typedef double argument_type;
};
template <typename T> struct BinOpTraits<double, T>
{
typedef double argument_type;
};
template <> struct BinOpTraits<double, double>
{
typedef double argument_type;
};
template <typename T> struct BinOpTraits<T, float>
{
typedef float argument_type;
};
template <typename T> struct BinOpTraits<float, T>
{
typedef float argument_type;
};
template <> struct BinOpTraits<float, float>
{
typedef float argument_type;
};
template <> struct BinOpTraits<double, float>
{
typedef double argument_type;
};
template <> struct BinOpTraits<float, double>
{
typedef double argument_type;
};
}
#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
{ \
@ -157,16 +201,16 @@ namespace cv { namespace gpu { namespace device
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
} \
template <typename T> \
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
{ \
func<typename BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
} \
template <typename T> \
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
{ \
func<typename BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
} \
static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
{ \
@ -174,16 +218,16 @@ namespace cv { namespace gpu { namespace device
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
} \
template <typename T> \
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
{ \
func<typename BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
} \
template <typename T> \
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
{ \
func<typename BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
} \
static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
{ \
@ -191,16 +235,16 @@ namespace cv { namespace gpu { namespace device
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
} \
template <typename T> \
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
{ \
func<typename BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
} \
template <typename T> \
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
{ \
func<typename BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
} \
static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
{ \
@ -208,16 +252,16 @@ namespace cv { namespace gpu { namespace device
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
} \
template <typename T> \
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
{ \
func<typename BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
} \
template <typename T> \
static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
{ \
func<typename BinOpTraits<T, type>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
func<typename detail::BinOpTraits<T, type>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
}
#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \

View File

@ -49,6 +49,79 @@ namespace cv { namespace gpu { namespace device
{
template<typename T, int N> struct TypeVec;
struct __align__(8) uchar8
{
uchar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
{
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(8) char8
{
schar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
{
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(16) ushort8
{
ushort a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
{
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(16) short8
{
short a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
{
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) uint8
{
uint a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
{
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) int8
{
int a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
{
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) float8
{
float a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
{
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct double8
{
double a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
{
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
@ -57,7 +130,9 @@ namespace cv { namespace gpu { namespace device
template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; };
template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
@ -74,11 +149,13 @@ namespace cv { namespace gpu { namespace device
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
template<typename T> struct VecTraits;
@ -87,36 +164,43 @@ namespace cv { namespace gpu { namespace device
{ \
typedef type elem_type; \
enum {cn=1}; \
static __device__ __host__ type all(type v) {return v;} \
static __device__ __host__ type make(type x) {return x;} \
static __device__ __host__ __forceinline__ type all(type v) {return v;} \
static __device__ __host__ __forceinline__ type make(type x) {return x;} \
}; \
template<> struct VecTraits<type ## 1> \
{ \
typedef type elem_type; \
enum {cn=1}; \
static __device__ __host__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
static __device__ __host__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
}; \
template<> struct VecTraits<type ## 2> \
{ \
typedef type elem_type; \
enum {cn=2}; \
static __device__ __host__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
static __device__ __host__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
}; \
template<> struct VecTraits<type ## 3> \
{ \
typedef type elem_type; \
enum {cn=3}; \
static __device__ __host__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
static __device__ __host__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
}; \
template<> struct VecTraits<type ## 4> \
{ \
typedef type elem_type; \
enum {cn=4}; \
static __device__ __host__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
static __device__ __host__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
}; \
template<> struct VecTraits<type ## 8> \
{ \
typedef type elem_type; \
enum {cn=8}; \
static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
};
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
@ -134,8 +218,8 @@ namespace cv { namespace gpu { namespace device
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ schar all(schar v) {return v;}
static __device__ __host__ schar make(schar x) {return x;}
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
};
}}}

View File

@ -286,7 +286,7 @@ TEST(BruteForceMatcher)
{
// Init CPU matcher
int desc_len = 64;
int desc_len = 128;
BruteForceMatcher< L2<float> > matcher;
@ -328,7 +328,7 @@ TEST(BruteForceMatcher)
d_matcher.knnMatch(d_query, d_train, d_matches, knn);
GPU_OFF;
/*SUBTEST << "radiusMatch";
SUBTEST << "radiusMatch";
float max_distance = 3.8f;
CPU_ON;
@ -337,7 +337,7 @@ TEST(BruteForceMatcher)
GPU_ON;
d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
GPU_OFF;*/
GPU_OFF;
}
@ -689,26 +689,7 @@ TEST(threshold)
Mat src, dst;
gpu::GpuMat d_src, d_dst;
for (int size = 2000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 8U, THRESH_TRUNC";
gen(src, size, size, CV_8U, 0, 100);
dst.create(size, size, CV_8U);
CPU_ON;
threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_8U);
GPU_ON;
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
GPU_OFF;
}
for (int size = 2000; size <= 4000; size += 1000)
for (int size = 1000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
@ -727,22 +708,47 @@ TEST(threshold)
GPU_OFF;
}
for (int size = 2000; size <= 4000; size += 1000)
for (int size = 1000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 32F, THRESH_TRUNC";
SUBTEST << "size " << size << ", 32F, THRESH_BINARY";
gen(src, size, size, CV_32F, 0, 100);
dst.create(size, size, CV_32F);
CPU_ON;
threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_32F);
GPU_ON;
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
GPU_OFF;
}
}
TEST(pow)
{
Mat src, dst;
gpu::GpuMat d_src, d_dst;
for (int size = 1000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 32F";
gen(src, size, size, CV_32F, 0, 100);
dst.create(size, size, CV_32F);
CPU_ON;
pow(src, -2.0, dst);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_32F);
GPU_ON;
gpu::pow(d_src, -2.0, d_dst);
GPU_OFF;
}
}