diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp index 6e0471e9a..cd63c3ac9 100644 --- a/modules/gpu/include/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp @@ -357,6 +357,9 @@ namespace cv { namespace gpu { namespace device { return abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char> { @@ -364,6 +367,9 @@ namespace cv { namespace gpu { namespace device { return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<signed char> : unary_function<signed char, signed char> { @@ -371,6 +377,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<char> : unary_function<char, char> { @@ -378,6 +387,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short> { @@ -385,6 +397,9 @@ namespace cv { namespace gpu { namespace device { return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<short> : unary_function<short, short> { @@ -392,6 +407,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int> { @@ -399,6 +417,9 @@ namespace cv { namespace gpu { namespace device { 
return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<int> : unary_function<int, int> { @@ -406,6 +427,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<float> : unary_function<float, float> { @@ -413,6 +437,9 @@ namespace cv { namespace gpu { namespace device { return ::fabsf(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func<double> : unary_function<double, double> { @@ -420,6 +447,9 @@ namespace cv { namespace gpu { namespace device { return ::fabs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \ @@ -429,6 +459,8 @@ namespace cv { namespace gpu { namespace device { \ return func ## f(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + __device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; \ template <> struct name ## _func<double> : unary_function<double, double> \ { \ @@ -436,6 +468,8 @@ namespace cv { namespace gpu { namespace device { \ return func(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + __device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \ diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index c61601d4f..eaf577bac 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -42,405 +42,833 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/vec_math.hpp" #include 
"opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { namespace device -{ - ////////////////////////////////////////////////////////////////////////// - // add +using namespace cv::gpu; +using namespace cv::gpu::device; - template <typename T, typename D> struct Add : binary_function<T, T, D> +////////////////////////////////////////////////////////////////////////// +// addMat + +namespace +{ + template <typename T, typename D> struct VAdd4; + template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint> + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4<uint, uint>& other) {} + }; + template <> struct VAdd4<int, uint> : binary_function<int, int, uint> + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + 
asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4<int, uint>& other) {} + }; + template <> struct VAdd4<uint, int> : binary_function<uint, uint, int> + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4<uint, int>& other) {} + }; + template <> struct VAdd4<int, int> : binary_function<int, int, int> + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4<int, int>& other) {} + }; + + //////////////////////////////////// + + template <typename T, typename D> struct VAdd2; + 
template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint> + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2<uint, uint>& other) {} + }; + template <> struct VAdd2<uint, int> : binary_function<uint, uint, int> + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2<uint, int>& other) {} + }; + template <> struct VAdd2<int, uint> : binary_function<int, int, uint> + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2<int, uint>& other) {} + }; + template <> struct VAdd2<int, int> : binary_function<int, int, 
int> + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2<int, int>& other) {} + }; + + //////////////////////////////////// + + template <typename T, typename D> struct AddMat : binary_function<T, T, D> { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast<D>(a + b); } + + __device__ __forceinline__ AddMat() {} + __device__ __forceinline__ AddMat(const AddMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <typename T, typename D> struct TransformFunctorTraits< VAdd4<T, D> > : DefaultTransformFunctorTraits< VAdd4<T, D> > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< Add<ushort, ushort> > : DefaultTransformFunctorTraits< Add<ushort, ushort> > + //////////////////////////////////// + + template <typename T, typename D> struct TransformFunctorTraits< VAdd2<T, D> > : DefaultTransformFunctorTraits< VAdd2<T, D> > { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add<short, short> > : DefaultTransformFunctorTraits< Add<short, short> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add<int, int> > : DefaultTransformFunctorTraits< Add<int, int> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add<float, float> > : DefaultTransformFunctorTraits< Add<float, float> > - { - enum { smart_block_dim_y = 8 
}; - enum { smart_shift = 4 }; + enum { smart_shift = 2 }; }; - template <typename T, typename D> void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + //////////////////////////////////// + + template <> struct TransformFunctorTraits< AddMat<ushort, ushort> > : DefaultTransformFunctorTraits< AddMat<ushort, ushort> > { - if (mask.data) - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Add<T, D>(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Add<T, D>(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddMat<short, short> > : DefaultTransformFunctorTraits< AddMat<short, short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddMat<int, int> > : DefaultTransformFunctorTraits< AddMat<int, int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddMat<float, float> > : DefaultTransformFunctorTraits< AddMat<float, float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T, typename D> + void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream); } - template void add_gpu<uchar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<uchar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& 
dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void add_gpu<schar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t 
stream); - //template void add_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu<ushort, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<ushort, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<ushort, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<ushort, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<ushort, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu<short, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<short, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<short, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<short, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& 
mask, cudaStream_t stream); - template void add_gpu<short, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu<int, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<int, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<int, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<int, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<int, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu<float, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<float, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<float, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& 
mask, cudaStream_t stream); - //template void add_gpu<float, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<float, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu<double, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template <typename T, typename D> struct AddScalar : unary_function<T, D> + template <typename T, typename D> + void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - AddScalar(double val_) : val(val_) {} + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream); + } + + template void vadd2<uint, uint>(PtrStepSzb src1, 
PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template <typename T, typename D> + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream); + else + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream); + } + + template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t 
stream); + template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, 
cudaStream_t stream); + template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, 
cudaStream_t stream); + + //template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// addScalar + +namespace +{ + template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D> + { + S val; + + explicit AddScalar(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { return saturate_cast<D>(a + val); } - const double val; }; +} - template <> struct TransformFunctorTraits< AddScalar<ushort, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, ushort> > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< AddScalar<ushort, float, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, float, ushort> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar<short, short> > : DefaultTransformFunctorTraits< AddScalar<short, short> > + template <> struct TransformFunctorTraits< AddScalar<short, float, short> > : DefaultTransformFunctorTraits< AddScalar<short, float, short> > { enum { smart_block_dim_y = 8 }; enum { 
smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar<int, int> > : DefaultTransformFunctorTraits< AddScalar<int, int> > + template <> struct TransformFunctorTraits< AddScalar<int, float, int> > : DefaultTransformFunctorTraits< AddScalar<int, float, int> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar<float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float> > + template <> struct TransformFunctorTraits< AddScalar<float, float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float, float> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template <typename T, typename D> void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) +namespace arithm +{ + template <typename T, typename S, typename D> + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AddScalar<T, D> op(val); + AddScalar<T, S, D> op(static_cast<S>(val)); + if (mask.data) - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, SingleMask(mask), stream); + transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream); else - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream); + transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); } - template void add_gpu<uchar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<uchar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, short >(const PtrStepSzb& src1, double val, 
const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu<schar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void 
add_gpu<schar, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu<ushort, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<ushort, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<ushort, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<ushort, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const 
PtrStepb& mask, cudaStream_t stream); - template void add_gpu<ushort, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu<short, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<short, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<short, float>(const 
PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu<int, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<int, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<int, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<int, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<int, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template 
void add_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu<float, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<float, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<float, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<float, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<float, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& 
mask, cudaStream_t stream); + //template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu<double, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu<double, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar<double, double, 
uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // subtract +////////////////////////////////////////////////////////////////////////// +// subMat - template <typename T, typename D> struct Subtract : binary_function<T, T, D> +namespace +{ + template <typename T, typename D> struct VSub4; + template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint> + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ 
__forceinline__ VSub4(const VSub4<uint, uint>& other) {} + }; + template <> struct VSub4<int, uint> : binary_function<int, int, uint> + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4<int, uint>& other) {} + }; + template <> struct VSub4<uint, int> : binary_function<uint, uint, int> + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4<uint, int>& other) {} + }; + template <> struct VSub4<int, int> : binary_function<int, int, int> + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), 
"r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4<int, int>& other) {} + }; + + //////////////////////////////////// + + template <typename T, typename D> struct VSub2; + template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint> + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2<uint, uint>& other) {} + }; + template <> struct VSub2<uint, int> : binary_function<uint, uint, int> + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2<uint, int>& other) {} + }; + template <> struct VSub2<int, uint> : 
binary_function<int, int, uint> + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2<int, uint>& other) {} + }; + template <> struct VSub2<int, int> : binary_function<int, int, int> + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2<int, int>& other) {} + }; + + //////////////////////////////////// + + template <typename T, typename D> struct SubMat : binary_function<T, T, D> { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast<D>(a - b); } + + __device__ __forceinline__ SubMat() {} + __device__ __forceinline__ SubMat(const SubMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <typename T, typename D> struct TransformFunctorTraits< VSub4<T, D> > : DefaultTransformFunctorTraits< VSub4<T, D> > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< Subtract<ushort, ushort> > : DefaultTransformFunctorTraits< Subtract<ushort, ushort> > + //////////////////////////////////// + + template <typename T, typename D> struct 
TransformFunctorTraits< VSub2<T, D> > : DefaultTransformFunctorTraits< VSub2<T, D> > { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract<short, short> > : DefaultTransformFunctorTraits< Subtract<short, short> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract<int, int> > : DefaultTransformFunctorTraits< Subtract<int, int> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract<float, float> > : DefaultTransformFunctorTraits< Subtract<float, float> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + enum { smart_shift = 2 }; }; - template <typename T, typename D> void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + //////////////////////////////////// + + template <> struct TransformFunctorTraits< SubMat<ushort, ushort> > : DefaultTransformFunctorTraits< SubMat<ushort, ushort> > { - if (mask.data) - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Subtract<T, D>(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Subtract<T, D>(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubMat<short, short> > : DefaultTransformFunctorTraits< SubMat<short, short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubMat<int, int> > : DefaultTransformFunctorTraits< SubMat<int, int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubMat<float, float> > : DefaultTransformFunctorTraits< SubMat<float, float> > + { + enum { smart_block_dim_y 
= 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T, typename D> + void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream); } - template void subtract_gpu<uchar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<uchar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void subtract_gpu<schar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - 
//template void subtract_gpu<schar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu<ushort, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<ushort, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<ushort, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<ushort, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<ushort, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<ushort, double>(const PtrStepSzb& src1, const 
PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu<short, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<short, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<short, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<short, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<short, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu<int, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<int, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<int, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<int, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template 
void subtract_gpu<int, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu<float, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<float, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<float, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<float, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<float, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu<double, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, 
const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template <typename T, typename D> struct SubtractScalar : unary_function<T, D> + template <typename T, typename D> + void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - SubtractScalar(double val_) : val(val_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return saturate_cast<D>(a - val); - } - const double val; - }; - - template <> struct TransformFunctorTraits< SubtractScalar<ushort, ushort> > : DefaultTransformFunctorTraits< SubtractScalar<ushort, ushort> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar<short, short> > : DefaultTransformFunctorTraits< SubtractScalar<short, short> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar<int, int> > : DefaultTransformFunctorTraits< SubtractScalar<int, int> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar<float, float> > : DefaultTransformFunctorTraits< SubtractScalar<float, float> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <typename T, typename D> void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - SubtractScalar<T, D> 
op(val); - if (mask.data) - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream); + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream); } - template void subtract_gpu<uchar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<uchar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void subtract_gpu<schar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, schar>(const 
PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template <typename T, typename D> + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream); + else + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream); + } - //template void subtract_gpu<ushort, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<ushort, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<ushort, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<ushort, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void 
subtract_gpu<ushort, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu<short, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<short, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& 
mask, cudaStream_t stream); - template void subtract_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void subMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu<int, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<int, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<int, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<int, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<int, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<int, double>(const PtrStepSzb& src1, double val, const 
PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu<float, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<float, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<float, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<float, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<float, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void 
subMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu<double, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu<double, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, 
cudaStream_t stream); + //template void subMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - ////////////////////////////////////////////////////////////////////////// - // multiply + //template void subMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - struct multiply_8uc4_32f : binary_function<uint, float, uint> + //template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subScalar + +namespace arithm +{ + template <typename T, typename S, typename D> + void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + AddScalar<T, S, D> op(-static_cast<S>(val)); + + if (mask.data) + transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream); + else + transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); + } + + template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<uchar, double, 
double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + 
//template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulMat + +namespace +{ + struct Mul_8uc4_32f : binary_function<uint, float, uint> { __device__ __forceinline__ uint operator ()(uint a, float b) const { @@ -453,301 +881,316 @@ namespace cv { namespace gpu { namespace device return res; } + + __device__ __forceinline__ Mul_8uc4_32f() {} + __device__ __forceinline__ 
Mul_8uc4_32f(const Mul_8uc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f) - { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; - - void multiply_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz<uint> >(src1), src2, static_cast< PtrStepSz<uint> >(dst), multiply_8uc4_32f(), WithOutMask(), stream); - } - - struct multiply_16sc4_32f : binary_function<short4, float, short4> + struct Mul_16sc4_32f : binary_function<short4, float, short4> { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b), saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b)); } + + __device__ __forceinline__ Mul_16sc4_32f() {} + __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f) + template <typename T, typename D> struct Mul : binary_function<T, T, D> { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast<D>(a * b); + } + + __device__ __forceinline__ Mul() {} + __device__ __forceinline__ Mul(const Mul& other) {} }; - void multiply_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream) + template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D> { - cv::gpu::device::transform(static_cast< PtrStepSz<short4> >(src1), src2, static_cast< PtrStepSz<short4> >(dst), multiply_16sc4_32f(), WithOutMask(), stream); - } + S scale; + + explicit MulScale(S scale_) : scale(scale_) {} - template <typename T, typename D> struct Multiply : binary_function<T, T, D> - { - Multiply(float scale_) : scale(scale_) {} __device__ 
__forceinline__ D operator ()(T a, T b) const { return saturate_cast<D>(scale * a * b); } - const float scale; - }; - template <typename T> struct Multiply<T, double> : binary_function<T, T, double> - { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ double operator ()(T a, T b) const - { - return scale * a * b; - } - const double scale; - }; - template <> struct Multiply<int, int> : binary_function<int, int, int> - { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ int operator ()(int a, int b) const - { - return saturate_cast<int>(scale * a * b); - } - const double scale; }; +} - template <> struct TransformFunctorTraits< Multiply<ushort, ushort> > : DefaultTransformFunctorTraits< Multiply<ushort, ushort> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply<short, short> > : DefaultTransformFunctorTraits< Multiply<short, short> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply<int, int> > : DefaultTransformFunctorTraits< Multiply<int, int> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply<float, float> > : DefaultTransformFunctorTraits< Multiply<float, float> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <typename T, typename D> struct MultiplyCaller - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - Multiply<T, D> op(static_cast<float>(scale)); - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream); - } - }; - template <typename T> struct MultiplyCaller<T, double> - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - 
cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply<T, double> op(scale); - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<double>)dst, op, WithOutMask(), stream); - } - }; - template <> struct MultiplyCaller<int, int> - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply<int, int> op(scale); - cv::gpu::device::transform((PtrStepSz<int>)src1, (PtrStepSz<int>)src2, (PtrStepSz<int>)dst, op, WithOutMask(), stream); - } - }; - - template <typename T, typename D> void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - MultiplyCaller<T, D>::call(src1, src2, dst, scale, stream); - } - - template void multiply_gpu<uchar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<uchar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<schar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<ushort, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<ushort, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<ushort, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<ushort, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<ushort, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& 
dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<short, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<short, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<short, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<short, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<short, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<int, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<int, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<int, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<int, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<int, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t 
stream); - template void multiply_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<float, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<float, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<float, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<float, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<float, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<double, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - 
//template void multiply_gpu<double, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template <typename T, typename D> struct MultiplyScalar : unary_function<T, D> - { - MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return saturate_cast<D>(scale * a * val); - } - const double val; - const double scale; - }; - - template <> struct TransformFunctorTraits< MultiplyScalar<ushort, ushort> > : DefaultTransformFunctorTraits< MultiplyScalar<ushort, ushort> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar<short, short> > : DefaultTransformFunctorTraits< MultiplyScalar<short, short> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar<int, int> > : DefaultTransformFunctorTraits< MultiplyScalar<int, int> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar<float, float> > : DefaultTransformFunctorTraits< MultiplyScalar<float, float> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <typename T, typename D> void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - MultiplyScalar<T, D> op(val, scale); - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream); - } - - template void multiply_gpu<uchar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - 
//template void multiply_gpu<uchar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<schar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<ushort, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void 
multiply_gpu<ushort, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<ushort, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<ushort, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<ushort, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<short, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<short, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<short, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<short, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<short, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<int, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<int, schar >(const 
PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<int, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<int, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<int, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<float, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<float, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<float, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<float, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<float, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu<double, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, schar >(const PtrStepSzb& src1, double val, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu<double, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // divide - - struct divide_8uc4_32f : binary_function<uchar4, float, uchar4> - { - __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const - { - return b != 0 ? 
make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b), - saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b)) - : make_uchar4(0,0,0,0); - } - }; - - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f) +namespace cv { namespace gpu { namespace device +{ + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Mul_8uc4_32f) { enum { smart_block_dim_x = 8 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 8 }; }; - void divide_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream) + template <> struct TransformFunctorTraits< Mul<ushort, ushort> > : DefaultTransformFunctorTraits< Mul<ushort, ushort> > { - cv::gpu::device::transform(static_cast< PtrStepSz<uchar4> >(src1), src2, static_cast< PtrStepSz<uchar4> >(dst), divide_8uc4_32f(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Mul<short, short> > : DefaultTransformFunctorTraits< Mul<short, short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Mul<int, int> > : DefaultTransformFunctorTraits< Mul<int, int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Mul<float, float> > : DefaultTransformFunctorTraits< Mul<float, float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template <> struct TransformFunctorTraits< MulScale<ushort, float, ushort> > : DefaultTransformFunctorTraits< MulScale<ushort, float, ushort> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale<short, float, short> > : DefaultTransformFunctorTraits< MulScale<short, float, short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale<int, float, int> > : DefaultTransformFunctorTraits< 
MulScale<int, float, int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale<float, float, float> > : DefaultTransformFunctorTraits< MulScale<float, float, float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream) + { + transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream); } + void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream) + { + transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream); + } - struct divide_16sc4_32f : binary_function<short4, float, short4> + template <typename T, typename S, typename D> + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) + { + Mul<T, D> op; + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); + } + else + { + MulScale<T, S, D> op(static_cast<S>(scale)); + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); + } + } + + template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, 
cudaStream_t stream); + template void mulMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void mulMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + 
//template void mulMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<float, float, 
schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulScalar + +namespace +{ + template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D> + { + S val; + + explicit MulScalar(S val_) : val(val_) {} + + __device__ __forceinline__ D operator ()(T a) 
const + { + return saturate_cast<D>(a * val); + } + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< MulScalar<ushort, float, ushort> > : DefaultTransformFunctorTraits< MulScalar<ushort, float, ushort> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar<short, float, short> > : DefaultTransformFunctorTraits< MulScalar<short, float, short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar<int, float, int> > : DefaultTransformFunctorTraits< MulScalar<int, float, int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar<float, float, float> > : DefaultTransformFunctorTraits< MulScalar<float, float, float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T, typename S, typename D> + void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) + { + MulScalar<T, S, D> op(static_cast<S>(val)); + transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); + } + + template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<uchar, double, 
double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void mulScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<short, float, ushort>(PtrStepSzb src1, double val, 
PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void 
mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divMat + +namespace +{ + struct Div_8uc4_32f : binary_function<uint, float, uint> + { + __device__ __forceinline__ uint operator ()(uint a, float b) const + { + uint res = 0; + + if (b != 0) + { + b = 1.0f / b; + res |= (saturate_cast<uchar>((0xffu & (a )) * b) ); + res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8); + res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16); + res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24); + } + + return res; + } + }; + + struct Div_16sc4_32f : binary_function<short4, float, short4> { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { @@ -757,425 +1200,905 @@ namespace cv { namespace gpu { namespace device } }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f) + template <typename T, typename D> struct Div : binary_function<T, T, D> + { + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? 
saturate_cast<D>(a / b) : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template <typename T> struct Div<T, float> : binary_function<T, T, float> + { + __device__ __forceinline__ float operator ()(T a, T b) const + { + return b != 0 ? static_cast<float>(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template <typename T> struct Div<T, double> : binary_function<T, T, double> + { + __device__ __forceinline__ double operator ()(T a, T b) const + { + return b != 0 ? static_cast<double>(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + + template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D> + { + S scale; + + explicit DivScale(S scale_) : scale(scale_) {} + + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? saturate_cast<D>(scale * a / b) : 0; + } + }; +} + +namespace cv { namespace gpu { namespace device +{ + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Div_8uc4_32f) { enum { smart_block_dim_x = 8 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 8 }; }; - void divide_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream) + template <> struct TransformFunctorTraits< Div<ushort, ushort> > : DefaultTransformFunctorTraits< Div<ushort, ushort> > { - cv::gpu::device::transform(static_cast< PtrStepSz<short4> >(src1), src2, static_cast< PtrStepSz<short4> >(dst), divide_16sc4_32f(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div<short, short> > : DefaultTransformFunctorTraits< Div<short, short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div<int, int> > : DefaultTransformFunctorTraits< Div<int, int> > + 
{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div<float, float> > : DefaultTransformFunctorTraits< Div<float, float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template <> struct TransformFunctorTraits< DivScale<ushort, float, ushort> > : DefaultTransformFunctorTraits< DivScale<ushort, float, ushort> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale<short, float, short> > : DefaultTransformFunctorTraits< DivScale<short, float, short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale<int, float, int> > : DefaultTransformFunctorTraits< DivScale<int, float, int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale<float, float, float> > : DefaultTransformFunctorTraits< DivScale<float, float, float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream) + { + transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream); } - template <typename T, typename D> struct Divide : binary_function<T, T, D> + void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream) { - Divide(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a, T b) const + transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream); + } + + template <typename T, typename S, typename D> + void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) { - return b != 0 ? 
saturate_cast<D>(a * scale / b) : 0; + Div<T, D> op; + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); + } + else + { + DivScale<T, S, D> op(static_cast<S>(scale)); + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); } - const double scale; - }; - - template <> struct TransformFunctorTraits< Divide<ushort, ushort> > : DefaultTransformFunctorTraits< Divide<ushort, ushort> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide<short, short> > : DefaultTransformFunctorTraits< Divide<short, short> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide<int, int> > : DefaultTransformFunctorTraits< Divide<int, int> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide<float, float> > : DefaultTransformFunctorTraits< Divide<float, float> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <typename T, typename D> void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Divide<T, D> op(scale); - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream); } - template void divide_gpu<uchar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<uchar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, short >(const PtrStepSzb& src1, 
const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu<ushort, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<ushort, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<ushort, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, 
cudaStream_t stream); - template void divide_gpu<ushort, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<ushort, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu<short, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<short, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<short, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - 
template void divide_gpu<short, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<short, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu<int, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<int, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<int, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<int, int >(const 
PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<int, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<float, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, 
double scale, cudaStream_t stream); - //template void divide_gpu<double, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} - template <typename T, typename D> struct DivideScalar : unary_function<T, D> +////////////////////////////////////////////////////////////////////////// +// divScalar + +namespace arithm +{ + template <typename T, typename S, typename D> + void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {} + MulScalar<T, S, D> op(static_cast<S>(1.0 / val)); + transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); + } + + template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<uchar, float, schar>(PtrStepSzb src1, double 
val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + 
template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<float, float, 
schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divInv + +namespace +{ + template <typename T, typename S, typename D> struct DivInv : unary_function<T, D> + { + S val; + + explicit DivInv(double val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { - return saturate_cast<D>(scale * a / val); + return a != 0 ? 
saturate_cast<D>(val / a) : 0; } - const double val; - const double scale; }; +} - template <> struct TransformFunctorTraits< DivideScalar<ushort, ushort> > : DefaultTransformFunctorTraits< DivideScalar<ushort, ushort> > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< DivInv<ushort, float, ushort> > : DefaultTransformFunctorTraits< DivInv<ushort, float, ushort> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar<short, short> > : DefaultTransformFunctorTraits< DivideScalar<short, short> > + template <> struct TransformFunctorTraits< DivInv<short, float, short> > : DefaultTransformFunctorTraits< DivInv<short, float, short> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar<int, int> > : DefaultTransformFunctorTraits< DivideScalar<int, int> > + template <> struct TransformFunctorTraits< DivInv<int, float, int> > : DefaultTransformFunctorTraits< DivInv<int, float, int> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar<float, float> > : DefaultTransformFunctorTraits< DivideScalar<float, float> > + template <> struct TransformFunctorTraits< DivInv<float, float, float> > : DefaultTransformFunctorTraits< DivInv<float, float, float> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template <typename T, typename D> void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template <typename T, typename S, typename D> + void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - DivideScalar<T, D> op(val, scale); - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), 
stream); + DivInv<T, S, D> op(static_cast<S>(val)); + transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); } - template void divide_gpu<uchar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<uchar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu<schar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, 
cudaStream_t stream); - //template void divide_gpu<schar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divInv<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu<ushort, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<ushort, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<ushort, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<ushort, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<ushort, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu<short, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<short, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<short, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<short, int >(const 
PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<short, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu<int, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<int, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<int, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<int, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<int, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<int, 
double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu<float, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<float, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<float, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<float, float, 
schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu<double, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu<double, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<double, double, 
short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} - template <typename T, typename D> struct Reciprocal : unary_function<T, D> +////////////////////////////////////////////////////////////////////////// +// absDiffMat + +namespace +{ + template <typename T, typename D> struct VAbsDiff4; + template <> struct VAbsDiff4<uint, uint> : binary_function<uint, uint, uint> { - Reciprocal(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - return a != 0 ? saturate_cast<D>(scale / a) : 0; + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double scale; + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4<uint, uint>& other) {} + }; + template <> struct VAbsDiff4<int, int> : binary_function<int, int, int> + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif 
__CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4<int, int>& other) {} }; - template <> struct TransformFunctorTraits< Reciprocal<ushort, ushort> > : DefaultTransformFunctorTraits< Reciprocal<ushort, ushort> > + //////////////////////////////////// + + template <typename T, typename D> struct VAbsDiff2; + template <> struct VAbsDiff2<uint, uint> : binary_function<uint, uint, uint> { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2<uint, uint>& other) {} }; - template <> struct TransformFunctorTraits< Reciprocal<short, short> > : DefaultTransformFunctorTraits< Reciprocal<short, short> > + template <> struct VAbsDiff2<int, int> : binary_function<int, int, int> { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal<int, int> > : DefaultTransformFunctorTraits< Reciprocal<int, int> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 
}; - }; - template <> struct TransformFunctorTraits< Reciprocal<float, float> > : DefaultTransformFunctorTraits< Reciprocal<float, float> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2<int, int>& other) {} }; - template <typename T, typename D> void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream) + //////////////////////////////////// + + __device__ __forceinline__ int _abs(int a) { - cudaSafeCall( cudaSetDoubleForDevice(&scalar) ); - Reciprocal<T, D> op(scalar); - cv::gpu::device::transform((PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream); + return ::abs(a); + } + __device__ __forceinline__ float _abs(float a) + { + return ::fabsf(a); + } + __device__ __forceinline__ double _abs(double a) + { + return ::fabs(a); } - template void divide_gpu<uchar, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<uchar, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<uchar, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<uchar, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<uchar, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); 
- template void divide_gpu<uchar, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<uchar, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu<schar, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<schar, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<schar, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<schar, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<schar, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<schar, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<schar, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu<ushort, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<ushort, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<ushort, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<ushort, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<ushort, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<ushort, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<ushort, double>(double scalar, const PtrStepSzb& src2, 
const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu<short, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<short, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<short, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<short, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<short, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<short, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<short, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu<int, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<int, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<int, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<int, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<int, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<int, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<int, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu<float, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<float, schar >(double scalar, 
const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<float, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<float, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<float, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<float, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<float, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu<double, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<double, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<double, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<double, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<double, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu<double, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu<double, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // absdiff - - template <typename T> struct Absdiff : binary_function<T, T, T> + template <typename T> struct AbsDiffMat : binary_function<T, T, T> { - static __device__ __forceinline__ int abs(int a) - { - return ::abs(a); - } - static __device__ __forceinline__ float abs(float a) - { - return ::fabsf(a); - } - 
static __device__ __forceinline__ double abs(double a) - { - return ::fabs(a); - } - __device__ __forceinline__ T operator ()(T a, T b) const { - return saturate_cast<T>(::abs(a - b)); + return saturate_cast<T>(_abs(a - b)); } + + __device__ __forceinline__ AbsDiffMat() {} + __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <typename T, typename D> struct TransformFunctorTraits< VAbsDiff4<T, D> > : DefaultTransformFunctorTraits< VAbsDiff4<T, D> > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< Absdiff<ushort> > : DefaultTransformFunctorTraits< Absdiff<ushort> > + //////////////////////////////////// + + template <typename T, typename D> struct TransformFunctorTraits< VAbsDiff2<T, D> > : DefaultTransformFunctorTraits< VAbsDiff2<T, D> > /* defaults come from this same functor; only smart_shift is overridden */ { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff<short> > : DefaultTransformFunctorTraits< Absdiff<short> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff<int> > : DefaultTransformFunctorTraits< Absdiff<int> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff<float> > : DefaultTransformFunctorTraits< Absdiff<float> > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + enum { smart_shift = 2 }; }; - template <typename T> void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + //////////////////////////////////// + + template <> struct TransformFunctorTraits< AbsDiffMat<ushort> > : DefaultTransformFunctorTraits< AbsDiffMat<ushort> > { - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, Absdiff<T>(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> 
struct TransformFunctorTraits< AbsDiffMat<short> > : DefaultTransformFunctorTraits< AbsDiffMat<short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsDiffMat<int> > : DefaultTransformFunctorTraits< AbsDiffMat<int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsDiffMat<float> > : DefaultTransformFunctorTraits< AbsDiffMat<float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T> + void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T, T>(), WithOutMask(), stream); } - //template void absdiff_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu<int >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template <typename T> struct AbsdiffScalar : unary_function<T, T> + template <typename T> + void vabsDiff2(PtrStepSzb src1, PtrStepSzb 
src2, PtrStepSzb dst, cudaStream_t stream) { - AbsdiffScalar(double val_) : val(val_) {} + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T, T>(), WithOutMask(), stream); + } + + template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template <typename T> + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream); + } + + template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absDiffScalar + +namespace +{ + template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T> + { + S val; + + explicit AbsDiffScalar(S val_) : val(val_) {} + __device__ __forceinline__ T operator ()(T a) const { - return saturate_cast<T>(::fabs(a - val)); + abs_func<S> f; + return saturate_cast<T>(f(a - val)); } - double val; }; +} - template <> struct TransformFunctorTraits< AbsdiffScalar<ushort> > : DefaultTransformFunctorTraits< AbsdiffScalar<ushort> > +namespace cv { namespace gpu { 
namespace device +{ + template <> struct TransformFunctorTraits< AbsDiffScalar<ushort, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<ushort, float> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar<short> > : DefaultTransformFunctorTraits< AbsdiffScalar<short> > + template <> struct TransformFunctorTraits< AbsDiffScalar<short, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<short, float> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar<int> > : DefaultTransformFunctorTraits< AbsdiffScalar<int> > + template <> struct TransformFunctorTraits< AbsDiffScalar<int, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<int, float> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar<float> > : DefaultTransformFunctorTraits< AbsdiffScalar<float> > + template <> struct TransformFunctorTraits< AbsDiffScalar<float, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<float, float> > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template <typename T> void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template <typename T, typename S> + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AbsdiffScalar<T> op(val); - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)dst, op, WithOutMask(), stream); + AbsDiffScalar<T, S> op(static_cast<S>(val)); + + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream); } - //template void absdiff_gpu<uchar >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu<schar >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void 
absdiff_gpu<ushort>(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu<short >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu<int >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu<float >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu<double>(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////////////////// - // Compare +////////////////////////////////////////////////////////////////////////// +// absMat - template <template <typename> class Op, typename T> - struct Compare: binary_function<T, T, uchar> +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< abs_func<ushort> > : DefaultTransformFunctorTraits< abs_func<ushort> > { - __device__ __forceinline__ uchar operator()(T src1, T src2) const + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< abs_func<short> > : 
DefaultTransformFunctorTraits< abs_func<short> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< abs_func<int> > : DefaultTransformFunctorTraits< abs_func<int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< abs_func<float> > : DefaultTransformFunctorTraits< abs_func<float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T> + void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream); + } + + template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// sqrMat + +namespace +{ + template <typename T> struct Sqr : unary_function<T, T> + { + __device__ __forceinline__ T operator ()(T x) const + { + return saturate_cast<T>(x * x); + } + + __device__ __forceinline__ Sqr() {} + __device__ __forceinline__ Sqr(const Sqr& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< Sqr<ushort> > : DefaultTransformFunctorTraits< Sqr<ushort> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Sqr<short> > : DefaultTransformFunctorTraits< Sqr<short> > + 
{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Sqr<int> > : DefaultTransformFunctorTraits< Sqr<int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Sqr<float> > : DefaultTransformFunctorTraits< Sqr<float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T> + void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream); + } + + template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// sqrtMat + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< sqrt_func<uchar> > : DefaultTransformFunctorTraits< sqrt_func<uchar> > /* defaults come from this same functor; only smart_shift is overridden */ + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< sqrt_func<schar> > : DefaultTransformFunctorTraits< sqrt_func<schar> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< sqrt_func<ushort> > : DefaultTransformFunctorTraits< sqrt_func<ushort> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< sqrt_func<short> > : DefaultTransformFunctorTraits< sqrt_func<short> > + { + enum { 
smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< sqrt_func<int> > : DefaultTransformFunctorTraits< sqrt_func<int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< sqrt_func<float> > : DefaultTransformFunctorTraits< sqrt_func<float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T> + void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream); + } + + template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// logMat + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< log_func<uchar> > : DefaultTransformFunctorTraits< log_func<uchar> > /* defaults come from this same functor; only smart_shift is overridden */ + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< log_func<schar> > : DefaultTransformFunctorTraits< log_func<schar> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< log_func<ushort> > : DefaultTransformFunctorTraits< log_func<ushort> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< log_func<short> > : DefaultTransformFunctorTraits< log_func<short> > + { 
+ enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< log_func<int> > : DefaultTransformFunctorTraits< log_func<int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< log_func<float> > : DefaultTransformFunctorTraits< log_func<float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T> + void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream); + } + + template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// expMat + +namespace +{ + template <typename T> struct Exp : unary_function<T, T> + { + __device__ __forceinline__ T operator ()(T x) const + { + exp_func<T> f; + return saturate_cast<T>(f(x)); + } + + __device__ __forceinline__ Exp() {} + __device__ __forceinline__ Exp(const Exp& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< Exp<ushort> > : DefaultTransformFunctorTraits< Exp<ushort> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Exp<short> > : DefaultTransformFunctorTraits< Exp<short> > + { + enum { smart_block_dim_y = 8 }; + 
enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Exp<int> > : DefaultTransformFunctorTraits< Exp<int> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Exp<float> > : DefaultTransformFunctorTraits< Exp<float> > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T> + void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream); + } + + template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////////////////// +// cmpMat + +namespace +{ + template <template <typename> class Op, typename T> + struct Cmp: binary_function<T, T, uchar> + { + __device__ __forceinline__ uchar operator()(T a, T b) const { Op<T> op; - return static_cast<uchar>(static_cast<int>(op(src1, src2)) * 255); + return -op(a, b); } }; +} -#define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \ - template <> struct TransformFunctorTraits< Compare<op, type> > : DefaultTransformFunctorTraits< Compare<op, type> > \ - { \ - enum { smart_block_dim_y = block_dim_y }; \ - enum { smart_shift = shift }; \ - }; +namespace cv { namespace gpu { namespace device +{ + #define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, 
shift) \ + template <> struct TransformFunctorTraits< Cmp<op, type> > : DefaultTransformFunctorTraits< Cmp<op, type> > \ + { \ + enum { smart_block_dim_y = block_dim_y }; \ + enum { smart_shift = shift }; \ + }; IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(equal_to, int, 8, 4) IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(equal_to, float, 8, 4) @@ -1190,132 +2113,136 @@ namespace cv { namespace gpu { namespace device IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, int, 8, 4) IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, float, 8, 4) -#undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS + #undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS +}}} - template <template <typename> class Op, typename T> void compare(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template <template <typename> class Op, typename T> + void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - Compare<Op, T> op; - cv::gpu::device::transform(static_cast< PtrStepSz<T> >(src1), static_cast< PtrStepSz<T> >(src2), dst, op, WithOutMask(), stream); + Cmp<Op, T> op; + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream); } - template <typename T> void compare_eq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - compare<equal_to, T>(src1, src2, dst, stream); + cmpMat<equal_to, T>(src1, src2, dst, stream); } - template <typename T> void compare_ne(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - compare<not_equal_to, T>(src1, src2, dst, stream); + cmpMat<not_equal_to, T>(src1, src2, dst, stream); } - template <typename T> void compare_lt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + template <typename 
T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - compare<less, T>(src1, src2, dst, stream); + cmpMat<less, T>(src1, src2, dst, stream); } - template <typename T> void compare_le(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - compare<less_equal, T>(src1, src2, dst, stream); + cmpMat<less_equal, T>(src1, src2, dst, stream); } - template void compare_eq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatEq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template 
void compare_ne<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t 
stream); - template void compare_lt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_le<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_le<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_le<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_le<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_le<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_le<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void compare_le<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, 
cudaStream_t stream); + template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} +////////////////////////////////////////////////////////////////////////////////////// +// cmpScalar + +namespace +{ #define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type - template <template <typename> class Op, typename T, int cn> struct CompareScalar; + template <template <typename> class Op, typename T, int cn> struct CmpScalar; template <template <typename> class Op, typename T> - struct CompareScalar<Op, T, 1>: unary_function<T, uchar> + struct CmpScalar<Op, T, 1> : unary_function<T, uchar> { const T val; - __host__ explicit CompareScalar(T val_) : val(val_) {} + __host__ explicit CmpScalar(T val_) : val(val_) {} __device__ __forceinline__ uchar operator()(T src) const { - Op<T> op; - return static_cast<uchar>(static_cast<int>(op(src, val)) * 255); + Cmp<Op, T> op; + return op(src, val); } }; template <template <typename> class Op, typename T> - struct CompareScalar<Op, T, 2>: unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)> + struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)> { const TYPE_VEC(T, 2) val; - __host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {} + __host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {} __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const { - Op<T> op; - return VecTraits<TYPE_VEC(uchar, 2)>::make( - static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255), - static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255)); + Cmp<Op, T> op; + return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, 
val.x), op(src.y, val.y)); } }; template <template <typename> class Op, typename T> - struct CompareScalar<Op, T, 3>: unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)> + struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)> { const TYPE_VEC(T, 3) val; - __host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {} + __host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {} __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const { - Op<T> op; - return VecTraits<TYPE_VEC(uchar, 3)>::make( - static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255), - static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255), - static_cast<uchar>(static_cast<int>(op(src.z, val.z)) * 255)); + Cmp<Op, T> op; + return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z)); } }; template <template <typename> class Op, typename T> - struct CompareScalar<Op, T, 4>: unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)> + struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)> { const TYPE_VEC(T, 4) val; - __host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {} + __host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {} __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const { - Op<T> op; - return VecTraits<TYPE_VEC(uchar, 4)>::make( - static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255), - static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255), - static_cast<uchar>(static_cast<int>(op(src.z, val.z)) * 255), - static_cast<uchar>(static_cast<int>(op(src.w, val.w)) * 255)); + Cmp<Op, T> op; + return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w)); } }; #undef TYPE_VEC +} +namespace cv { namespace gpu { namespace device +{ #define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \ - template <> struct 
TransformFunctorTraits< CompareScalar<op, type, 1> > : DefaultTransformFunctorTraits< CompareScalar<op, type, 1> > \ + template <> struct TransformFunctorTraits< CmpScalar<op, type, 1> > : DefaultTransformFunctorTraits< CmpScalar<op, type, 1> > \ { \ enum { smart_block_dim_y = block_dim_y }; \ enum { smart_shift = shift }; \ @@ -1335,8 +2262,12 @@ namespace cv { namespace gpu { namespace device IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, float, 8, 4) #undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS +}}} - template <template <typename> class Op, typename T, int cn> void compare(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template <template <typename> class Op, typename T, int cn> + void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream) { typedef typename TypeVec<T, cn>::vec_type src_t; typedef typename TypeVec<uchar, cn>::vec_type dst_t; @@ -1344,505 +2275,721 @@ namespace cv { namespace gpu { namespace device T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])}; src_t val1 = VecTraits<src_t>::make(sval); - CompareScalar<Op, T, cn> op(val1); - - cv::gpu::device::transform(static_cast< PtrStepSz<src_t> >(src), static_cast< PtrStepSz<dst_t> >(dst), op, WithOutMask(), stream); + CmpScalar<Op, T, cn> op(val1); + transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream); } - template <typename T> void compare_eq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) { typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { 0, - compare<equal_to, T, 1>, - compare<equal_to, T, 2>, - compare<equal_to, T, 3>, - compare<equal_to, T, 4> + cmpScalar<equal_to, T, 1>, + cmpScalar<equal_to, T, 2>, + 
cmpScalar<equal_to, T, 3>, + cmpScalar<equal_to, T, 4> }; funcs[cn](src, val, dst, stream); } - template <typename T> void compare_ne(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) { typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { 0, - compare<not_equal_to, T, 1>, - compare<not_equal_to, T, 2>, - compare<not_equal_to, T, 3>, - compare<not_equal_to, T, 4> + cmpScalar<not_equal_to, T, 1>, + cmpScalar<not_equal_to, T, 2>, + cmpScalar<not_equal_to, T, 3>, + cmpScalar<not_equal_to, T, 4> }; funcs[cn](src, val, dst, stream); } - template <typename T> void compare_lt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) { typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { 0, - compare<less, T, 1>, - compare<less, T, 2>, - compare<less, T, 3>, - compare<less, T, 4> + cmpScalar<less, T, 1>, + cmpScalar<less, T, 2>, + cmpScalar<less, T, 3>, + cmpScalar<less, T, 4> }; funcs[cn](src, val, dst, stream); } - template <typename T> void compare_le(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) { typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { 0, - compare<less_equal, T, 1>, - compare<less_equal, T, 2>, - compare<less_equal, T, 3>, - compare<less_equal, T, 4> + cmpScalar<less_equal, T, 1>, + cmpScalar<less_equal, T, 2>, + cmpScalar<less_equal, T, 3>, + cmpScalar<less_equal, T, 4> }; funcs[cn](src, val, dst, stream); } - template <typename 
T> void compare_gt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) { typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { 0, - compare<greater, T, 1>, - compare<greater, T, 2>, - compare<greater, T, 3>, - compare<greater, T, 4> + cmpScalar<greater, T, 1>, + cmpScalar<greater, T, 2>, + cmpScalar<greater, T, 3>, + cmpScalar<greater, T, 4> }; funcs[cn](src, val, dst, stream); } - template <typename T> void compare_ge(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) + template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) { typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { 0, - compare<greater_equal, T, 1>, - compare<greater_equal, T, 2>, - compare<greater_equal, T, 3>, - compare<greater_equal, T, 4> + cmpScalar<greater_equal, T, 1>, + cmpScalar<greater_equal, T, 2>, + cmpScalar<greater_equal, T, 3>, + cmpScalar<greater_equal, T, 4> }; funcs[cn](src, val, dst, stream); } - template void compare_eq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_eq<double>(PtrStepSzb src, int cn, double 
val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarEq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ne<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe<short 
>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_lt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template 
void compare_le<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_le<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_le<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_le<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_le<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_le<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_le<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_gt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_gt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_gt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_gt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t 
stream); - template void compare_gt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_gt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_gt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ge<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ge<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ge<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ge<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ge<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ge<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template void compare_ge<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb 
dst, cudaStream_t stream); + template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // Unary bitwise logical matrix operations +////////////////////////////////////////////////////////////////////////////////////// +// bitMat - enum { UN_OP_NOT }; - - template <typename T, int opid> - struct UnOp; - - template <typename T> - struct UnOp<T, UN_OP_NOT> +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< bit_not<uchar> > : DefaultTransformFunctorTraits< bit_not<uchar> > { - static __device__ __forceinline__ T call(T v) { return ~v; } + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< bit_not<ushort> > : DefaultTransformFunctorTraits< bit_not<ushort> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< bit_not<uint> > : DefaultTransformFunctorTraits< bit_not<uint> > + { + enum { smart_shift = 2 }; }; - - template <int opid> - __global__ void bitwiseUnOpKernel(int rows, int width, const PtrStepb src, PtrStepb dst) + template <> struct TransformFunctorTraits< bit_and<uchar> > : DefaultTransformFunctorTraits< bit_and<uchar> > { - const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + enum { smart_shift = 4 }; + }; + 
template <> struct TransformFunctorTraits< bit_and<ushort> > : DefaultTransformFunctorTraits< bit_and<ushort> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< bit_and<uint> > : DefaultTransformFunctorTraits< bit_and<uint> > + { + enum { smart_shift = 2 }; + }; - if (y < rows) + template <> struct TransformFunctorTraits< bit_or<uchar> > : DefaultTransformFunctorTraits< bit_or<uchar> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< bit_or<ushort> > : DefaultTransformFunctorTraits< bit_or<ushort> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< bit_or<uint> > : DefaultTransformFunctorTraits< bit_or<uint> > + { + enum { smart_shift = 2 }; + }; + + template <> struct TransformFunctorTraits< bit_xor<uchar> > : DefaultTransformFunctorTraits< bit_xor<uchar> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< bit_xor<ushort> > : DefaultTransformFunctorTraits< bit_xor<ushort> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< bit_xor<uint> > : DefaultTransformFunctorTraits< bit_xor<uint> > + { + enum { smart_shift = 2 }; + }; +}}} + +namespace arithm +{ + template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream); + else + transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream); + } + + template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream); + else + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream); + } + + template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb 
dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream); + else + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream); + } + + template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream); + else + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream); + } + + template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////////////////// +// bitScalar + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< binder2nd< bit_and<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<uchar> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< bit_and<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<ushort> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< bit_and<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<uint> > > + { + enum { smart_shift = 2 }; + }; + + template <> struct TransformFunctorTraits< binder2nd< bit_or<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<uchar> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< bit_or<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<ushort> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< bit_or<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<uint> > > + { + enum { smart_shift = 2 }; + }; + + template <> struct TransformFunctorTraits< binder2nd< bit_xor<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<uchar> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< bit_xor<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<ushort> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< bit_xor<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<uint> > > + { + enum { smart_shift = 2 }; + }; +}}} + +namespace arithm +{ + template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, 
cv::gpu::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream); + } + + template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream); + } + + template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream); + } + + template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarAnd<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); + + template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarOr<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); + + template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarXor<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// min + +namespace +{ + template <typename T> struct VMin4; + template <> struct VMin4<uint> : binary_function<uint, uint, uint> + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - uchar* dst_ptr = dst.ptr(y) + x; - const uchar* src_ptr = src.ptr(y) + x; - if (x + sizeof(uint) - 1 < width) - { - *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr); - } - else - { - const uchar* src_end = src.ptr(y) + width; - 
while (src_ptr < src_end) - { - *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++); - } - } + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - } - - template <int opid> - void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst, - cudaStream_t stream) + __device__ __forceinline__ VMin4() {} + __device__ __forceinline__ VMin4(const VMin4& other) {} + }; + template <> struct VMin4<int> : binary_function<int, int, int> { - dim3 threads(16, 16); - dim3 grid(divUp(width, threads.x * sizeof(uint)), - divUp(rows, threads.y)); - - bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - - template <typename T, int opid> - __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src, - const PtrStepb mask, PtrStepb dst) - { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (x < cols && y < rows && mask.ptr(y)[x / cn]) + __device__ __forceinline__ int operator ()(int a, int b) const { - T* dst_row = (T*)dst.ptr(y); - const T* src_row = (const T*)src.ptr(y); + int res = 0; - dst_row[x] = UnOp<T, opid>::call(src_row[x]); + #if __CUDA_ARCH__ >= 300 + asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmin.s32.s32.s32 
%0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - } - - template <typename T, int opid> - void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src, - const PtrStepb mask, PtrStepb dst, cudaStream_t stream) - { - dim3 threads(16, 16); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - - void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, - const PtrStepb src, PtrStepb dst, cudaStream_t stream) - { - bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream); - } - - - template <typename T> - void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, - const PtrStepb mask, PtrStepb dst, cudaStream_t stream) - { - bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream); - } - - template void bitwiseMaskNotCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - template void bitwiseMaskNotCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - - - ////////////////////////////////////////////////////////////////////////// - // Binary bitwise logical matrix operations - - enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR }; - - template <typename T, int opid> - struct BinOp; - - template <typename T> - struct BinOp<T, BIN_OP_OR> - { - static __device__ __forceinline__ T call(T a, T b) { return a | b; } + __device__ __forceinline__ VMin4() {} + __device__ __forceinline__ VMin4(const VMin4& other) {} 
}; + //////////////////////////////////// - template <typename T> - struct BinOp<T, BIN_OP_AND> + template <typename T> struct VMin2; + template <> struct VMin2<uint> : binary_function<uint, uint, uint> { - static __device__ __forceinline__ T call(T a, T b) { return a & b; } - }; - - template <typename T> - struct BinOp<T, BIN_OP_XOR> - { - static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } - }; - - - template <int opid> - __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1, - const PtrStepb src2, PtrStepb dst) - { - const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (y < rows) + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - uchar* dst_ptr = dst.ptr(y) + x; - const uchar* src1_ptr = src1.ptr(y) + x; - const uchar* src2_ptr = src2.ptr(y) + x; + uint res = 0; - if (x + sizeof(uint) - 1 < width) - { - *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr); - } - else - { - const uchar* src1_end = src1.ptr(y) + width; - while (src1_ptr < src1_end) - { - *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++); - } - } + #if __CUDA_ARCH__ >= 300 + asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - } - - template <int opid> - void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2, - PtrStepb dst, cudaStream_t stream) + __device__ __forceinline__ VMin2() {} + __device__ __forceinline__ VMin2(const VMin2& other) {} + }; + template <> struct VMin2<int> : binary_function<int, int, int> { - dim3 threads(16, 16); - dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y)); - - bitwiseBinOpKernel<opid><<<grid, 
threads>>>(rows, width, src1, src2, dst); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - - template <typename T, int opid> - __global__ void bitwiseBinOpKernel( - int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, - const PtrStepb mask, PtrStepb dst) - { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (x < cols && y < rows && mask.ptr(y)[x / cn]) + __device__ __forceinline__ int operator ()(int a, int b) const { - T* dst_row = (T*)dst.ptr(y); - const T* src1_row = (const T*)src1.ptr(y); - const T* src2_row = (const T*)src2.ptr(y); + int res = 0; - dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]); + #if __CUDA_ARCH__ >= 300 + asm("vmin2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmin.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } + + __device__ __forceinline__ VMin2() {} + __device__ __forceinline__ VMin2(const VMin2& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <typename T> struct TransformFunctorTraits< VMin4<T> > : DefaultTransformFunctorTraits< VMin4<T> > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + + //////////////////////////////////// + + template <typename T> struct TransformFunctorTraits< VMin2<T> > : DefaultTransformFunctorTraits< VMin2<T> > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + + //////////////////////////////////// + + template <> struct TransformFunctorTraits< minimum<ushort> > : DefaultTransformFunctorTraits< minimum<ushort> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< minimum<short> > : DefaultTransformFunctorTraits< minimum<short> > + { + enum { 
smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< minimum<int> > : DefaultTransformFunctorTraits< minimum<int> > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< minimum<float> > : DefaultTransformFunctorTraits< minimum<float> > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + + template <> struct TransformFunctorTraits< binder2nd< minimum<ushort> > > : DefaultTransformFunctorTraits< binder2nd< minimum<ushort> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< minimum<short> > > : DefaultTransformFunctorTraits< binder2nd< minimum<short> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< minimum<int> > > : DefaultTransformFunctorTraits< binder2nd< minimum<int> > > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< minimum<float> > > : DefaultTransformFunctorTraits< binder2nd< minimum<float> > > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin4<T>(), WithOutMask(), stream); } - - template <typename T, int opid> - void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, - const PtrStepb mask, PtrStepb dst, cudaStream_t stream) + template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - dim3 threads(16, 16); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + 
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin2<T>(), WithOutMask(), stream); } - - void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, - const PtrStepb src2, PtrStepb dst, cudaStream_t stream) + template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream); + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream); } + template void vmin4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vmin4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template <typename T> - void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, - const PtrStepb mask, PtrStepb dst, cudaStream_t stream) + template void vmin2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vmin2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void minMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream) { - 
bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream); + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream); } - template void bitwiseMaskOrCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); + template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void minScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} +////////////////////////////////////////////////////////////////////////// +// max - void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, - const PtrStepb src2, PtrStepb dst, cudaStream_t stream) +namespace +{ + template <typename T> struct VMax4; + template <> struct VMax4<uint> : binary_function<uint, uint, uint> { - bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream); - } - - - template <typename T> - void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, - const PtrStepb mask, PtrStepb dst, cudaStream_t stream) - { - bitwiseBinOp<T, BIN_OP_AND>(rows, 
cols * cn, cn, src1, src2, mask, dst, stream); - } - - template void bitwiseMaskAndCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - - - void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, - const PtrStepb src2, PtrStepb dst, cudaStream_t stream) - { - bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream); - } - - - template <typename T> - void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, - const PtrStepb mask, PtrStepb dst, cudaStream_t stream) - { - bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream); - } - - template void bitwiseMaskXorCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - - ////////////////////////////////////////////////////////////////////////// - // min/max - - namespace detail - { - template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F> + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - }; - template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F> + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), 
"r"(res)); + asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VMax4() {} + __device__ __forceinline__ VMax4(const VMax4& other) {} + }; + template <> struct VMax4<int> : binary_function<int, int, int> + { + __device__ __forceinline__ int operator ()(int a, int b) const { - enum { smart_shift = 4 }; - }; - template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F> + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VMax4() {} + __device__ __forceinline__ VMax4(const VMax4& other) {} + }; + + //////////////////////////////////// + + template <typename T> struct VMax2; + template <> struct VMax2<uint> : binary_function<uint, uint, uint> + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - enum { smart_block_dim_y = 4 }; - enum { smart_shift = 4 }; - }; - } + uint res = 0; - template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> > - { + #if __CUDA_ARCH__ >= 300 + asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + 
asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VMax2() {} + __device__ __forceinline__ VMax2(const VMax2& other) {} }; - template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> > + template <> struct VMax2<int> : binary_function<int, int, int> { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmax2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vmax.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VMax2() {} + __device__ __forceinline__ VMax2(const VMax2& other) {} }; - template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > > - { - }; - template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > > +} + +namespace cv { namespace gpu { namespace device +{ + template <typename T> struct TransformFunctorTraits< VMax4<T> > : DefaultTransformFunctorTraits< VMax4<T> > { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; }; - template <typename T> - void min_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + //////////////////////////////////// + + template <typename T> struct TransformFunctorTraits< VMax2<T> > : DefaultTransformFunctorTraits< VMax2<T> > { - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, minimum<T>(), WithOutMask(), stream); + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + + //////////////////////////////////// + + template <> 
struct TransformFunctorTraits< maximum<ushort> > : DefaultTransformFunctorTraits< maximum<ushort> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< maximum<short> > : DefaultTransformFunctorTraits< maximum<short> > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< maximum<int> > : DefaultTransformFunctorTraits< maximum<int> > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< maximum<float> > : DefaultTransformFunctorTraits< maximum<float> > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + + template <> struct TransformFunctorTraits< binder2nd< maximum<ushort> > > : DefaultTransformFunctorTraits< binder2nd< maximum<ushort> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< maximum<short> > > : DefaultTransformFunctorTraits< binder2nd< maximum<short> > > + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< maximum<int> > > : DefaultTransformFunctorTraits< binder2nd< maximum<int> > > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< binder2nd< maximum<float> > > : DefaultTransformFunctorTraits< binder2nd< maximum<float> > > + { + enum { smart_block_dim_y = 4 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax4<T>(), WithOutMask(), stream); } - template void min_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, 
cudaStream_t stream); - template void min_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<int >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - - template <typename T> - void max_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, maximum<T>(), WithOutMask(), stream); + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax2<T>(), WithOutMask(), stream); } - template void max_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<int >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - - template <typename T> - void min_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream) + template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - 
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, device::bind2nd(minimum<T>(), val), WithOutMask(), stream); + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream); } - template void min_gpu<uchar >(const PtrStepSzb src, uchar val, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<schar >(const PtrStepSzb src, schar val, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<ushort>(const PtrStepSzb src, ushort val, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<short >(const PtrStepSzb src, short val, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<int >(const PtrStepSzb src, int val, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<float >(const PtrStepSzb src, float val, PtrStepSzb dst, cudaStream_t stream); - template void min_gpu<double>(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream); + template void vmax4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vmax4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template <typename T> - void max_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream) + template void vmax2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vmax2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void maxMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void 
maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream) { - cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, device::bind2nd(maximum<T>(), val), WithOutMask(), stream); + transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(maximum<T>(), src2), WithOutMask(), stream); } - template void max_gpu<uchar >(const PtrStepSzb src, uchar val, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<schar >(const PtrStepSzb src, schar val, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<ushort>(const PtrStepSzb src, ushort val, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<short >(const PtrStepSzb src, short val, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<int >(const PtrStepSzb src, int val, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<float >(const PtrStepSzb src, float val, PtrStepSzb dst, cudaStream_t stream); - template void max_gpu<double>(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} - 
////////////////////////////////////////////////////////////////////////// - // threshold +////////////////////////////////////////////////////////////////////////// +// threshold +namespace cv { namespace gpu { namespace device +{ namespace detail { template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F> @@ -1874,19 +3021,21 @@ namespace cv { namespace gpu { namespace device template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> > { }; +}}} +namespace arithm +{ template <template <typename> class Op, typename T> - void threshold_caller(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, T thresh, T maxVal, cudaStream_t stream) + void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream) { Op<T> op(thresh, maxVal); - cv::gpu::device::transform(src, dst, op, WithOutMask(), stream); + transform(src, dst, op, WithOutMask(), stream); } template <typename T> - void threshold_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, T thresh, T maxVal, int type, - cudaStream_t stream) + void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream) { - typedef void (*caller_t)(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, T thresh, T maxVal, cudaStream_t stream); + typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream); static const caller_t callers[] = { @@ -1897,23 +3046,26 @@ namespace cv { namespace gpu { namespace device threshold_caller<thresh_to_zero_inv_func, T> }; - callers[type]((PtrStepSz<T>)src, (PtrStepSz<T>)dst, thresh, maxVal, stream); + callers[type]((PtrStepSz<T>) src, (PtrStepSz<T>) dst, static_cast<T>(thresh), static_cast<T>(maxVal), stream); } - template void threshold_gpu<uchar>(const PtrStepSzb& src, const PtrStepSzb& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream); - template void 
threshold_gpu<schar>(const PtrStepSzb& src, const PtrStepSzb& dst, schar thresh, schar maxVal, int type, cudaStream_t stream); - template void threshold_gpu<ushort>(const PtrStepSzb& src, const PtrStepSzb& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream); - template void threshold_gpu<short>(const PtrStepSzb& src, const PtrStepSzb& dst, short thresh, short maxVal, int type, cudaStream_t stream); - template void threshold_gpu<int>(const PtrStepSzb& src, const PtrStepSzb& dst, int thresh, int maxVal, int type, cudaStream_t stream); - template void threshold_gpu<float>(const PtrStepSzb& src, const PtrStepSzb& dst, float thresh, float maxVal, int type, cudaStream_t stream); - template void threshold_gpu<double>(const PtrStepSzb& src, const PtrStepSzb& dst, double thresh, double maxVal, int type, cudaStream_t stream); + template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); + template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); + template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); + template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); + template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); + template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); + template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // pow +////////////////////////////////////////////////////////////////////////// +// pow - template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : 
unary_function<T, T> +namespace +{ + template<typename T, bool Signed = numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T> { - const float power; + float power; PowOp(double power_) : power(static_cast<float>(power_)) {} @@ -1924,7 +3076,7 @@ namespace cv { namespace gpu { namespace device }; template<typename T> struct PowOp<T, true> : unary_function<T, T> { - const float power; + float power; PowOp(double power_) : power(static_cast<float>(power_)) {} @@ -1951,7 +3103,7 @@ namespace cv { namespace gpu { namespace device }; template<> struct PowOp<double> : unary_function<double, double> { - const double power; + double power; PowOp(double power_) : power(power_) {} @@ -1960,7 +3112,10 @@ namespace cv { namespace gpu { namespace device return ::pow(::fabs(e), power); } }; +} +namespace cv { namespace gpu { namespace device +{ namespace detail { template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> > @@ -1985,83 +3140,78 @@ namespace cv { namespace gpu { namespace device template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T> { }; +}}} +namespace arithm +{ template<typename T> - void pow_caller(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream) + void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream) { - cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, PowOp<T>(power), WithOutMask(), stream); + transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream); } - template void pow_caller<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); - template void pow_caller<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); - template void pow_caller<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); - template void pow_caller<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); - template void 
pow_caller<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); - template void pow_caller<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); - template void pow_caller<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // addWeighted +////////////////////////////////////////////////////////////////////////// +// addWeighted - namespace detail +namespace +{ + template <typename T> struct UseDouble_ { - template <typename T> struct UseDouble - { - enum {value = 0}; - }; - template <> struct UseDouble<int> - { - enum {value = 1}; - }; - template <> struct UseDouble<float> - { - enum {value = 1}; - }; - template <> struct UseDouble<double> - { - enum {value = 1}; - }; - } + enum {value = 0}; + }; + template <> struct UseDouble_<double> + { + enum {value = 1}; + }; template <typename T1, typename T2, typename D> struct UseDouble { - enum {value = (detail::UseDouble<T1>::value || detail::UseDouble<T2>::value || detail::UseDouble<D>::value)}; + enum {value = (UseDouble_<T1>::value || UseDouble_<T2>::value || UseDouble_<D>::value)}; }; - namespace detail + template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted_; + template <typename T1, 
typename T2, typename D> struct AddWeighted_<T1, T2, D, false> : binary_function<T1, T2, D> { - template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted; - template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, false> : binary_function<T1, T2, D> + float alpha; + float beta; + float gamma; + + AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {} + + __device__ __forceinline__ D operator ()(T1 a, T2 b) const { - AddWeighted(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {} - - __device__ __forceinline__ D operator ()(T1 a, T2 b) const - { - return saturate_cast<D>(a * alpha + b * beta + gamma); - } - - const float alpha; - const float beta; - const float gamma; - }; - template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, true> : binary_function<T1, T2, D> - { - AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {} - - __device__ __forceinline__ D operator ()(T1 a, T2 b) const - { - return saturate_cast<D>(a * alpha + b * beta + gamma); - } - - const double alpha; - const double beta; - const double gamma; - }; - } - template <typename T1, typename T2, typename D> struct AddWeighted : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value> - { - AddWeighted(double alpha_, double beta_, double gamma_) : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {} + return saturate_cast<D>(a * alpha + b * beta + gamma); + } }; + template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, true> : binary_function<T1, T2, D> + { + double alpha; + double beta; + double gamma; + AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {} + + __device__ 
__forceinline__ D operator ()(T1 a, T2 b) const + { + return saturate_cast<D>(a * alpha + b * beta + gamma); + } + }; + template <typename T1, typename T2, typename D> struct AddWeighted : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value> + { + AddWeighted(double alpha_, double beta_, double gamma_) : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > { enum { smart_shift = 4 }; @@ -2117,257 +3267,253 @@ namespace cv { namespace gpu { namespace device enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} +namespace arithm +{ template <typename T1, typename T2, typename D> - void addWeighted_gpu(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream) + void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream) { - if (UseDouble<T1, T2, D>::value) - { - cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); - cudaSafeCall( cudaSetDoubleForDevice(&beta) ); - cudaSafeCall( cudaSetDoubleForDevice(&gamma) ); - } - AddWeighted<T1, T2, D> op(alpha, beta, gamma); - cv::gpu::device::transform(static_cast< PtrStepSz<T1> >(src1), static_cast< PtrStepSz<T2> >(src2), static_cast< PtrStepSz<D> >(dst), op, WithOutMask(), stream); + transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); } - template void addWeighted_gpu<uchar, uchar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, uchar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t 
stream); - template void addWeighted_gpu<uchar, uchar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, uchar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, uchar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, uchar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, uchar, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, uchar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, uchar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, uchar, double>(PtrStepSzb src1, double alpha, 
PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, schar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, schar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, schar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, schar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, schar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, schar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, schar, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<uchar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double 
gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<uchar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t 
stream); + template void addWeighted<uchar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, short, float>(const PtrStepSzb& src1, 
double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<uchar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template 
void addWeighted_gpu<uchar, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<uchar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const 
PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<uchar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void 
addWeighted<uchar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<uchar, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<uchar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, double, 
ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<schar, schar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, schar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, schar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, schar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, schar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, schar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, schar, double>(const PtrStepSzb& src1, double alpha, const 
PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<schar, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void 
addWeighted_gpu<schar, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<schar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<schar, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double 
beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<schar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void 
addWeighted<schar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<schar, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<schar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, int, short>(PtrStepSzb src1, double 
alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<schar, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<schar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double 
gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<schar, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, 
double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<schar, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<schar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double 
gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, 
short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<ushort, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, short, int>(PtrStepSzb src1, 
double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<ushort, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, 
double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void 
addWeighted_gpu<ushort, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<ushort, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, 
double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<ushort, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<ushort, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<short, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void 
addWeighted_gpu<short, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, short, float>(PtrStepSzb src1, double alpha, 
PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<short, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<short, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, 
cudaStream_t stream); + template void addWeighted<short, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<short, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void 
addWeighted<short, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<short, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, 
double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<short, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<short, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<int, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, int, ushort>(const 
PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void 
addWeighted_gpu<int, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<int, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, float, int>(PtrStepSzb src1, double alpha, 
PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<int, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<int, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<int, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb 
dst, cudaStream_t stream); + template void addWeighted<int, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<float, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, float, 
double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<float, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const 
PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<float, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); + template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); - template void addWeighted_gpu<double, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<double, double, schar>(const 
PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<double, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<double, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<double, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<double, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); - template void addWeighted_gpu<double, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); -}}} // namespace cv { namespace gpu { namespace device + template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<double, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<double, double, 
float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); +} -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index 2a22b2ffb..1943b315d 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -99,58 +99,6 @@ namespace template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; }; template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; }; - template <int DEPTH> struct NppArithmFunc - { - typedef typename NppTypeTraits<DEPTH>::npp_t npp_t; - - typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); - }; - template <> struct NppArithmFunc<CV_32F> - { - typedef NppTypeTraits<CV_32F>::npp_t npp_t; - - typedef NppStatus (*func_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); - }; - - template <int DEPTH, typename NppArithmFunc<DEPTH>::func_t func> struct NppArithm - { - typedef typename NppArithmFunc<DEPTH>::npp_t npp_t; - - static void call(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream) - { - NppStreamHandler h(stream); - - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; - - nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step), - (npp_t*)dst.data, static_cast<int>(dst.step), sz, 0) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template <typename NppArithmFunc<CV_32F>::func_t 
func> struct NppArithm<CV_32F, func> - { - typedef typename NppArithmFunc<CV_32F>::npp_t npp_t; - - static void call(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream) - { - NppStreamHandler h(stream); - - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; - - nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step), - (npp_t*)dst.data, static_cast<int>(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template<int DEPTH, int cn> struct NppArithmScalarFunc { typedef typename NppTypeTraits<DEPTH>::npp_t npp_t; @@ -313,87 +261,294 @@ namespace //////////////////////////////////////////////////////////////////////// // add -namespace cv { namespace gpu { namespace device +namespace arithm { template <typename T, typename D> - void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); template <typename T, typename D> - void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); -}}} + void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template <typename T, typename D> + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); static const func_t funcs[7][7] = { - {add_gpu<unsigned char, unsigned char> , 0 
/*add_gpu<unsigned char, signed char>*/ , add_gpu<unsigned char, unsigned short> , add_gpu<unsigned char, short> , add_gpu<unsigned char, int> , add_gpu<unsigned char, float> , add_gpu<unsigned char, double> }, - {0 /*add_gpu<signed char, unsigned char>*/ , 0 /*add_gpu<signed char, signed char>*/ , 0 /*add_gpu<signed char, unsigned short>*/, 0 /*add_gpu<signed char, short>*/ , 0 /*add_gpu<signed char, int>*/, 0 /*add_gpu<signed char, float>*/, 0 /*add_gpu<signed char, double>*/}, - {0 /*add_gpu<unsigned short, unsigned char>*/, 0 /*add_gpu<unsigned short, signed char>*/, add_gpu<unsigned short, unsigned short> , 0 /*add_gpu<unsigned short, short>*/, add_gpu<unsigned short, int> , add_gpu<unsigned short, float> , add_gpu<unsigned short, double> }, - {0 /*add_gpu<short, unsigned char>*/ , 0 /*add_gpu<short, signed char>*/ , 0 /*add_gpu<short, unsigned short>*/ , add_gpu<short, short> , add_gpu<short, int> , add_gpu<short, float> , add_gpu<short, double> }, - {0 /*add_gpu<int, unsigned char>*/ , 0 /*add_gpu<int, signed char>*/ , 0 /*add_gpu<int, unsigned short>*/ , 0 /*add_gpu<int, short>*/ , add_gpu<int, int> , add_gpu<int, float> , add_gpu<int, double> }, - {0 /*add_gpu<float, unsigned char>*/ , 0 /*add_gpu<float, signed char>*/ , 0 /*add_gpu<float, unsigned short>*/ , 0 /*add_gpu<float, short>*/ , 0 /*add_gpu<float, int>*/ , add_gpu<float, float> , add_gpu<float, double> }, - {0 /*add_gpu<double, unsigned char>*/ , 0 /*add_gpu<double, signed char>*/ , 0 /*add_gpu<double, unsigned short>*/ , 0 /*add_gpu<double, short>*/ , 0 /*add_gpu<double, int>*/ , 0 /*add_gpu<double, float>*/ , add_gpu<double, double> } + { + addMat<unsigned char, unsigned char>, + addMat<unsigned char, signed char>, + addMat<unsigned char, unsigned short>, + addMat<unsigned char, short>, + addMat<unsigned char, int>, + addMat<unsigned char, float>, + addMat<unsigned char, double> + }, + { + addMat<signed char, unsigned char>, + addMat<signed char, signed char>, + addMat<signed char, unsigned 
short>, + addMat<signed char, short>, + addMat<signed char, int>, + addMat<signed char, float>, + addMat<signed char, double> + }, + { + 0 /*addMat<unsigned short, unsigned char>*/, + 0 /*addMat<unsigned short, signed char>*/, + addMat<unsigned short, unsigned short>, + addMat<unsigned short, short>, + addMat<unsigned short, int>, + addMat<unsigned short, float>, + addMat<unsigned short, double> + }, + { + 0 /*addMat<short, unsigned char>*/, + 0 /*addMat<short, signed char>*/, + addMat<short, unsigned short>, + addMat<short, short>, + addMat<short, int>, + addMat<short, float>, + addMat<short, double> + }, + { + 0 /*addMat<int, unsigned char>*/, + 0 /*addMat<int, signed char>*/, + 0 /*addMat<int, unsigned short>*/, + 0 /*addMat<int, short>*/, + addMat<int, int>, + addMat<int, float>, + addMat<int, double> + }, + { + 0 /*addMat<float, unsigned char>*/, + 0 /*addMat<float, signed char>*/, + 0 /*addMat<float, unsigned short>*/, + 0 /*addMat<float, short>*/, + 0 /*addMat<float, int>*/, + addMat<float, float>, + addMat<float, double> + }, + { + 0 /*addMat<double, unsigned char>*/, + 0 /*addMat<double, signed char>*/, + 0 /*addMat<double, unsigned short>*/, + 0 /*addMat<double, short>*/, + 0 /*addMat<double, int>*/, + 0 /*addMat<double, float>*/, + addMat<double, double> + } }; - typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); - static const npp_func_t npp_funcs[] = + typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const vfunc_t vfuncs4[4][4] = { - NppArithm<CV_8U , nppiAdd_8u_C1RSfs >::call, - 0, - NppArithm<CV_16U, nppiAdd_16u_C1RSfs>::call, - NppArithm<CV_16S, nppiAdd_16s_C1RSfs>::call, - NppArithm<CV_32S, nppiAdd_32s_C1RSfs>::call, - NppArithm<CV_32F, nppiAdd_32f_C1R >::call + { + vadd4<unsigned int, unsigned int>, + vadd4<unsigned int, int>, + 0, + 0 + }, + { + vadd4<int, unsigned int>, + vadd4<int, int>, + 0, + 0 + }, + { + 0, + 0, + 0, + 0 + }, + { + 
0, + 0, + 0, + 0 + } + }; + static const vfunc_t vfuncs2[4][4] = + { + { + 0, + 0, + 0, + 0 + }, + { + 0, + 0, + 0, + 0 + }, + { + 0, + 0, + vadd2<unsigned int, unsigned int>, + vadd2<unsigned int, int> + }, + { + 0, + 0, + vadd2<int, unsigned int>, + vadd2<int, int> + } }; if (dtype < 0) dtype = src1.depth(); - CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - CV_Assert(src1.type() == src2.type() && src1.size() == src2.size()); - CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U)); + const int sdepth = src1.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src1.channels(); - if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); + CV_Assert( mask.empty() || (cn == 1 && mask.size() == src1.size() && mask.type() == CV_8U) ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); + dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn)); cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src1.type() && src1.depth() <= CV_32F) + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S) { - npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); - return; + const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data); + const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data); + const intptr_t dstptr = 
reinterpret_cast<intptr_t>(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned) + { + const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth]; + const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth]; + + if (vfunc4 != 0 && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + + if (vfunc2 != 0 && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + } } - const func_t func = funcs[src1.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(src1.reshape(1), src2.reshape(1), dst.reshape(1), mask, stream); + func(src1_, src2_, dst_, mask, stream); +} + +namespace arithm +{ + template <typename T, typename S, typename D> + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); } void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); static const func_t funcs[7][7] = { - {add_gpu<unsigned char, unsigned char> , 0 /*add_gpu<unsigned char, signed char>*/ , add_gpu<unsigned char, unsigned short> , add_gpu<unsigned char, short> , 
add_gpu<unsigned char, int> , add_gpu<unsigned char, float> , add_gpu<unsigned char, double> }, - {0 /*add_gpu<signed char, unsigned char>*/ , 0 /*add_gpu<signed char, signed char>*/ , 0 /*add_gpu<signed char, unsigned short>*/, 0 /*add_gpu<signed char, short>*/ , 0 /*add_gpu<signed char, int>*/, 0 /*add_gpu<signed char, float>*/, 0 /*add_gpu<signed char, double>*/}, - {0 /*add_gpu<unsigned short, unsigned char>*/, 0 /*add_gpu<unsigned short, signed char>*/, add_gpu<unsigned short, unsigned short> , 0 /*add_gpu<unsigned short, short>*/, add_gpu<unsigned short, int> , add_gpu<unsigned short, float> , add_gpu<unsigned short, double> }, - {0 /*add_gpu<short, unsigned char>*/ , 0 /*add_gpu<short, signed char>*/ , 0 /*add_gpu<short, unsigned short>*/ , add_gpu<short, short> , add_gpu<short, int> , add_gpu<short, float> , add_gpu<short, double> }, - {0 /*add_gpu<int, unsigned char>*/ , 0 /*add_gpu<int, signed char>*/ , 0 /*add_gpu<int, unsigned short>*/ , 0 /*add_gpu<int, short>*/ , add_gpu<int, int> , add_gpu<int, float> , add_gpu<int, double> }, - {0 /*add_gpu<float, unsigned char>*/ , 0 /*add_gpu<float, signed char>*/ , 0 /*add_gpu<float, unsigned short>*/ , 0 /*add_gpu<float, short>*/ , 0 /*add_gpu<float, int>*/ , add_gpu<float, float> , add_gpu<float, double> }, - {0 /*add_gpu<double, unsigned char>*/ , 0 /*add_gpu<double, signed char>*/ , 0 /*add_gpu<double, unsigned short>*/ , 0 /*add_gpu<double, short>*/ , 0 /*add_gpu<double, int>*/ , 0 /*add_gpu<double, float>*/ , add_gpu<double, double> } + { + addScalar<unsigned char, float, unsigned char>, + addScalar<unsigned char, float, signed char>, + addScalar<unsigned char, float, unsigned short>, + addScalar<unsigned char, float, short>, + addScalar<unsigned char, float, int>, + addScalar<unsigned char, float, float>, + addScalar<unsigned char, double, double> + }, + { + addScalar<signed char, float, unsigned char>, + addScalar<signed char, float, signed char>, + addScalar<signed char, float, unsigned short>, + 
addScalar<signed char, float, short>, + addScalar<signed char, float, int>, + addScalar<signed char, float, float>, + addScalar<signed char, double, double> + }, + { + 0 /*addScalar<unsigned short, float, unsigned char>*/, + 0 /*addScalar<unsigned short, float, signed char>*/, + addScalar<unsigned short, float, unsigned short>, + addScalar<unsigned short, float, short>, + addScalar<unsigned short, float, int>, + addScalar<unsigned short, float, float>, + addScalar<unsigned short, double, double> + }, + { + 0 /*addScalar<short, float, unsigned char>*/, + 0 /*addScalar<short, float, signed char>*/, + addScalar<short, float, unsigned short>, + addScalar<short, float, short>, + addScalar<short, float, int>, + addScalar<short, float, float>, + addScalar<short, double, double> + }, + { + 0 /*addScalar<int, float, unsigned char>*/, + 0 /*addScalar<int, float, signed char>*/, + 0 /*addScalar<int, float, unsigned short>*/, + 0 /*addScalar<int, float, short>*/, + addScalar<int, float, int>, + addScalar<int, float, float>, + addScalar<int, double, double> + }, + { + 0 /*addScalar<float, float, unsigned char>*/, + 0 /*addScalar<float, float, signed char>*/, + 0 /*addScalar<float, float, unsigned short>*/, + 0 /*addScalar<float, float, short>*/, + 0 /*addScalar<float, float, int>*/, + addScalar<float, float, float>, + addScalar<float, double, double> + }, + { + 0 /*addScalar<double, double, unsigned char>*/, + 0 /*addScalar<double, double, signed char>*/, + 0 /*addScalar<double, double, unsigned short>*/, + 0 /*addScalar<double, double, short>*/, + 0 /*addScalar<double, double, int>*/, + 0 /*addScalar<double, double, float>*/, + addScalar<double, double, double> + } }; typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); @@ -411,34 +566,34 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat if (dtype < 0) dtype = src.depth(); - CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - 
CV_Assert(src.channels() <= 4); - CV_Assert(mask.empty() || (src.channels() == 1 && mask.size() == src.size() && mask.type() == CV_8U)); + const int sdepth = src.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src.channels(); - if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( cn <= 4 ); + CV_Assert( mask.empty() || (cn == 1 && mask.size() == src.size() && mask.type() == CV_8U) ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); + dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn)); cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src.type()) + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0) { - const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1]; - - if (npp_func) - { - npp_func(src, sc, dst, stream); - return; - } + npp_func(src, sc, dst, stream); + return; } - CV_Assert(src.channels() == 1); + CV_Assert( cn == 1 ); - const func_t func = funcs[src.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); @@ -449,87 +604,294 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat //////////////////////////////////////////////////////////////////////// // subtract -namespace cv { namespace gpu { namespace device +namespace arithm { template <typename T, typename D> - void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + void vsub4(PtrStepSzb src1, PtrStepSzb src2, 
PtrStepSzb dst, cudaStream_t stream); template <typename T, typename D> - void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); -}}} + void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template <typename T, typename D> + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); static const func_t funcs[7][7] = { - {subtract_gpu<unsigned char, unsigned char> , 0 /*subtract_gpu<unsigned char, signed char>*/ , subtract_gpu<unsigned char, unsigned short> , subtract_gpu<unsigned char, short> , subtract_gpu<unsigned char, int> , subtract_gpu<unsigned char, float> , subtract_gpu<unsigned char, double> }, - {0 /*subtract_gpu<signed char, unsigned char>*/ , 0 /*subtract_gpu<signed char, signed char>*/ , 0 /*subtract_gpu<signed char, unsigned short>*/, 0 /*subtract_gpu<signed char, short>*/ , 0 /*subtract_gpu<signed char, int>*/, 0 /*subtract_gpu<signed char, float>*/, 0 /*subtract_gpu<signed char, double>*/}, - {0 /*subtract_gpu<unsigned short, unsigned char>*/, 0 /*subtract_gpu<unsigned short, signed char>*/, subtract_gpu<unsigned short, unsigned short> , 0 /*subtract_gpu<unsigned short, short>*/, subtract_gpu<unsigned short, int> , subtract_gpu<unsigned short, float> , subtract_gpu<unsigned short, double> }, - {0 /*subtract_gpu<short, unsigned char>*/ , 0 /*subtract_gpu<short, signed char>*/ , 0 /*subtract_gpu<short, unsigned short>*/ , subtract_gpu<short, short> , subtract_gpu<short, int> , 
subtract_gpu<short, float> , subtract_gpu<short, double> }, - {0 /*subtract_gpu<int, unsigned char>*/ , 0 /*subtract_gpu<int, signed char>*/ , 0 /*subtract_gpu<int, unsigned short>*/ , 0 /*subtract_gpu<int, short>*/ , subtract_gpu<int, int> , subtract_gpu<int, float> , subtract_gpu<int, double> }, - {0 /*subtract_gpu<float, unsigned char>*/ , 0 /*subtract_gpu<float, signed char>*/ , 0 /*subtract_gpu<float, unsigned short>*/ , 0 /*subtract_gpu<float, short>*/ , 0 /*subtract_gpu<float, int>*/ , subtract_gpu<float, float> , subtract_gpu<float, double> }, - {0 /*subtract_gpu<double, unsigned char>*/ , 0 /*subtract_gpu<double, signed char>*/ , 0 /*subtract_gpu<double, unsigned short>*/ , 0 /*subtract_gpu<double, short>*/ , 0 /*subtract_gpu<double, int>*/ , 0 /*subtract_gpu<double, float>*/ , subtract_gpu<double, double> } + { + subMat<unsigned char, unsigned char>, + subMat<unsigned char, signed char>, + subMat<unsigned char, unsigned short>, + subMat<unsigned char, short>, + subMat<unsigned char, int>, + subMat<unsigned char, float>, + subMat<unsigned char, double> + }, + { + subMat<signed char, unsigned char>, + subMat<signed char, signed char>, + subMat<signed char, unsigned short>, + subMat<signed char, short>, + subMat<signed char, int>, + subMat<signed char, float>, + subMat<signed char, double> + }, + { + 0 /*subMat<unsigned short, unsigned char>*/, + 0 /*subMat<unsigned short, signed char>*/, + subMat<unsigned short, unsigned short>, + subMat<unsigned short, short>, + subMat<unsigned short, int>, + subMat<unsigned short, float>, + subMat<unsigned short, double> + }, + { + 0 /*subMat<short, unsigned char>*/, + 0 /*subMat<short, signed char>*/, + subMat<short, unsigned short>, + subMat<short, short>, + subMat<short, int>, + subMat<short, float>, + subMat<short, double> + }, + { + 0 /*subMat<int, unsigned char>*/, + 0 /*subMat<int, signed char>*/, + 0 /*subMat<int, unsigned short>*/, + 0 /*subMat<int, short>*/, + subMat<int, int>, + subMat<int, float>, + 
subMat<int, double> + }, + { + 0 /*subMat<float, unsigned char>*/, + 0 /*subMat<float, signed char>*/, + 0 /*subMat<float, unsigned short>*/, + 0 /*subMat<float, short>*/, + 0 /*subMat<float, int>*/, + subMat<float, float>, + subMat<float, double> + }, + { + 0 /*subMat<double, unsigned char>*/, + 0 /*subMat<double, signed char>*/, + 0 /*subMat<double, unsigned short>*/, + 0 /*subMat<double, short>*/, + 0 /*subMat<double, int>*/, + 0 /*subMat<double, float>*/, + subMat<double, double> + } }; - typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); - static const npp_func_t npp_funcs[6] = + typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const vfunc_t vfuncs4[4][4] = { - NppArithm<CV_8U , nppiSub_8u_C1RSfs>::call, - 0, - NppArithm<CV_16U, nppiSub_16u_C1RSfs>::call, - NppArithm<CV_16S, nppiSub_16s_C1RSfs>::call, - NppArithm<CV_32S, nppiSub_32s_C1RSfs>::call, - NppArithm<CV_32F, nppiSub_32f_C1R >::call + { + vsub4<unsigned int, unsigned int>, + vsub4<unsigned int, int>, + 0, + 0 + }, + { + vsub4<int, unsigned int>, + vsub4<int, int>, + 0, + 0 + }, + { + 0, + 0, + 0, + 0 + }, + { + 0, + 0, + 0, + 0 + } + }; + static const vfunc_t vfuncs2[4][4] = + { + { + 0, + 0, + 0, + 0 + }, + { + 0, + 0, + 0, + 0 + }, + { + 0, + 0, + vsub2<unsigned int, unsigned int>, + vsub2<unsigned int, int> + }, + { + 0, + 0, + vsub2<int, unsigned int>, + vsub2<int, int> + } }; if (dtype < 0) dtype = src1.depth(); - CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - CV_Assert(src1.type() == src2.type() && src1.size() == src2.size()); - CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U)); + const int sdepth = src1.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src1.channels(); - if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + 
CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); + CV_Assert( mask.empty() || (cn == 1 && mask.size() == src1.size() && mask.type() == CV_8U) ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); + dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn)); cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src1.type() && src1.depth() <= CV_32F) + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S) { - npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), stream); - return; + const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data); + const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data); + const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned) + { + const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth]; + const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth]; + + if (vfunc4 != 0 && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + + if (vfunc2 != 0 && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), 
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + } } - const func_t func = funcs[src1.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(src1.reshape(1), src2.reshape(1), dst.reshape(1), mask, stream); + func(src1_, src2_, dst_, mask, stream); +} + +namespace arithm +{ + template <typename T, typename S, typename D> + void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); } void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); static const func_t funcs[7][7] = { - {subtract_gpu<unsigned char, unsigned char> , 0 /*subtract_gpu<unsigned char, signed char>*/ , subtract_gpu<unsigned char, unsigned short> , subtract_gpu<unsigned char, short> , subtract_gpu<unsigned char, int> , subtract_gpu<unsigned char, float> , subtract_gpu<unsigned char, double> }, - {0 /*subtract_gpu<signed char, unsigned char>*/ , 0 /*subtract_gpu<signed char, signed char>*/ , 0 /*subtract_gpu<signed char, unsigned short>*/, 0 /*subtract_gpu<signed char, short>*/ , 0 /*subtract_gpu<signed char, int>*/, 0 /*subtract_gpu<signed char, float>*/, 0 /*subtract_gpu<signed char, double>*/}, - {0 /*subtract_gpu<unsigned short, unsigned char>*/, 0 /*subtract_gpu<unsigned short, signed char>*/, subtract_gpu<unsigned short, unsigned short> , 0 /*subtract_gpu<unsigned short, short>*/, subtract_gpu<unsigned short, int> , subtract_gpu<unsigned short, float> , subtract_gpu<unsigned short, double> }, - {0 /*subtract_gpu<short, unsigned char>*/ , 0 
/*subtract_gpu<short, signed char>*/ , 0 /*subtract_gpu<short, unsigned short>*/ , subtract_gpu<short, short> , subtract_gpu<short, int> , subtract_gpu<short, float> , subtract_gpu<short, double> }, - {0 /*subtract_gpu<int, unsigned char>*/ , 0 /*subtract_gpu<int, signed char>*/ , 0 /*subtract_gpu<int, unsigned short>*/ , 0 /*subtract_gpu<int, short>*/ , subtract_gpu<int, int> , subtract_gpu<int, float> , subtract_gpu<int, double> }, - {0 /*subtract_gpu<float, unsigned char>*/ , 0 /*subtract_gpu<float, signed char>*/ , 0 /*subtract_gpu<float, unsigned short>*/ , 0 /*subtract_gpu<float, short>*/ , 0 /*subtract_gpu<float, int>*/ , subtract_gpu<float, float> , subtract_gpu<float, double> }, - {0 /*subtract_gpu<double, unsigned char>*/ , 0 /*subtract_gpu<double, signed char>*/ , 0 /*subtract_gpu<double, unsigned short>*/ , 0 /*subtract_gpu<double, short>*/ , 0 /*subtract_gpu<double, int>*/ , 0 /*subtract_gpu<double, float>*/ , subtract_gpu<double, double> } + { + subScalar<unsigned char, float, unsigned char>, + subScalar<unsigned char, float, signed char>, + subScalar<unsigned char, float, unsigned short>, + subScalar<unsigned char, float, short>, + subScalar<unsigned char, float, int>, + subScalar<unsigned char, float, float>, + subScalar<unsigned char, double, double> + }, + { + subScalar<signed char, float, unsigned char>, + subScalar<signed char, float, signed char>, + subScalar<signed char, float, unsigned short>, + subScalar<signed char, float, short>, + subScalar<signed char, float, int>, + subScalar<signed char, float, float>, + subScalar<signed char, double, double> + }, + { + 0 /*subScalar<unsigned short, float, unsigned char>*/, + 0 /*subScalar<unsigned short, float, signed char>*/, + subScalar<unsigned short, float, unsigned short>, + subScalar<unsigned short, float, short>, + subScalar<unsigned short, float, int>, + subScalar<unsigned short, float, float>, + subScalar<unsigned short, double, double> + }, + { + 0 /*subScalar<short, float, unsigned char>*/, 
+ 0 /*subScalar<short, float, signed char>*/, + subScalar<short, float, unsigned short>, + subScalar<short, float, short>, + subScalar<short, float, int>, + subScalar<short, float, float>, + subScalar<short, double, double> + }, + { + 0 /*subScalar<int, float, unsigned char>*/, + 0 /*subScalar<int, float, signed char>*/, + 0 /*subScalar<int, float, unsigned short>*/, + 0 /*subScalar<int, float, short>*/, + subScalar<int, float, int>, + subScalar<int, float, float>, + subScalar<int, double, double> + }, + { + 0 /*subScalar<float, float, unsigned char>*/, + 0 /*subScalar<float, float, signed char>*/, + 0 /*subScalar<float, float, unsigned short>*/, + 0 /*subScalar<float, float, short>*/, + 0 /*subScalar<float, float, int>*/, + subScalar<float, float, float>, + subScalar<float, double, double> + }, + { + 0 /*subScalar<double, double, unsigned char>*/, + 0 /*subScalar<double, double, signed char>*/, + 0 /*subScalar<double, double, unsigned short>*/, + 0 /*subScalar<double, double, short>*/, + 0 /*subScalar<double, double, int>*/, + 0 /*subScalar<double, double, float>*/, + subScalar<double, double, double> + } }; typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); @@ -547,34 +909,34 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G if (dtype < 0) dtype = src.depth(); - CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - CV_Assert(src.channels() <= 4); - CV_Assert(mask.empty() || (src.channels() == 1 && mask.size() == src.size() && mask.type() == CV_8U)); + const int sdepth = src.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src.channels(); - if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( cn <= 4 ); + CV_Assert( mask.empty() || (cn == 1 && mask.size() == src.size() && mask.type() == CV_8U) ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if 
(!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); + dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn)); cudaStream_t stream = StreamAccessor::getStream(s); - if (mask.empty() && dst.type() == src.type()) + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0) { - const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1]; - - if (npp_func) - { - npp_func(src, sc, dst, stream); - return; - } + npp_func(src, sc, dst, stream); + return; } - CV_Assert(src.channels() == 1); + CV_Assert( cn == 1 ); - const func_t func = funcs[src.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); @@ -585,120 +947,215 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G //////////////////////////////////////////////////////////////////////// // multiply -namespace cv { namespace gpu { namespace device +namespace arithm { - void multiply_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream); - void multiply_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream); + void mulMat_8uc4_32f(PtrStepSz<unsigned int> src1, PtrStepSzf src2, PtrStepSz<unsigned int> dst, cudaStream_t stream); - template <typename T, typename D> - void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream); - template <typename T, typename D> - void multiply_gpu(const PtrStepSzb& src1, double val, const 
PtrStepSzb& dst, double scale, cudaStream_t stream); -}}} + template <typename T, typename S, typename D> + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; cudaStream_t stream = StreamAccessor::getStream(s); if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1) { - CV_Assert(src1.size() == src2.size()); + CV_Assert( src1.size() == src2.size() ); dst.create(src1.size(), src1.type()); - multiply_gpu(static_cast<PtrStepSz<uchar4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<uchar4> >(dst), stream); + mulMat_8uc4_32f(src1, src2, dst, stream); } else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1) { - CV_Assert(src1.size() == src2.size()); + CV_Assert( src1.size() == src2.size() ); dst.create(src1.size(), src1.type()); - multiply_gpu(static_cast<PtrStepSz<short4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<short4> >(dst), stream); + mulMat_16sc4_32f(src1, src2, dst, stream); } else { - typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); static const func_t funcs[7][7] = { - {multiply_gpu<unsigned char, unsigned char> , 0 /*multiply_gpu<unsigned char, signed char>*/ , multiply_gpu<unsigned char, unsigned short> , multiply_gpu<unsigned char, short> , multiply_gpu<unsigned char, int> , multiply_gpu<unsigned char, float> , multiply_gpu<unsigned char, double> }, - {0 /*multiply_gpu<signed char, unsigned char>*/ , 0 /*multiply_gpu<signed char, signed char>*/ , 0 /*multiply_gpu<signed char, unsigned short>*/, 0 /*multiply_gpu<signed char, short>*/ , 0 /*multiply_gpu<signed char, int>*/, 0 /*multiply_gpu<signed char, 
float>*/, 0 /*multiply_gpu<signed char, double>*/}, - {0 /*multiply_gpu<unsigned short, unsigned char>*/, 0 /*multiply_gpu<unsigned short, signed char>*/, multiply_gpu<unsigned short, unsigned short> , 0 /*multiply_gpu<unsigned short, short>*/, multiply_gpu<unsigned short, int> , multiply_gpu<unsigned short, float> , multiply_gpu<unsigned short, double> }, - {0 /*multiply_gpu<short, unsigned char>*/ , 0 /*multiply_gpu<short, signed char>*/ , 0 /*multiply_gpu<short, unsigned short>*/ , multiply_gpu<short, short> , multiply_gpu<short, int> , multiply_gpu<short, float> , multiply_gpu<short, double> }, - {0 /*multiply_gpu<int, unsigned char>*/ , 0 /*multiply_gpu<int, signed char>*/ , 0 /*multiply_gpu<int, unsigned short>*/ , 0 /*multiply_gpu<int, short>*/ , multiply_gpu<int, int> , multiply_gpu<int, float> , multiply_gpu<int, double> }, - {0 /*multiply_gpu<float, unsigned char>*/ , 0 /*multiply_gpu<float, signed char>*/ , 0 /*multiply_gpu<float, unsigned short>*/ , 0 /*multiply_gpu<float, short>*/ , 0 /*multiply_gpu<float, int>*/ , multiply_gpu<float, float> , multiply_gpu<float, double> }, - {0 /*multiply_gpu<double, unsigned char>*/ , 0 /*multiply_gpu<double, signed char>*/ , 0 /*multiply_gpu<double, unsigned short>*/ , 0 /*multiply_gpu<double, short>*/ , 0 /*multiply_gpu<double, int>*/ , 0 /*multiply_gpu<double, float>*/ , multiply_gpu<double, double> } - }; - - typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); - static const npp_func_t npp_funcs[] = - { - NppArithm<CV_8U , nppiMul_8u_C1RSfs >::call, - 0, - NppArithm<CV_16U, nppiMul_16u_C1RSfs>::call, - NppArithm<CV_16S, nppiMul_16s_C1RSfs>::call, - NppArithm<CV_32S, nppiMul_32s_C1RSfs>::call, - NppArithm<CV_32F, nppiMul_32f_C1R >::call + { + mulMat<unsigned char, float, unsigned char>, + mulMat<unsigned char, float, signed char>, + mulMat<unsigned char, float, unsigned short>, + mulMat<unsigned char, float, short>, + mulMat<unsigned char, float, int>, + 
mulMat<unsigned char, float, float>, + mulMat<unsigned char, double, double> + }, + { + mulMat<signed char, float, unsigned char>, + mulMat<signed char, float, signed char>, + mulMat<signed char, float, unsigned short>, + mulMat<signed char, float, short>, + mulMat<signed char, float, int>, + mulMat<signed char, float, float>, + mulMat<signed char, double, double> + }, + { + 0 /*mulMat<unsigned short, float, unsigned char>*/, + 0 /*mulMat<unsigned short, float, signed char>*/, + mulMat<unsigned short, float, unsigned short>, + mulMat<unsigned short, float, short>, + mulMat<unsigned short, float, int>, + mulMat<unsigned short, float, float>, + mulMat<unsigned short, double, double> + }, + { + 0 /*mulMat<short, float, unsigned char>*/, + 0 /*mulMat<short, float, signed char>*/, + mulMat<short, float, unsigned short>, + mulMat<short, float, short>, + mulMat<short, float, int>, + mulMat<short, float, float>, + mulMat<short, double, double> + }, + { + 0 /*mulMat<int, float, unsigned char>*/, + 0 /*mulMat<int, float, signed char>*/, + 0 /*mulMat<int, float, unsigned short>*/, + 0 /*mulMat<int, float, short>*/, + mulMat<int, float, int>, + mulMat<int, float, float>, + mulMat<int, double, double> + }, + { + 0 /*mulMat<float, float, unsigned char>*/, + 0 /*mulMat<float, float, signed char>*/, + 0 /*mulMat<float, float, unsigned short>*/, + 0 /*mulMat<float, float, short>*/, + 0 /*mulMat<float, float, int>*/, + mulMat<float, float, float>, + mulMat<float, double, double> + }, + { + 0 /*mulMat<double, double, unsigned char>*/, + 0 /*mulMat<double, double, signed char>*/, + 0 /*mulMat<double, double, unsigned short>*/, + 0 /*mulMat<double, double, short>*/, + 0 /*mulMat<double, double, int>*/, + 0 /*mulMat<double, double, float>*/, + mulMat<double, double, double> + } }; if (dtype < 0) dtype = src1.depth(); - CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - CV_Assert(src1.type() == src2.type() && src1.size() == src2.size()); + const int sdepth = 
src1.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src1.channels(); - if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); + dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn)); -#if (CUDA_VERSION <= 4020) - if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F) -#else - if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F && src1.depth() > CV_8U) -#endif - { - npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); - return; - } + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); - const func_t func = funcs[src1.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(src1.reshape(1), src2.reshape(1), dst.reshape(1), scale, stream); + func(src1_, src2_, dst_, scale, stream); } } -namespace +namespace arithm { - inline bool isIntScalar(Scalar sc) - { - return sc.val[0] == static_cast<int>(sc.val[0]) && sc.val[1] == static_cast<int>(sc.val[1]) && sc.val[2] == static_cast<int>(sc.val[2]) && sc.val[3] == static_cast<int>(sc.val[3]); - } + template <typename T, typename S, typename D> + void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); } void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, 
Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[7][7] = { - {multiply_gpu<unsigned char, unsigned char> , 0 /*multiply_gpu<unsigned char, signed char>*/ , multiply_gpu<unsigned char, unsigned short> , multiply_gpu<unsigned char, short> , multiply_gpu<unsigned char, int> , multiply_gpu<unsigned char, float> , multiply_gpu<unsigned char, double> }, - {0 /*multiply_gpu<signed char, unsigned char>*/ , 0 /*multiply_gpu<signed char, signed char>*/ , 0 /*multiply_gpu<signed char, unsigned short>*/, 0 /*multiply_gpu<signed char, short>*/ , 0 /*multiply_gpu<signed char, int>*/, 0 /*multiply_gpu<signed char, float>*/, 0 /*multiply_gpu<signed char, double>*/}, - {0 /*multiply_gpu<unsigned short, unsigned char>*/, 0 /*multiply_gpu<unsigned short, signed char>*/, multiply_gpu<unsigned short, unsigned short> , 0 /*multiply_gpu<unsigned short, short>*/, multiply_gpu<unsigned short, int> , multiply_gpu<unsigned short, float> , multiply_gpu<unsigned short, double> }, - {0 /*multiply_gpu<short, unsigned char>*/ , 0 /*multiply_gpu<short, signed char>*/ , 0 /*multiply_gpu<short, unsigned short>*/ , multiply_gpu<short, short> , multiply_gpu<short, int> , multiply_gpu<short, float> , multiply_gpu<short, double> }, - {0 /*multiply_gpu<int, unsigned char>*/ , 0 /*multiply_gpu<int, signed char>*/ , 0 /*multiply_gpu<int, unsigned short>*/ , 0 /*multiply_gpu<int, short>*/ , multiply_gpu<int, int> , multiply_gpu<int, float> , multiply_gpu<int, double> }, - {0 /*multiply_gpu<float, unsigned char>*/ , 0 /*multiply_gpu<float, signed char>*/ , 0 /*multiply_gpu<float, unsigned short>*/ , 0 /*multiply_gpu<float, short>*/ , 0 /*multiply_gpu<float, int>*/ , multiply_gpu<float, float> , multiply_gpu<float, double> }, - {0 
/*multiply_gpu<double, unsigned char>*/ , 0 /*multiply_gpu<double, signed char>*/ , 0 /*multiply_gpu<double, unsigned short>*/ , 0 /*multiply_gpu<double, short>*/ , 0 /*multiply_gpu<double, int>*/ , 0 /*multiply_gpu<double, float>*/ , multiply_gpu<double, double> } + { + mulScalar<unsigned char, float, unsigned char>, + mulScalar<unsigned char, float, signed char>, + mulScalar<unsigned char, float, unsigned short>, + mulScalar<unsigned char, float, short>, + mulScalar<unsigned char, float, int>, + mulScalar<unsigned char, float, float>, + mulScalar<unsigned char, double, double> + }, + { + mulScalar<signed char, float, unsigned char>, + mulScalar<signed char, float, signed char>, + mulScalar<signed char, float, unsigned short>, + mulScalar<signed char, float, short>, + mulScalar<signed char, float, int>, + mulScalar<signed char, float, float>, + mulScalar<signed char, double, double> + }, + { + 0 /*mulScalar<unsigned short, float, unsigned char>*/, + 0 /*mulScalar<unsigned short, float, signed char>*/, + mulScalar<unsigned short, float, unsigned short>, + mulScalar<unsigned short, float, short>, + mulScalar<unsigned short, float, int>, + mulScalar<unsigned short, float, float>, + mulScalar<unsigned short, double, double> + }, + { + 0 /*mulScalar<short, float, unsigned char>*/, + 0 /*mulScalar<short, float, signed char>*/, + mulScalar<short, float, unsigned short>, + mulScalar<short, float, short>, + mulScalar<short, float, int>, + mulScalar<short, float, float>, + mulScalar<short, double, double> + }, + { + 0 /*mulScalar<int, float, unsigned char>*/, + 0 /*mulScalar<int, float, signed char>*/, + 0 /*mulScalar<int, float, unsigned short>*/, + 0 /*mulScalar<int, float, short>*/, + mulScalar<int, float, int>, + mulScalar<int, float, float>, + mulScalar<int, double, double> + }, + { + 0 /*mulScalar<float, float, unsigned char>*/, + 0 /*mulScalar<float, float, signed char>*/, + 0 /*mulScalar<float, float, unsigned short>*/, + 0 /*mulScalar<float, float, short>*/, + 0 
/*mulScalar<float, float, int>*/, + mulScalar<float, float, float>, + mulScalar<float, double, double> + }, + { + 0 /*mulScalar<double, double, unsigned char>*/, + 0 /*mulScalar<double, double, signed char>*/, + 0 /*mulScalar<double, double, unsigned short>*/, + 0 /*mulScalar<double, double, short>*/, + 0 /*mulScalar<double, double, int>*/, + 0 /*mulScalar<double, double, float>*/, + mulScalar<double, double, double> + } }; typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); @@ -716,148 +1173,254 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double if (dtype < 0) dtype = src.depth(); - CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - CV_Assert(src.channels() <= 4); + const int sdepth = src.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src.channels(); - if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( cn <= 4 ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); + dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn)); cudaStream_t stream = StreamAccessor::getStream(s); - if (dst.type() == src.type() && scale == 1 && (src.depth() == CV_32F || isIntScalar(sc))) - { - const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1]; + const Scalar nsc(sc.val[0] * scale, sc.val[1] * scale, sc.val[2] * scale, sc.val[3] * scale); - if (npp_func) - { - npp_func(src, sc, dst, stream); - return; - } + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0) + { + npp_func(src, nsc, dst, stream); + return; } - CV_Assert(src.channels() == 1); + CV_Assert( cn == 1 ); - 
const func_t func = funcs[src.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(src, sc.val[0], dst, scale, stream); + func(src, nsc.val[0], dst, stream); } //////////////////////////////////////////////////////////////////////// // divide -namespace cv { namespace gpu { namespace device +namespace arithm { - void divide_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream); - void divide_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream); + void divMat_8uc4_32f(PtrStepSz<unsigned int> src1, PtrStepSzf src2, PtrStepSz<unsigned int> dst, cudaStream_t stream); - template <typename T, typename D> - void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream); - template <typename T, typename D> - void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template <typename T, typename D> - void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); -}}} + template <typename T, typename S, typename D> + void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; cudaStream_t stream = StreamAccessor::getStream(s); if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1) { - CV_Assert(src1.size() == src2.size()); + CV_Assert( src1.size() == src2.size() ); dst.create(src1.size(), src1.type()); - divide_gpu(static_cast<PtrStepSz<uchar4> >(src1), 
static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<uchar4> >(dst), stream); + divMat_8uc4_32f(src1, src2, dst, stream); } else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1) { - CV_Assert(src1.size() == src2.size()); + CV_Assert( src1.size() == src2.size() ); dst.create(src1.size(), src1.type()); - divide_gpu(static_cast<PtrStepSz<short4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<short4> >(dst), stream); + divMat_16sc4_32f(src1, src2, dst, stream); } else { - typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); static const func_t funcs[7][7] = { - {divide_gpu<unsigned char, unsigned char> , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short> , divide_gpu<unsigned char, short> , divide_gpu<unsigned char, int> , divide_gpu<unsigned char, float> , divide_gpu<unsigned char, double> }, - {0 /*divide_gpu<signed char, unsigned char>*/ , 0 /*divide_gpu<signed char, signed char>*/ , 0 /*divide_gpu<signed char, unsigned short>*/, 0 /*divide_gpu<signed char, short>*/ , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/}, - {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short> , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int> , divide_gpu<unsigned short, float> , divide_gpu<unsigned short, double> }, - {0 /*divide_gpu<short, unsigned char>*/ , 0 /*divide_gpu<short, signed char>*/ , 0 /*divide_gpu<short, unsigned short>*/ , divide_gpu<short, short> , divide_gpu<short, int> , divide_gpu<short, float> , divide_gpu<short, double> }, - {0 /*divide_gpu<int, unsigned char>*/ , 0 /*divide_gpu<int, signed char>*/ , 0 /*divide_gpu<int, unsigned short>*/ , 0 
/*divide_gpu<int, short>*/ , divide_gpu<int, int> , divide_gpu<int, float> , divide_gpu<int, double> }, - {0 /*divide_gpu<float, unsigned char>*/ , 0 /*divide_gpu<float, signed char>*/ , 0 /*divide_gpu<float, unsigned short>*/ , 0 /*divide_gpu<float, short>*/ , 0 /*divide_gpu<float, int>*/ , divide_gpu<float, float> , divide_gpu<float, double> }, - {0 /*divide_gpu<double, unsigned char>*/ , 0 /*divide_gpu<double, signed char>*/ , 0 /*divide_gpu<double, unsigned short>*/ , 0 /*divide_gpu<double, short>*/ , 0 /*divide_gpu<double, int>*/ , 0 /*divide_gpu<double, float>*/ , divide_gpu<double, double> } - }; - - typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); - static const npp_func_t npp_funcs[6] = - { - NppArithm<CV_8U , nppiDiv_8u_C1RSfs >::call, - 0, - NppArithm<CV_16U, nppiDiv_16u_C1RSfs>::call, - NppArithm<CV_16S, nppiDiv_16s_C1RSfs>::call, - NppArithm<CV_32S, nppiDiv_32s_C1RSfs>::call, - NppArithm<CV_32F, nppiDiv_32f_C1R >::call + { + divMat<unsigned char, float, unsigned char>, + divMat<unsigned char, float, signed char>, + divMat<unsigned char, float, unsigned short>, + divMat<unsigned char, float, short>, + divMat<unsigned char, float, int>, + divMat<unsigned char, float, float>, + divMat<unsigned char, double, double> + }, + { + divMat<signed char, float, unsigned char>, + divMat<signed char, float, signed char>, + divMat<signed char, float, unsigned short>, + divMat<signed char, float, short>, + divMat<signed char, float, int>, + divMat<signed char, float, float>, + divMat<signed char, double, double> + }, + { + 0 /*divMat<unsigned short, float, unsigned char>*/, + 0 /*divMat<unsigned short, float, signed char>*/, + divMat<unsigned short, float, unsigned short>, + divMat<unsigned short, float, short>, + divMat<unsigned short, float, int>, + divMat<unsigned short, float, float>, + divMat<unsigned short, double, double> + }, + { + 0 /*divMat<short, float, unsigned char>*/, + 0 /*divMat<short, float, 
signed char>*/, + divMat<short, float, unsigned short>, + divMat<short, float, short>, + divMat<short, float, int>, + divMat<short, float, float>, + divMat<short, double, double> + }, + { + 0 /*divMat<int, float, unsigned char>*/, + 0 /*divMat<int, float, signed char>*/, + 0 /*divMat<int, float, unsigned short>*/, + 0 /*divMat<int, float, short>*/, + divMat<int, float, int>, + divMat<int, float, float>, + divMat<int, double, double> + }, + { + 0 /*divMat<float, float, unsigned char>*/, + 0 /*divMat<float, float, signed char>*/, + 0 /*divMat<float, float, unsigned short>*/, + 0 /*divMat<float, float, short>*/, + 0 /*divMat<float, float, int>*/, + divMat<float, float, float>, + divMat<float, double, double> + }, + { + 0 /*divMat<double, double, unsigned char>*/, + 0 /*divMat<double, double, signed char>*/, + 0 /*divMat<double, double, unsigned short>*/, + 0 /*divMat<double, double, short>*/, + 0 /*divMat<double, double, int>*/, + 0 /*divMat<double, double, float>*/, + divMat<double, double, double> + } }; if (dtype < 0) dtype = src1.depth(); - CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - CV_Assert(src1.type() == src2.type() && src1.size() == src2.size()); + const int sdepth = src1.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src1.channels(); - if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels())); + dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn)); - if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F) - { - npp_funcs[src1.depth()](src2.reshape(1), 
src1.reshape(1), dst.reshape(1), stream); - return; - } + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); - const func_t func = funcs[src1.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(src1.reshape(1), src2.reshape(1), dst.reshape(1), scale, stream); + func(src1_, src2_, dst_, scale, stream); } } +namespace arithm +{ + template <typename T, typename S, typename D> + void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[7][7] = { - {divide_gpu<unsigned char, unsigned char> , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short> , divide_gpu<unsigned char, short> , divide_gpu<unsigned char, int> , divide_gpu<unsigned char, float> , divide_gpu<unsigned char, double> }, - {0 /*divide_gpu<signed char, unsigned char>*/ , 0 /*divide_gpu<signed char, signed char>*/ , 0 /*divide_gpu<signed char, unsigned short>*/, 0 /*divide_gpu<signed char, short>*/ , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/}, - {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short> , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int> , divide_gpu<unsigned short, float> , 
divide_gpu<unsigned short, double> }, - {0 /*divide_gpu<short, unsigned char>*/ , 0 /*divide_gpu<short, signed char>*/ , 0 /*divide_gpu<short, unsigned short>*/ , divide_gpu<short, short> , divide_gpu<short, int> , divide_gpu<short, float> , divide_gpu<short, double> }, - {0 /*divide_gpu<int, unsigned char>*/ , 0 /*divide_gpu<int, signed char>*/ , 0 /*divide_gpu<int, unsigned short>*/ , 0 /*divide_gpu<int, short>*/ , divide_gpu<int, int> , divide_gpu<int, float> , divide_gpu<int, double> }, - {0 /*divide_gpu<float, unsigned char>*/ , 0 /*divide_gpu<float, signed char>*/ , 0 /*divide_gpu<float, unsigned short>*/ , 0 /*divide_gpu<float, short>*/ , 0 /*divide_gpu<float, int>*/ , divide_gpu<float, float> , divide_gpu<float, double> }, - {0 /*divide_gpu<double, unsigned char>*/ , 0 /*divide_gpu<double, signed char>*/ , 0 /*divide_gpu<double, unsigned short>*/ , 0 /*divide_gpu<double, short>*/ , 0 /*divide_gpu<double, int>*/ , 0 /*divide_gpu<double, float>*/ , divide_gpu<double, double> } + { + divScalar<unsigned char, float, unsigned char>, + divScalar<unsigned char, float, signed char>, + divScalar<unsigned char, float, unsigned short>, + divScalar<unsigned char, float, short>, + divScalar<unsigned char, float, int>, + divScalar<unsigned char, float, float>, + divScalar<unsigned char, double, double> + }, + { + divScalar<signed char, float, unsigned char>, + divScalar<signed char, float, signed char>, + divScalar<signed char, float, unsigned short>, + divScalar<signed char, float, short>, + divScalar<signed char, float, int>, + divScalar<signed char, float, float>, + divScalar<signed char, double, double> + }, + { + 0 /*divScalar<unsigned short, float, unsigned char>*/, + 0 /*divScalar<unsigned short, float, signed char>*/, + divScalar<unsigned short, float, unsigned short>, + divScalar<unsigned short, float, short>, + divScalar<unsigned short, float, int>, + divScalar<unsigned short, float, float>, + divScalar<unsigned short, double, double> + }, + { + 0 
/*divScalar<short, float, unsigned char>*/, + 0 /*divScalar<short, float, signed char>*/, + divScalar<short, float, unsigned short>, + divScalar<short, float, short>, + divScalar<short, float, int>, + divScalar<short, float, float>, + divScalar<short, double, double> + }, + { + 0 /*divScalar<int, float, unsigned char>*/, + 0 /*divScalar<int, float, signed char>*/, + 0 /*divScalar<int, float, unsigned short>*/, + 0 /*divScalar<int, float, short>*/, + divScalar<int, float, int>, + divScalar<int, float, float>, + divScalar<int, double, double> + }, + { + 0 /*divScalar<float, float, unsigned char>*/, + 0 /*divScalar<float, float, signed char>*/, + 0 /*divScalar<float, float, unsigned short>*/, + 0 /*divScalar<float, float, short>*/, + 0 /*divScalar<float, float, int>*/, + divScalar<float, float, float>, + divScalar<float, double, double> + }, + { + 0 /*divScalar<double, double, unsigned char>*/, + 0 /*divScalar<double, double, signed char>*/, + 0 /*divScalar<double, double, unsigned short>*/, + 0 /*divScalar<double, double, short>*/, + 0 /*divScalar<double, double, int>*/, + 0 /*divScalar<double, double, float>*/, + divScalar<double, double, double> + } }; typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); @@ -875,536 +1438,547 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc if (dtype < 0) dtype = src.depth(); - CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - CV_Assert(src.channels() <= 4); + const int sdepth = src.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src.channels(); - if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( cn <= 4 ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support 
double"); } - dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); + dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn)); cudaStream_t stream = StreamAccessor::getStream(s); - if (dst.type() == src.type() && scale == 1 && (src.depth() == CV_32F || isIntScalar(sc))) - { - const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1]; + const Scalar nsc(sc.val[0] / scale, sc.val[1] / scale, sc.val[2] / scale, sc.val[3] / scale); - if (npp_func) - { - npp_func(src, sc, dst, stream); - return; - } + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0) + { + npp_func(src, nsc, dst, stream); + return; } - CV_Assert(src.channels() == 1); + CV_Assert( cn == 1 ); - const func_t func = funcs[src.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(src, sc.val[0], dst, scale, stream); + func(src, nsc.val[0], dst, stream); +} + +namespace arithm +{ + template <typename T, typename S, typename D> + void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); } void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[7][7] = { - {divide_gpu<unsigned char, unsigned char> , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short> , divide_gpu<unsigned char, short> , divide_gpu<unsigned char, int> , divide_gpu<unsigned char, float> , divide_gpu<unsigned char, double> }, - {0 /*divide_gpu<signed char, unsigned char>*/ , 0 /*divide_gpu<signed char, signed char>*/ , 0 /*divide_gpu<signed char, unsigned short>*/, 0 
/*divide_gpu<signed char, short>*/ , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/}, - {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short> , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int> , divide_gpu<unsigned short, float> , divide_gpu<unsigned short, double> }, - {0 /*divide_gpu<short, unsigned char>*/ , 0 /*divide_gpu<short, signed char>*/ , 0 /*divide_gpu<short, unsigned short>*/ , divide_gpu<short, short> , divide_gpu<short, int> , divide_gpu<short, float> , divide_gpu<short, double> }, - {0 /*divide_gpu<int, unsigned char>*/ , 0 /*divide_gpu<int, signed char>*/ , 0 /*divide_gpu<int, unsigned short>*/ , 0 /*divide_gpu<int, short>*/ , divide_gpu<int, int> , divide_gpu<int, float> , divide_gpu<int, double> }, - {0 /*divide_gpu<float, unsigned char>*/ , 0 /*divide_gpu<float, signed char>*/ , 0 /*divide_gpu<float, unsigned short>*/ , 0 /*divide_gpu<float, short>*/ , 0 /*divide_gpu<float, int>*/ , divide_gpu<float, float> , divide_gpu<float, double> }, - {0 /*divide_gpu<double, unsigned char>*/ , 0 /*divide_gpu<double, signed char>*/ , 0 /*divide_gpu<double, unsigned short>*/ , 0 /*divide_gpu<double, short>*/ , 0 /*divide_gpu<double, int>*/ , 0 /*divide_gpu<double, float>*/ , divide_gpu<double, double> } + { + divInv<unsigned char, float, unsigned char>, + divInv<unsigned char, float, signed char>, + divInv<unsigned char, float, unsigned short>, + divInv<unsigned char, float, short>, + divInv<unsigned char, float, int>, + divInv<unsigned char, float, float>, + divInv<unsigned char, double, double> + }, + { + divInv<signed char, float, unsigned char>, + divInv<signed char, float, signed char>, + divInv<signed char, float, unsigned short>, + divInv<signed char, float, short>, + divInv<signed char, float, int>, + divInv<signed char, float, float>, + divInv<signed char, double, double> + }, + 
{ + 0 /*divInv<unsigned short, float, unsigned char>*/, + 0 /*divInv<unsigned short, float, signed char>*/, + divInv<unsigned short, float, unsigned short>, + divInv<unsigned short, float, short>, + divInv<unsigned short, float, int>, + divInv<unsigned short, float, float>, + divInv<unsigned short, double, double> + }, + { + 0 /*divInv<short, float, unsigned char>*/, + 0 /*divInv<short, float, signed char>*/, + divInv<short, float, unsigned short>, + divInv<short, float, short>, + divInv<short, float, int>, + divInv<short, float, float>, + divInv<short, double, double> + }, + { + 0 /*divInv<int, float, unsigned char>*/, + 0 /*divInv<int, float, signed char>*/, + 0 /*divInv<int, float, unsigned short>*/, + 0 /*divInv<int, float, short>*/, + divInv<int, float, int>, + divInv<int, float, float>, + divInv<int, double, double> + }, + { + 0 /*divInv<float, float, unsigned char>*/, + 0 /*divInv<float, float, signed char>*/, + 0 /*divInv<float, float, unsigned short>*/, + 0 /*divInv<float, float, short>*/, + 0 /*divInv<float, float, int>*/, + divInv<float, float, float>, + divInv<float, double, double> + }, + { + 0 /*divInv<double, double, unsigned char>*/, + 0 /*divInv<double, double, signed char>*/, + 0 /*divInv<double, double, unsigned short>*/, + 0 /*divInv<double, double, short>*/, + 0 /*divInv<double, double, int>*/, + 0 /*divInv<double, double, float>*/, + divInv<double, double, double> + } }; if (dtype < 0) dtype = src.depth(); - CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - CV_Assert(src.channels() == 1); + const int sdepth = src.depth(); + const int ddepth = CV_MAT_DEPTH(dtype); + const int cn = src.channels(); - if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( cn == 1 ); + + if (sdepth == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) 
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); + dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn)); cudaStream_t stream = StreamAccessor::getStream(s); - const func_t func = funcs[src.depth()][dst.depth()]; + const func_t func = funcs[sdepth][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(scale, src, dst, stream); + func(src, scale, dst, stream); } ////////////////////////////////////////////////////////////////////////////// // absdiff -namespace cv { namespace gpu { namespace device +namespace arithm { template <typename T> - void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); template <typename T> - void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); -}}} + void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); -namespace -{ - template <int DEPTH> struct NppAbsDiffFunc - { - typedef typename NppTypeTraits<DEPTH>::npp_t npp_t; - - typedef NppStatus (*func_t)(const npp_t* src1, int src1_step, const npp_t* src2, int src2_step, npp_t* dst, int dst_step, NppiSize sz); - }; - - template <int DEPTH, typename NppAbsDiffFunc<DEPTH>::func_t func> struct NppAbsDiff - { - typedef typename NppAbsDiffFunc<DEPTH>::npp_t npp_t; - - static void call(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) - { - NppStreamHandler h(stream); - - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; - - nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step), - (npp_t*)dst.data, static_cast<int>(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template <int 
DEPTH> struct NppAbsDiffCFunc - { - typedef typename NppTypeTraits<DEPTH>::npp_t npp_t; - typedef npp_t scalar_t; - - typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, npp_t nConstant); - }; - template <> struct NppAbsDiffCFunc<CV_16U> - { - typedef NppTypeTraits<CV_16U>::npp_t npp_t; - typedef Npp32u scalar_t; - -#if (CUDA_VERSION <= 4020) - typedef NppStatus (*func_t)(const Npp16u* pSrc1, int nSrc1Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, Npp32u nConstant); -#else - typedef NppStatus (*func_t)(const Npp16u * pSrc1, int nSrc1Step, Npp16u * pDst, int nDstStep, NppiSize oSizeROI, Npp16u nConstant); -#endif - }; - - template <int DEPTH, typename NppAbsDiffCFunc<DEPTH>::func_t func> struct NppAbsDiffC - { - typedef typename NppAbsDiffCFunc<DEPTH>::npp_t npp_t; - typedef typename NppAbsDiffCFunc<DEPTH>::scalar_t scalar_t; - - static void call(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) - { - NppStreamHandler h(stream); - - NppiSize sz; - sz.width = src1.cols; - sz.height = src1.rows; - - nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), - (npp_t*)dst.data, static_cast<int>(dst.step), sz, static_cast<scalar_t>(val)) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; + template <typename T> + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); } -void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) +void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { - NppAbsDiff<CV_8U, nppiAbsDiff_8u_C1R>::call, - absdiff_gpu<signed char>, - 
NppAbsDiff<CV_16U, nppiAbsDiff_16u_C1R>::call, - absdiff_gpu<short>, - absdiff_gpu<int>, - NppAbsDiff<CV_32F, nppiAbsDiff_32f_C1R>::call, - absdiff_gpu<double> + absDiffMat<unsigned char>, + absDiffMat<signed char>, + absDiffMat<unsigned short>, + absDiffMat<short>, + absDiffMat<int>, + absDiffMat<float>, + absDiffMat<double> + }; + static const func_t vfuncs4[] = + { + vabsDiff4<unsigned int>, + vabsDiff4<int>, + 0, + 0 + }; + static const func_t vfuncs2[] = + { + 0, + 0, + vabsDiff2<unsigned int>, + vabsDiff2<int> }; - CV_Assert(src1.depth() <= CV_64F); - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + const int depth = src1.depth(); + const int cn = src1.channels(); - if (src1.depth() == CV_64F) + CV_Assert( depth <= CV_64F ); + CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); + + if (depth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } dst.create(src1.size(), src1.type()); - funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream)); + cudaStream_t stream = StreamAccessor::getStream(s); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth < CV_32S) + { + const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data); + const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data); + const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned) + { + const func_t vfunc4 = vfuncs4[depth]; + const func_t vfunc2 = vfuncs2[depth]; + + if (vfunc4 != 0 && (src1_.cols & 3) == 0) + { + const 
int vcols = src1_.cols >> 2; + + vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + + if (vfunc2 != 0 && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[depth]; + + if (!func) + CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, stream); +} + +namespace arithm +{ + template <typename T, typename S> + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); } void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& stream) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { - NppAbsDiffC<CV_8U, nppiAbsDiffC_8u_C1R>::call, - absdiff_gpu<signed char>, - NppAbsDiffC<CV_16U, nppiAbsDiffC_16u_C1R>::call, - absdiff_gpu<short>, - absdiff_gpu<int>, - NppAbsDiffC<CV_32F, nppiAbsDiffC_32f_C1R>::call, - absdiff_gpu<double> + absDiffScalar<unsigned char, float>, + absDiffScalar<signed char, float>, + absDiffScalar<unsigned short, float>, + absDiffScalar<short, float>, + absDiffScalar<int, float>, + absDiffScalar<float, float>, + absDiffScalar<double, double> }; - CV_Assert(src1.depth() <= CV_64F); - CV_Assert(src1.channels() == 1); + const int depth = src1.depth(); - if (src1.depth() == CV_64F) + CV_Assert( depth <= CV_64F ); + CV_Assert( src1.channels() == 1 ); + + if (depth == CV_64F) { - if 
(!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } dst.create(src1.size(), src1.type()); - funcs[src1.depth()](src1, src2.val[0], dst, StreamAccessor::getStream(stream)); + funcs[depth](src1, src2.val[0], dst, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// // abs -void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& s) +namespace arithm { - CV_Assert(src.depth() == CV_16S || src.depth() == CV_32F); + template <typename T> + void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + absMat<unsigned char>, + absMat<signed char>, + absMat<unsigned short>, + absMat<short>, + absMat<int>, + absMat<float>, + absMat<double> + }; + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } dst.create(src.size(), src.type()); - cudaStream_t stream = StreamAccessor::getStream(s); - - NppStreamHandler h(stream); - - NppiSize oSizeROI; - oSizeROI.width = src.cols * src.channels(); - oSizeROI.height = src.rows; - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - - if (src.depth() == CV_16S) - { - if (aligned && oSizeROI.width % 4 == 0) - { - oSizeROI.width /= 4; - nppSafeCall( nppiAbs_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step), dst.ptr<Npp16s>(), static_cast<int>(dst.step), oSizeROI) ); - } - else - { - nppSafeCall( nppiAbs_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step), dst.ptr<Npp16s>(), static_cast<int>(dst.step), 
oSizeROI) ); - } - } - else - { - if (aligned && oSizeROI.width % 4 == 0) - { - oSizeROI.width /= 4; - nppSafeCall( nppiAbs_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), oSizeROI) ); - } - else - { - nppSafeCall( nppiAbs_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), oSizeROI) ); - } - } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + funcs[depth](src, dst, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// // sqr -namespace +namespace arithm { - template <int DEPTH> struct NppSqrFunc - { - typedef typename NppTypeTraits<DEPTH>::npp_t npp_t; - - typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); - }; - template <> struct NppSqrFunc<CV_32F> - { - typedef NppTypeTraits<CV_32F>::npp_t npp_t; - - typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - - template <int DEPTH, typename NppSqrFunc<DEPTH>::func_t func, typename NppSqrFunc<DEPTH>::func_t func_c4> struct NppSqr - { - typedef typename NppSqrFunc<DEPTH>::npp_t npp_t; - - static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) - { - NppStreamHandler h(stream); - - NppiSize oSizeROI; - oSizeROI.width = src.cols * src.channels(); - oSizeROI.height = src.rows; - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - - if (aligned && oSizeROI.width % 4 == 0) - { - oSizeROI.width /= 4; - nppSafeCall( func_c4(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) ); - } - else - { - nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) ); - } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template <typename 
NppSqrFunc<CV_32F>::func_t func, typename NppSqrFunc<CV_32F>::func_t func_c4> struct NppSqr<CV_32F, func, func_c4> - { - typedef NppSqrFunc<CV_32F>::npp_t npp_t; - - static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) - { - NppStreamHandler h(stream); - - NppiSize oSizeROI; - oSizeROI.width = src.cols * src.channels(); - oSizeROI.height = src.rows; - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - - if (aligned && oSizeROI.width % 4 == 0) - { - oSizeROI.width /= 4; - nppSafeCall( func_c4(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) ); - } - else - { - nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) ); - } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; + template <typename T> + void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); } void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream) { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); + using namespace arithm; + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { - NppSqr<CV_8U, nppiSqr_8u_C1RSfs, nppiSqr_8u_C4RSfs>::call, - 0, - NppSqr<CV_16U, nppiSqr_16u_C1RSfs, nppiSqr_16u_C4RSfs>::call, - NppSqr<CV_16S, nppiSqr_16s_C1RSfs, nppiSqr_16s_C4RSfs>::call, - 0, - NppSqr<CV_32F, nppiSqr_32f_C1R, nppiSqr_32f_C4R>::call + sqrMat<unsigned char>, + sqrMat<signed char>, + sqrMat<unsigned short>, + sqrMat<short>, + sqrMat<int>, + sqrMat<float>, + sqrMat<double> }; - CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F); + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } 
dst.create(src.size(), src.type()); - funcs[src.depth()](src, dst, StreamAccessor::getStream(stream)); + funcs[depth](src, dst, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// // sqrt -namespace +namespace arithm { - template <int DEPTH> struct NppOneSourceFunc - { - typedef typename NppTypeTraits<DEPTH>::npp_t npp_t; - - typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); - }; - template <> struct NppOneSourceFunc<CV_32F> - { - typedef NppTypeTraits<CV_32F>::npp_t npp_t; - - typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - - template <int DEPTH, typename NppOneSourceFunc<DEPTH>::func_t func> struct NppOneSource - { - typedef typename NppOneSourceFunc<DEPTH>::npp_t npp_t; - - static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) - { - NppStreamHandler h(stream); - - NppiSize oSizeROI; - oSizeROI.width = src.cols * src.channels(); - oSizeROI.height = src.rows; - - nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template <typename NppOneSourceFunc<CV_32F>::func_t func> struct NppOneSource<CV_32F, func> - { - typedef NppOneSourceFunc<CV_32F>::npp_t npp_t; - - static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream) - { - NppStreamHandler h(stream); - - NppiSize oSizeROI; - oSizeROI.width = src.cols * src.channels(); - oSizeROI.height = src.rows; - - nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; + template <typename T> + void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); } void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, 
Stream& stream) { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); + using namespace arithm; + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { - NppOneSource<CV_8U, nppiSqrt_8u_C1RSfs>::call, - 0, - NppOneSource<CV_16U, nppiSqrt_16u_C1RSfs>::call, - NppOneSource<CV_16S, nppiSqrt_16s_C1RSfs>::call, - 0, - NppOneSource<CV_32F, nppiSqrt_32f_C1R>::call + sqrtMat<unsigned char>, + sqrtMat<signed char>, + sqrtMat<unsigned short>, + sqrtMat<short>, + sqrtMat<int>, + sqrtMat<float>, + sqrtMat<double> }; - CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F); + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } dst.create(src.size(), src.type()); - funcs[src.depth()](src, dst, StreamAccessor::getStream(stream)); + funcs[depth](src, dst, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// // log +namespace arithm +{ + template <typename T> + void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream) { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); + using namespace arithm; + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { - NppOneSource<CV_8U, nppiLn_8u_C1RSfs>::call, - 0, - NppOneSource<CV_16U, nppiLn_16u_C1RSfs>::call, - NppOneSource<CV_16S, nppiLn_16s_C1RSfs>::call, - 0, - NppOneSource<CV_32F, nppiLn_32f_C1R>::call + logMat<unsigned char>, + logMat<signed char>, + logMat<unsigned short>, + logMat<short>, + logMat<int>, + logMat<float>, + logMat<double> }; - CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U 
|| src.depth() == CV_16S || src.depth() == CV_32F); + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } dst.create(src.size(), src.type()); - funcs[src.depth()](src, dst, StreamAccessor::getStream(stream)); + funcs[depth](src, dst, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// // exp +namespace arithm +{ + template <typename T> + void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream) { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream); + using namespace arithm; + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { - NppOneSource<CV_8U, nppiExp_8u_C1RSfs>::call, - 0, - NppOneSource<CV_16U, nppiExp_16u_C1RSfs>::call, - NppOneSource<CV_16S, nppiExp_16s_C1RSfs>::call, - 0, - NppOneSource<CV_32F, nppiExp_32f_C1R>::call + expMat<unsigned char>, + expMat<signed char>, + expMat<unsigned short>, + expMat<short>, + expMat<int>, + expMat<float>, + expMat<double> }; - CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F); + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } dst.create(src.size(), src.type()); - funcs[src.depth()](src, dst, StreamAccessor::getStream(stream)); + funcs[depth](src, dst, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// -// Comparison of two matrixes +// compare -namespace cv { namespace gpu { namespace device 
+namespace arithm { - template <typename T> void compare_eq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template <typename T> void compare_ne(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template <typename T> void compare_lt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template <typename T> void compare_le(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} - template <typename T> void compare_eq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template <typename T> void compare_ne(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template <typename T> void compare_lt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template <typename T> void compare_le(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template <typename T> void compare_gt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); - template <typename T> void compare_ge(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); -}}} - -void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream) +void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); static const func_t 
funcs[7][4] = { - {compare_eq<unsigned char> , compare_ne<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> }, - {compare_eq<signed char> , compare_ne<signed char> , compare_lt<signed char> , compare_le<signed char> }, - {compare_eq<unsigned short>, compare_ne<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>}, - {compare_eq<short> , compare_ne<short> , compare_lt<short> , compare_le<short> }, - {compare_eq<int> , compare_ne<int> , compare_lt<int> , compare_le<int> }, - {compare_eq<float> , compare_ne<float> , compare_lt<float> , compare_le<float> }, - {compare_eq<double> , compare_ne<double> , compare_lt<double> , compare_le<double> } + {cmpMatEq<unsigned char> , cmpMatNe<unsigned char> , cmpMatLt<unsigned char> , cmpMatLe<unsigned char> }, + {cmpMatEq<signed char> , cmpMatNe<signed char> , cmpMatLt<signed char> , cmpMatLe<signed char> }, + {cmpMatEq<unsigned short>, cmpMatNe<unsigned short>, cmpMatLt<unsigned short>, cmpMatLe<unsigned short>}, + {cmpMatEq<short> , cmpMatNe<short> , cmpMatLt<short> , cmpMatLe<short> }, + {cmpMatEq<int> , cmpMatNe<int> , cmpMatLt<int> , cmpMatLe<int> }, + {cmpMatEq<float> , cmpMatNe<float> , cmpMatLt<float> , cmpMatLe<float> }, + {cmpMatEq<double> , cmpMatNe<double> , cmpMatLt<double> , cmpMatLe<double> } }; - CV_Assert(src1.depth() <= CV_64F); - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE); + const int depth = src1.depth(); + const int cn = src1.channels(); - if (src1.depth() == CV_64F) + CV_Assert( depth <= CV_64F ); + CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() ); + CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE ); + + if (depth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } + dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, cn)); + + 
cudaStream_t stream = StreamAccessor::getStream(s); + static const int codes[] = { 0, 2, 3, 2, 3, 1 @@ -1418,15 +1992,29 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c &src2, &src1, &src1, &src2, &src2, &src2 }; - dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, src1.channels())); + const int code = codes[cmpop]; + PtrStepSzb src1_(src1.rows, src1.cols * cn, psrc1[cmpop]->data, psrc1[cmpop]->step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); - funcs[src1.depth()][codes[cmpop]](psrc1[cmpop]->reshape(1), psrc2[cmpop]->reshape(1), dst.reshape(1), StreamAccessor::getStream(stream)); + const func_t func = funcs[depth][code]; + + func(src1_, src2_, dst_, stream); +} + +namespace arithm +{ + template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); } namespace { - template <typename T> - void castScalar(Scalar& sc) + template <typename T> void castScalar(Scalar& sc) { sc.val[0] = saturate_cast<T>(sc.val[0]); sc.val[1] = saturate_cast<T>(sc.val[1]); @@ -1437,18 +2025,18 @@ namespace void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stream& stream) { - using namespace cv::gpu::device; + using namespace arithm; typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], 
PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[7][6] = { - {compare_eq<unsigned char> , compare_gt<unsigned char> , compare_ge<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> , compare_ne<unsigned char> }, - {compare_eq<signed char> , compare_gt<signed char> , compare_ge<signed char> , compare_lt<signed char> , compare_le<signed char> , compare_ne<signed char> }, - {compare_eq<unsigned short>, compare_gt<unsigned short>, compare_ge<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>, compare_ne<unsigned short>}, - {compare_eq<short> , compare_gt<short> , compare_ge<short> , compare_lt<short> , compare_le<short> , compare_ne<short> }, - {compare_eq<int> , compare_gt<int> , compare_ge<int> , compare_lt<int> , compare_le<int> , compare_ne<int> }, - {compare_eq<float> , compare_gt<float> , compare_ge<float> , compare_lt<float> , compare_le<float> , compare_ne<float> }, - {compare_eq<double> , compare_gt<double> , compare_ge<double> , compare_lt<double> , compare_le<double> , compare_ne<double> } + {cmpScalarEq<unsigned char> , cmpScalarGt<unsigned char> , cmpScalarGe<unsigned char> , cmpScalarLt<unsigned char> , cmpScalarLe<unsigned char> , cmpScalarNe<unsigned char> }, + {cmpScalarEq<signed char> , cmpScalarGt<signed char> , cmpScalarGe<signed char> , cmpScalarLt<signed char> , cmpScalarLe<signed char> , cmpScalarNe<signed char> }, + {cmpScalarEq<unsigned short>, cmpScalarGt<unsigned short>, cmpScalarGe<unsigned short>, cmpScalarLt<unsigned short>, cmpScalarLe<unsigned short>, cmpScalarNe<unsigned short>}, + {cmpScalarEq<short> , cmpScalarGt<short> , cmpScalarGe<short> , cmpScalarLt<short> , cmpScalarLe<short> , cmpScalarNe<short> }, + {cmpScalarEq<int> , cmpScalarGt<int> , cmpScalarGe<int> , cmpScalarLt<int> , cmpScalarLe<int> , cmpScalarNe<int> }, + {cmpScalarEq<float> , cmpScalarGt<float> , cmpScalarGe<float> , cmpScalarLt<float> , cmpScalarLe<float> , cmpScalarNe<float> }, + {cmpScalarEq<double> , 
cmpScalarGt<double> , cmpScalarGe<double> , cmpScalarLt<double> , cmpScalarLe<double> , cmpScalarNe<double> } }; typedef void (*cast_func_t)(Scalar& sc); @@ -1457,235 +2045,266 @@ void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stre castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double> }; - CV_Assert(src.depth() <= CV_64F); - CV_Assert(src.channels() <= 4); - CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE); + const int depth = src.depth(); + const int cn = src.channels(); - if (src.depth() == CV_64F) + CV_Assert( depth <= CV_64F ); + CV_Assert( cn <= 4 ); + CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE ); + + if (depth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src.size(), CV_MAKE_TYPE(CV_8U, src.channels())); + dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn)); - cast_func[src.depth()](sc); + cast_func[depth](sc); - funcs[src.depth()][cmpop](src, src.channels(), sc.val, dst, StreamAccessor::getStream(stream)); + funcs[depth][cmpop](src, cn, sc.val, dst, StreamAccessor::getStream(stream)); } - ////////////////////////////////////////////////////////////////////////////// // Unary bitwise logical operations -namespace cv { namespace gpu { namespace device +namespace arithm { - void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream); - - template <typename T> - void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream); -}}} - -namespace -{ - void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, cudaStream_t stream) - { - dst.create(src.size(), src.type()); - - cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), 
dst.channels(), src, dst, stream); - } - - void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) - { - using namespace cv::gpu::device; - - typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - static func_t funcs[] = - { - bitwiseMaskNotCaller<unsigned char>, bitwiseMaskNotCaller<unsigned char>, - bitwiseMaskNotCaller<unsigned short>, bitwiseMaskNotCaller<unsigned short>, - bitwiseMaskNotCaller<unsigned int>, bitwiseMaskNotCaller<unsigned int>, - bitwiseMaskNotCaller<unsigned int> - }; - - CV_Assert(src.depth() <= CV_64F); - CV_Assert(mask.type() == CV_8U && mask.size() == src.size()); - - dst.create(src.size(), src.type()); - - const func_t func = funcs[src.depth()]; - - int cn = src.depth() != CV_64F ? src.channels() : src.channels() * (sizeof(double) / sizeof(unsigned int)); - - func(src.rows, src.cols, cn, src, mask, dst, stream); - } + template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); } -void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream) +void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& s) { - if (mask.empty()) - bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream)); + using namespace arithm; + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + + dst.create(src.size(), src.type()); + + cudaStream_t stream = StreamAccessor::getStream(s); + + const int bcols = src.cols * src.elemSize(); + + if ((bcols & 3) == 0) + { + const int vcols = bcols >> 2; + + bitMatNot<unsigned int>( + PtrStepSzb(src.rows, vcols, src.data, src.step), + PtrStepSzb(src.rows, vcols, dst.data, dst.step), + mask, stream); + } + else if ((bcols & 1) == 0) + { + const int vcols = bcols >> 1; + + bitMatNot<unsigned short>( + PtrStepSzb(src.rows, vcols, src.data, 
src.step), + PtrStepSzb(src.rows, vcols, dst.data, dst.step), + mask, stream); + } else - bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream)); + { + bitMatNot<unsigned char>( /* bcols is odd here and is passed as the column count, so each column is exactly one byte */ + PtrStepSzb(src.rows, bcols, src.data, src.step), + PtrStepSzb(src.rows, bcols, dst.data, dst.step), + mask, stream); + } } ////////////////////////////////////////////////////////////////////////////// // Binary bitwise logical operations -namespace cv { namespace gpu { namespace device +namespace arithm { - void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); + template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} - template <typename T> - void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream); - - void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); - - template <typename T> - void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream); - - void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); - - template <typename T> - void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream); -}}} - -namespace +void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream&
s) { - void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) + using namespace arithm; + + const int depth = src1.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) ); + + dst.create(src1.size(), src1.type()); + + cudaStream_t stream = StreamAccessor::getStream(s); + + const int bcols = src1.cols * src1.elemSize(); + + if ((bcols & 3) == 0) { - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + const int vcols = bcols >> 2; - dst.create(src1.size(), src1.type()); - - cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream); + bitMatAnd<unsigned int>( + PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); } - - void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) + else if ((bcols & 1) == 0) { - using namespace cv::gpu::device; + const int vcols = bcols >> 1; - typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - static func_t funcs[] = - { - bitwiseMaskOrCaller<unsigned char>, bitwiseMaskOrCaller<unsigned char>, - bitwiseMaskOrCaller<unsigned short>, bitwiseMaskOrCaller<unsigned short>, - bitwiseMaskOrCaller<unsigned int>, bitwiseMaskOrCaller<unsigned int>, - bitwiseMaskOrCaller<unsigned int> - }; - - CV_Assert(src1.depth() <= CV_64F); - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - CV_Assert(mask.type() == CV_8U && mask.size() == src1.size()); - - dst.create(src1.size(), src1.type()); - - const func_t func = funcs[src1.depth()]; - - int cn = dst.depth() != CV_64F ? 
dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int)); - - func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream); + bitMatAnd<unsigned short>( /* vcols = bcols >> 1 counts 2-byte words, so the element type must be 16 bits wide */ + PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); } - - - void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) + else { - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - dst.create(src1.size(), src1.type()); - - cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream); - } - - void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) - { - using namespace cv::gpu::device; - - typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - static func_t funcs[] = - { - bitwiseMaskAndCaller<unsigned char>, bitwiseMaskAndCaller<unsigned char>, - bitwiseMaskAndCaller<unsigned short>, bitwiseMaskAndCaller<unsigned short>, - bitwiseMaskAndCaller<unsigned int>, bitwiseMaskAndCaller<unsigned int>, - bitwiseMaskAndCaller<unsigned int> - }; - - CV_Assert(src1.depth() <= CV_64F); - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - CV_Assert(mask.type() == CV_8U && mask.size() == src1.size()); - - dst.create(src1.size(), src1.type()); - - const func_t func = funcs[src1.depth()]; - - int cn = dst.depth() != CV_64F ?
dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int)); - - func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream); - } - - - void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream) - { - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - - dst.create(src1.size(), src1.type()); - - cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream); - } - - void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) - { - using namespace cv::gpu::device; - - typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); - static func_t funcs[] = - { - bitwiseMaskXorCaller<unsigned char>, bitwiseMaskXorCaller<unsigned char>, - bitwiseMaskXorCaller<unsigned short>, bitwiseMaskXorCaller<unsigned short>, - bitwiseMaskXorCaller<unsigned int>, bitwiseMaskXorCaller<unsigned int>, - bitwiseMaskXorCaller<unsigned int> - }; - - CV_Assert(src1.depth() <= CV_64F); - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - CV_Assert(mask.type() == CV_8U && mask.size() == src1.size()); - - dst.create(src1.size(), src1.type()); - - const func_t func = funcs[src1.depth()]; - - int cn = dst.depth() != CV_64F ?
dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int)); - - func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream); + bitMatAnd<unsigned char>( /* bcols is odd here and is passed as the column count, so each column is exactly one byte */ + PtrStepSzb(src1.rows, bcols, src1.data, src1.step), + PtrStepSzb(src1.rows, bcols, src2.data, src2.step), + PtrStepSzb(src1.rows, bcols, dst.data, dst.step), + mask, stream); } } -void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) +void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s) { - if (mask.empty()) - bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream)); + using namespace arithm; + + const int depth = src1.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) ); + + dst.create(src1.size(), src1.type()); + + cudaStream_t stream = StreamAccessor::getStream(s); + + const int bcols = src1.cols * src1.elemSize(); + + if ((bcols & 3) == 0) + { + const int vcols = bcols >> 2; + + bitMatOr<unsigned int>( + PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); + } + else if ((bcols & 1) == 0) + { + const int vcols = bcols >> 1; + + bitMatOr<unsigned short>( /* vcols counts 2-byte words, so the element type must be 16 bits wide */ + PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); + } else - bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); + { + + bitMatOr<unsigned char>( /* odd byte count per row: one byte per column */ + PtrStepSzb(src1.rows, bcols, src1.data, src1.step), + PtrStepSzb(src1.rows, bcols, src2.data, src2.step), + PtrStepSzb(src1.rows, bcols, dst.data, dst.step), + mask, stream); + } } -void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const
GpuMat& mask, Stream& stream) +void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s) { - if (mask.empty()) - bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream)); + using namespace arithm; + + const int depth = src1.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) ); + + dst.create(src1.size(), src1.type()); + + cudaStream_t stream = StreamAccessor::getStream(s); + + const int bcols = src1.cols * src1.elemSize(); + + if ((bcols & 3) == 0) + { + const int vcols = bcols >> 2; + + bitMatXor<unsigned int>( + PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); + } + else if ((bcols & 1) == 0) + { + const int vcols = bcols >> 1; + + bitMatXor<unsigned short>( /* vcols counts 2-byte words, so the element type must be 16 bits wide */ + PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); + } else - bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); + { + + bitMatXor<unsigned char>( /* odd byte count per row: one byte per column */ + PtrStepSzb(src1.rows, bcols, src1.data, src1.step), + PtrStepSzb(src1.rows, bcols, src2.data, src2.step), + PtrStepSzb(src1.rows, bcols, dst.data, dst.step), + mask, stream); + } } -void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) +////////////////////////////////////////////////////////////////////////////// +// Binary bitwise logical operations with scalars + +namespace arithm { - if (mask.empty()) - bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream)); - else - bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); + template <typename T> void bitScalarAnd(PtrStepSzb src1, unsigned
int src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void bitScalarOr(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void bitScalarXor(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); } namespace { + typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); + + template <bit_scalar_func_t func> struct BitScalar + { + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + func(src, static_cast<unsigned int>(sc.val[0]), dst, stream); + } + }; + + template <bit_scalar_func_t func> struct BitScalar4 /* packs the four channel constants of sc into one 32-bit word and forwards it to func */ + { + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + Scalar_<unsigned int> isc = sc; + + unsigned int packedVal = 0; /* val[0] occupies bits 0-7, val[1] bits 8-15, val[2] bits 16-23, val[3] bits 24-31 */ + + packedVal |= (isc.val[0] & 0xff); /* keep one byte per channel so a constant cannot spill into the next channel's bits */ + packedVal |= (isc.val[1] & 0xff) << 8; + packedVal |= (isc.val[2] & 0xff) << 16; + packedVal |= (isc.val[3] & 0xff) << 24; + + func(src, packedVal, dst, stream); + } + }; + template <int DEPTH, int cn> struct NppBitwiseCFunc { typedef typename NppTypeTraits<DEPTH>::npp_t npp_t; @@ -1739,64 +2358,79 @@ namespace }; } -void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) -{ - typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); - static const func_t funcs[5][4] = - { - {NppBitwiseC<CV_8U , 1, nppiOrC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiOrC_8u_C4R >::call}, - {0,0,0,0}, - {NppBitwiseC<CV_16U, 1, nppiOrC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call}, - {0,0,0,0}, - {NppBitwiseC<CV_32S, 1, nppiOrC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call} - }; - - CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S); -
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); - - dst.create(src.size(), src.type()); - - funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); -} - void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) { + using namespace arithm; + typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); static const func_t funcs[5][4] = { - {NppBitwiseC<CV_8U , 1, nppiAndC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiAndC_8u_C4R >::call}, + {BitScalar< bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call}, {0,0,0,0}, - {NppBitwiseC<CV_16U, 1, nppiAndC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call}, + {BitScalar< bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call}, {0,0,0,0}, - {NppBitwiseC<CV_32S, 1, nppiAndC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call} + {BitScalar< bitScalarAnd<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call} }; - CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S); - CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + const int depth = src.depth(); + const int cn = src.channels(); + + CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); dst.create(src.size(), src.type()); - funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); + funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::bitwise_or(const GpuMat& src, const 
Scalar& sc, GpuMat& dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[5][4] = + { + {BitScalar< bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call}, + {0,0,0,0}, + {BitScalar< bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call}, + {0,0,0,0}, + {BitScalar< bitScalarOr<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call} + }; + + const int depth = src.depth(); + const int cn = src.channels(); + + CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); + + dst.create(src.size(), src.type()); + + funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); } void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) { + using namespace arithm; + typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); static const func_t funcs[5][4] = { - {NppBitwiseC<CV_8U , 1, nppiXorC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiXorC_8u_C4R >::call}, + {BitScalar< bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call}, {0,0,0,0}, - {NppBitwiseC<CV_16U, 1, nppiXorC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call}, + {BitScalar< bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call}, {0,0,0,0}, - {NppBitwiseC<CV_32S, 1, nppiXorC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, 
nppiXorC_32s_C4R>::call} + {BitScalar< bitScalarXor<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call} }; - CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S); - CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); + const int depth = src.depth(); + const int cn = src.channels(); + + CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); dst.create(src.size(), src.type()); - funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream)); + funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// @@ -1898,91 +2532,226 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st ////////////////////////////////////////////////////////////////////////////// // Minimum and maximum operations -namespace cv { namespace gpu { namespace device +namespace arithm { - template <typename T> void min_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template <typename T> void max_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template <typename T> void min_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream); - template <typename T> void max_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream); -}}} - -void 
cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) -{ - using namespace cv::gpu::device; - - typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - static const func_t funcs[] = - { - min_gpu<unsigned char>, - min_gpu<signed char>, - min_gpu<unsigned short>, - min_gpu<short>, - min_gpu<int>, - min_gpu<float>, - min_gpu<double> - }; - - CV_Assert(src1.depth() <= CV_64F); - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); - - if (src1.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - dst.create(src1.size(), src1.type()); - - funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream)); + template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); } -void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream) +void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) { - using namespace cv::gpu::device; + using namespace arithm; - typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { - max_gpu<unsigned char>, - max_gpu<signed char>, - max_gpu<unsigned short>, - max_gpu<short>, - max_gpu<int>, - max_gpu<float>, - max_gpu<double> + minMat<unsigned char>, + minMat<signed char>, + minMat<unsigned 
short>, + minMat<short>, + minMat<int>, + minMat<float>, + minMat<double> + }; + static const func_t vfuncs4[] = + { + vmin4<unsigned int>, + vmin4<int>, + 0, + 0 + }; + static const func_t vfuncs2[] = + { + 0, + 0, + vmin2<unsigned int>, + vmin2<int> }; - CV_Assert(src1.depth() <= CV_64F); - CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + const int depth = src1.depth(); + const int cn = src1.channels(); - if (src1.depth() == CV_64F) + CV_Assert( depth <= CV_64F ); + CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); + + if (depth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } dst.create(src1.size(), src1.type()); - funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream)); + cudaStream_t stream = StreamAccessor::getStream(s); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth < CV_32S) + { + const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data); + const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data); + const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned) + { + const func_t vfunc4 = vfuncs4[depth]; + const func_t vfunc2 = vfuncs2[depth]; + + if (vfunc4 != 0 && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + + if (vfunc2 != 0 && (src1_.cols & 
1) == 0) + { + const int vcols = src1_.cols >> 1; + + vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[depth]; + + if (!func) + CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, stream); +} + +void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + maxMat<unsigned char>, + maxMat<signed char>, + maxMat<unsigned short>, + maxMat<short>, + maxMat<int>, + maxMat<float>, + maxMat<double> + }; + static const func_t vfuncs4[] = + { + vmax4<unsigned int>, + vmax4<int>, + 0, + 0 + }; + static const func_t vfuncs2[] = + { + 0, + 0, + vmax2<unsigned int>, + vmax2<int> + }; + + const int depth = src1.depth(); + const int cn = src1.channels(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + dst.create(src1.size(), src1.type()); + + cudaStream_t stream = StreamAccessor::getStream(s); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth < CV_32S) + { + const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data); + const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data); + const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if 
(deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned) + { + const func_t vfunc4 = vfuncs4[depth]; + const func_t vfunc2 = vfuncs2[depth]; + + if (vfunc4 != 0 && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + + if (vfunc2 != 0 && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step), + PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step), + PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[depth]; + + if (!func) + CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, stream); } namespace { - template <typename T> void minScalar(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream) + template <typename T> double castScalar(double val) { - cv::gpu::device::min_gpu(src, saturate_cast<T>(val), dst, stream); - } - - template <typename T> void maxScalar(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream) - { - cv::gpu::device::max_gpu(src, saturate_cast<T>(val), dst, stream); + return saturate_cast<T>(val); } } void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream) { - typedef void (*func_t)(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { minScalar<unsigned char>, @@ -1994,23 +2763,33 @@ void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream) minScalar<double> }; - CV_Assert(src.depth() <= CV_64F); - CV_Assert(src.channels() == 1); - - if (src.depth() == CV_64F) + typedef double 
(*cast_func_t)(double sc); + static const cast_func_t cast_func[] = { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double> + }; + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } dst.create(src.size(), src.type()); - funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream)); + funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream)); } void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream) { - typedef void (*func_t)(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { maxScalar<unsigned char>, @@ -2022,45 +2801,47 @@ void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream) maxScalar<double> }; - CV_Assert(src.depth() <= CV_64F); - CV_Assert(src.channels() == 1); - - if (src.depth() == CV_64F) + typedef double (*cast_func_t)(double sc); + static const cast_func_t cast_func[] = { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double> + }; + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } dst.create(src.size(), src.type()); - funcs[src.depth()](src, val, dst, 
StreamAccessor::getStream(stream)); + funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// // threshold -namespace cv { namespace gpu { namespace device +namespace arithm { template <typename T> - void threshold_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, T thresh, T maxVal, int type, cudaStream_t stream); -}}} - -namespace -{ - template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream) - { - cv::gpu::device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream); - } + void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); } double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s) { - CV_Assert(src.channels() == 1 && src.depth() <= CV_64F); - CV_Assert(type <= THRESH_TOZERO_INV); + const int depth = src.depth(); - if (src.depth() == CV_64F) + CV_Assert( src.channels() == 1 && depth <= CV_64F ); + CV_Assert( type <= THRESH_TOZERO_INV ); + + if (depth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } @@ -2084,21 +2865,25 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double } else { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream); + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); static const func_t funcs[] = { - threshold_caller<unsigned char>, threshold_caller<signed char>, - threshold_caller<unsigned short>, threshold_caller<short>, - threshold_caller<int>, threshold_caller<float>, threshold_caller<double> + 
arithm::threshold<unsigned char>, + arithm::threshold<signed char>, + arithm::threshold<unsigned short>, + arithm::threshold<short>, + arithm::threshold<int>, + arithm::threshold<float>, + arithm::threshold<double> }; - if (src.depth() != CV_32F && src.depth() != CV_64F) + if (depth != CV_32F && depth != CV_64F) { thresh = cvFloor(thresh); maxVal = cvRound(maxVal); } - funcs[src.depth()](src, dst, thresh, maxVal, type, stream); + funcs[depth](src, dst, thresh, maxVal, type, stream); } return thresh; @@ -2107,34 +2892,42 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double //////////////////////////////////////////////////////////////////////// // pow -namespace cv { namespace gpu { namespace device +namespace arithm { - template<typename T> void pow_caller(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); -}}} + template<typename T> void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); +} void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream) { - using namespace cv::gpu::device; - typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[] = { - pow_caller<unsigned char>, pow_caller<signed char>, - pow_caller<unsigned short>, pow_caller<short>, - pow_caller<int>, pow_caller<float>, pow_caller<double> + arithm::pow<unsigned char>, + arithm::pow<signed char>, + arithm::pow<unsigned short>, + arithm::pow<short>, + arithm::pow<int>, + arithm::pow<float>, + arithm::pow<double> }; - CV_Assert(src.depth() <= CV_64F); + const int depth = src.depth(); + const int cn = src.channels(); - if (src.depth() == CV_64F) + CV_Assert(depth <= CV_64F); + + if (depth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } dst.create(src.size(), src.type()); - 
funcs[src.depth()](src.reshape(1), power, dst.reshape(1), StreamAccessor::getStream(stream)); + PtrStepSzb src_(src.rows, src.cols * cn, src.data, src.step); + PtrStepSzb dst_(src.rows, src.cols * cn, dst.data, dst.step); + + funcs[depth](src_, power, dst_, StreamAccessor::getStream(stream)); } //////////////////////////////////////////////////////////////////////// @@ -2200,8 +2993,8 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call }; - CV_Assert(img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4); - CV_Assert(img1.size() == img2.size() && img1.type() == img2.type()); + CV_Assert( img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4 ); + CV_Assert( img1.size() == img2.size() && img1.type() == img2.type() ); dst.create(img1.size(), img1.type()); @@ -2213,507 +3006,508 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int //////////////////////////////////////////////////////////////////////// // addWeighted -namespace cv { namespace gpu { namespace device +namespace arithm { template <typename T1, typename T2, typename D> - void addWeighted_gpu(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream); -}}} + void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); +} -void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream) +void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int ddepth, Stream& stream) { - using namespace cv::gpu::device; - - typedef void (*func_t)(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double 
gamma, const PtrStepSzb& dst, cudaStream_t stream); - + typedef void (*func_t)(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); static const func_t funcs[7][7][7] = { { { - addWeighted_gpu<unsigned char, unsigned char, unsigned char >, - addWeighted_gpu<unsigned char, unsigned char, signed char >, - addWeighted_gpu<unsigned char, unsigned char, unsigned short>, - addWeighted_gpu<unsigned char, unsigned char, short >, - addWeighted_gpu<unsigned char, unsigned char, int >, - addWeighted_gpu<unsigned char, unsigned char, float >, - addWeighted_gpu<unsigned char, unsigned char, double> + arithm::addWeighted<unsigned char, unsigned char, unsigned char >, + arithm::addWeighted<unsigned char, unsigned char, signed char >, + arithm::addWeighted<unsigned char, unsigned char, unsigned short>, + arithm::addWeighted<unsigned char, unsigned char, short >, + arithm::addWeighted<unsigned char, unsigned char, int >, + arithm::addWeighted<unsigned char, unsigned char, float >, + arithm::addWeighted<unsigned char, unsigned char, double> }, { - addWeighted_gpu<unsigned char, signed char, unsigned char >, - addWeighted_gpu<unsigned char, signed char, signed char >, - addWeighted_gpu<unsigned char, signed char, unsigned short>, - addWeighted_gpu<unsigned char, signed char, short >, - addWeighted_gpu<unsigned char, signed char, int >, - addWeighted_gpu<unsigned char, signed char, float >, - addWeighted_gpu<unsigned char, signed char, double> + arithm::addWeighted<unsigned char, signed char, unsigned char >, + arithm::addWeighted<unsigned char, signed char, signed char >, + arithm::addWeighted<unsigned char, signed char, unsigned short>, + arithm::addWeighted<unsigned char, signed char, short >, + arithm::addWeighted<unsigned char, signed char, int >, + arithm::addWeighted<unsigned char, signed char, float >, + arithm::addWeighted<unsigned char, signed char, double> }, { - addWeighted_gpu<unsigned char, unsigned short, 
unsigned char >, - addWeighted_gpu<unsigned char, unsigned short, signed char >, - addWeighted_gpu<unsigned char, unsigned short, unsigned short>, - addWeighted_gpu<unsigned char, unsigned short, short >, - addWeighted_gpu<unsigned char, unsigned short, int >, - addWeighted_gpu<unsigned char, unsigned short, float >, - addWeighted_gpu<unsigned char, unsigned short, double> + arithm::addWeighted<unsigned char, unsigned short, unsigned char >, + arithm::addWeighted<unsigned char, unsigned short, signed char >, + arithm::addWeighted<unsigned char, unsigned short, unsigned short>, + arithm::addWeighted<unsigned char, unsigned short, short >, + arithm::addWeighted<unsigned char, unsigned short, int >, + arithm::addWeighted<unsigned char, unsigned short, float >, + arithm::addWeighted<unsigned char, unsigned short, double> }, { - addWeighted_gpu<unsigned char, short, unsigned char >, - addWeighted_gpu<unsigned char, short, signed char >, - addWeighted_gpu<unsigned char, short, unsigned short>, - addWeighted_gpu<unsigned char, short, short >, - addWeighted_gpu<unsigned char, short, int >, - addWeighted_gpu<unsigned char, short, float >, - addWeighted_gpu<unsigned char, short, double> + arithm::addWeighted<unsigned char, short, unsigned char >, + arithm::addWeighted<unsigned char, short, signed char >, + arithm::addWeighted<unsigned char, short, unsigned short>, + arithm::addWeighted<unsigned char, short, short >, + arithm::addWeighted<unsigned char, short, int >, + arithm::addWeighted<unsigned char, short, float >, + arithm::addWeighted<unsigned char, short, double> }, { - addWeighted_gpu<unsigned char, int, unsigned char >, - addWeighted_gpu<unsigned char, int, signed char >, - addWeighted_gpu<unsigned char, int, unsigned short>, - addWeighted_gpu<unsigned char, int, short >, - addWeighted_gpu<unsigned char, int, int >, - addWeighted_gpu<unsigned char, int, float >, - addWeighted_gpu<unsigned char, int, double> + arithm::addWeighted<unsigned char, int, unsigned char >, + 
arithm::addWeighted<unsigned char, int, signed char >, + arithm::addWeighted<unsigned char, int, unsigned short>, + arithm::addWeighted<unsigned char, int, short >, + arithm::addWeighted<unsigned char, int, int >, + arithm::addWeighted<unsigned char, int, float >, + arithm::addWeighted<unsigned char, int, double> }, { - addWeighted_gpu<unsigned char, float, unsigned char >, - addWeighted_gpu<unsigned char, float, signed char >, - addWeighted_gpu<unsigned char, float, unsigned short>, - addWeighted_gpu<unsigned char, float, short >, - addWeighted_gpu<unsigned char, float, int >, - addWeighted_gpu<unsigned char, float, float >, - addWeighted_gpu<unsigned char, float, double> + arithm::addWeighted<unsigned char, float, unsigned char >, + arithm::addWeighted<unsigned char, float, signed char >, + arithm::addWeighted<unsigned char, float, unsigned short>, + arithm::addWeighted<unsigned char, float, short >, + arithm::addWeighted<unsigned char, float, int >, + arithm::addWeighted<unsigned char, float, float >, + arithm::addWeighted<unsigned char, float, double> }, { - addWeighted_gpu<unsigned char, double, unsigned char >, - addWeighted_gpu<unsigned char, double, signed char >, - addWeighted_gpu<unsigned char, double, unsigned short>, - addWeighted_gpu<unsigned char, double, short >, - addWeighted_gpu<unsigned char, double, int >, - addWeighted_gpu<unsigned char, double, float >, - addWeighted_gpu<unsigned char, double, double> + arithm::addWeighted<unsigned char, double, unsigned char >, + arithm::addWeighted<unsigned char, double, signed char >, + arithm::addWeighted<unsigned char, double, unsigned short>, + arithm::addWeighted<unsigned char, double, short >, + arithm::addWeighted<unsigned char, double, int >, + arithm::addWeighted<unsigned char, double, float >, + arithm::addWeighted<unsigned char, double, double> } }, { { - 0/*addWeighted_gpu<signed char, unsigned char, unsigned char >*/, - 0/*addWeighted_gpu<signed char, unsigned char, signed char >*/, - 
0/*addWeighted_gpu<signed char, unsigned char, unsigned short>*/, - 0/*addWeighted_gpu<signed char, unsigned char, short >*/, - 0/*addWeighted_gpu<signed char, unsigned char, int >*/, - 0/*addWeighted_gpu<signed char, unsigned char, float >*/, - 0/*addWeighted_gpu<signed char, unsigned char, double>*/ + 0/*arithm::addWeighted<signed char, unsigned char, unsigned char >*/, + 0/*arithm::addWeighted<signed char, unsigned char, signed char >*/, + 0/*arithm::addWeighted<signed char, unsigned char, unsigned short>*/, + 0/*arithm::addWeighted<signed char, unsigned char, short >*/, + 0/*arithm::addWeighted<signed char, unsigned char, int >*/, + 0/*arithm::addWeighted<signed char, unsigned char, float >*/, + 0/*arithm::addWeighted<signed char, unsigned char, double>*/ }, { - addWeighted_gpu<signed char, signed char, unsigned char >, - addWeighted_gpu<signed char, signed char, signed char >, - addWeighted_gpu<signed char, signed char, unsigned short>, - addWeighted_gpu<signed char, signed char, short >, - addWeighted_gpu<signed char, signed char, int >, - addWeighted_gpu<signed char, signed char, float >, - addWeighted_gpu<signed char, signed char, double> + arithm::addWeighted<signed char, signed char, unsigned char >, + arithm::addWeighted<signed char, signed char, signed char >, + arithm::addWeighted<signed char, signed char, unsigned short>, + arithm::addWeighted<signed char, signed char, short >, + arithm::addWeighted<signed char, signed char, int >, + arithm::addWeighted<signed char, signed char, float >, + arithm::addWeighted<signed char, signed char, double> }, { - addWeighted_gpu<signed char, unsigned short, unsigned char >, - addWeighted_gpu<signed char, unsigned short, signed char >, - addWeighted_gpu<signed char, unsigned short, unsigned short>, - addWeighted_gpu<signed char, unsigned short, short >, - addWeighted_gpu<signed char, unsigned short, int >, - addWeighted_gpu<signed char, unsigned short, float >, - addWeighted_gpu<signed char, unsigned short, double> 
+ arithm::addWeighted<signed char, unsigned short, unsigned char >, + arithm::addWeighted<signed char, unsigned short, signed char >, + arithm::addWeighted<signed char, unsigned short, unsigned short>, + arithm::addWeighted<signed char, unsigned short, short >, + arithm::addWeighted<signed char, unsigned short, int >, + arithm::addWeighted<signed char, unsigned short, float >, + arithm::addWeighted<signed char, unsigned short, double> }, { - addWeighted_gpu<signed char, short, unsigned char >, - addWeighted_gpu<signed char, short, signed char >, - addWeighted_gpu<signed char, short, unsigned short>, - addWeighted_gpu<signed char, short, short >, - addWeighted_gpu<signed char, short, int >, - addWeighted_gpu<signed char, short, float >, - addWeighted_gpu<signed char, short, double> + arithm::addWeighted<signed char, short, unsigned char >, + arithm::addWeighted<signed char, short, signed char >, + arithm::addWeighted<signed char, short, unsigned short>, + arithm::addWeighted<signed char, short, short >, + arithm::addWeighted<signed char, short, int >, + arithm::addWeighted<signed char, short, float >, + arithm::addWeighted<signed char, short, double> }, { - addWeighted_gpu<signed char, int, unsigned char >, - addWeighted_gpu<signed char, int, signed char >, - addWeighted_gpu<signed char, int, unsigned short>, - addWeighted_gpu<signed char, int, short >, - addWeighted_gpu<signed char, int, int >, - addWeighted_gpu<signed char, int, float >, - addWeighted_gpu<signed char, int, double> + arithm::addWeighted<signed char, int, unsigned char >, + arithm::addWeighted<signed char, int, signed char >, + arithm::addWeighted<signed char, int, unsigned short>, + arithm::addWeighted<signed char, int, short >, + arithm::addWeighted<signed char, int, int >, + arithm::addWeighted<signed char, int, float >, + arithm::addWeighted<signed char, int, double> }, { - addWeighted_gpu<signed char, float, unsigned char >, - addWeighted_gpu<signed char, float, signed char >, - 
addWeighted_gpu<signed char, float, unsigned short>, - addWeighted_gpu<signed char, float, short >, - addWeighted_gpu<signed char, float, int >, - addWeighted_gpu<signed char, float, float >, - addWeighted_gpu<signed char, float, double> + arithm::addWeighted<signed char, float, unsigned char >, + arithm::addWeighted<signed char, float, signed char >, + arithm::addWeighted<signed char, float, unsigned short>, + arithm::addWeighted<signed char, float, short >, + arithm::addWeighted<signed char, float, int >, + arithm::addWeighted<signed char, float, float >, + arithm::addWeighted<signed char, float, double> }, { - addWeighted_gpu<signed char, double, unsigned char >, - addWeighted_gpu<signed char, double, signed char >, - addWeighted_gpu<signed char, double, unsigned short>, - addWeighted_gpu<signed char, double, short >, - addWeighted_gpu<signed char, double, int >, - addWeighted_gpu<signed char, double, float >, - addWeighted_gpu<signed char, double, double> + arithm::addWeighted<signed char, double, unsigned char >, + arithm::addWeighted<signed char, double, signed char >, + arithm::addWeighted<signed char, double, unsigned short>, + arithm::addWeighted<signed char, double, short >, + arithm::addWeighted<signed char, double, int >, + arithm::addWeighted<signed char, double, float >, + arithm::addWeighted<signed char, double, double> } }, { { - 0/*addWeighted_gpu<unsigned short, unsigned char, unsigned char >*/, - 0/*addWeighted_gpu<unsigned short, unsigned char, signed char >*/, - 0/*addWeighted_gpu<unsigned short, unsigned char, unsigned short>*/, - 0/*addWeighted_gpu<unsigned short, unsigned char, short >*/, - 0/*addWeighted_gpu<unsigned short, unsigned char, int >*/, - 0/*addWeighted_gpu<unsigned short, unsigned char, float >*/, - 0/*addWeighted_gpu<unsigned short, unsigned char, double>*/ + 0/*arithm::addWeighted<unsigned short, unsigned char, unsigned char >*/, + 0/*arithm::addWeighted<unsigned short, unsigned char, signed char >*/, + 
0/*arithm::addWeighted<unsigned short, unsigned char, unsigned short>*/, + 0/*arithm::addWeighted<unsigned short, unsigned char, short >*/, + 0/*arithm::addWeighted<unsigned short, unsigned char, int >*/, + 0/*arithm::addWeighted<unsigned short, unsigned char, float >*/, + 0/*arithm::addWeighted<unsigned short, unsigned char, double>*/ }, { - 0/*addWeighted_gpu<unsigned short, signed char, unsigned char >*/, - 0/*addWeighted_gpu<unsigned short, signed char, signed char >*/, - 0/*addWeighted_gpu<unsigned short, signed char, unsigned short>*/, - 0/*addWeighted_gpu<unsigned short, signed char, short >*/, - 0/*addWeighted_gpu<unsigned short, signed char, int >*/, - 0/*addWeighted_gpu<unsigned short, signed char, float >*/, - 0/*addWeighted_gpu<unsigned short, signed char, double>*/ + 0/*arithm::addWeighted<unsigned short, signed char, unsigned char >*/, + 0/*arithm::addWeighted<unsigned short, signed char, signed char >*/, + 0/*arithm::addWeighted<unsigned short, signed char, unsigned short>*/, + 0/*arithm::addWeighted<unsigned short, signed char, short >*/, + 0/*arithm::addWeighted<unsigned short, signed char, int >*/, + 0/*arithm::addWeighted<unsigned short, signed char, float >*/, + 0/*arithm::addWeighted<unsigned short, signed char, double>*/ }, { - addWeighted_gpu<unsigned short, unsigned short, unsigned char >, - addWeighted_gpu<unsigned short, unsigned short, signed char >, - addWeighted_gpu<unsigned short, unsigned short, unsigned short>, - addWeighted_gpu<unsigned short, unsigned short, short >, - addWeighted_gpu<unsigned short, unsigned short, int >, - addWeighted_gpu<unsigned short, unsigned short, float >, - addWeighted_gpu<unsigned short, unsigned short, double> + arithm::addWeighted<unsigned short, unsigned short, unsigned char >, + arithm::addWeighted<unsigned short, unsigned short, signed char >, + arithm::addWeighted<unsigned short, unsigned short, unsigned short>, + arithm::addWeighted<unsigned short, unsigned short, short >, + 
arithm::addWeighted<unsigned short, unsigned short, int >, + arithm::addWeighted<unsigned short, unsigned short, float >, + arithm::addWeighted<unsigned short, unsigned short, double> }, { - addWeighted_gpu<unsigned short, short, unsigned char >, - addWeighted_gpu<unsigned short, short, signed char >, - addWeighted_gpu<unsigned short, short, unsigned short>, - addWeighted_gpu<unsigned short, short, short >, - addWeighted_gpu<unsigned short, short, int >, - addWeighted_gpu<unsigned short, short, float >, - addWeighted_gpu<unsigned short, short, double> + arithm::addWeighted<unsigned short, short, unsigned char >, + arithm::addWeighted<unsigned short, short, signed char >, + arithm::addWeighted<unsigned short, short, unsigned short>, + arithm::addWeighted<unsigned short, short, short >, + arithm::addWeighted<unsigned short, short, int >, + arithm::addWeighted<unsigned short, short, float >, + arithm::addWeighted<unsigned short, short, double> }, { - addWeighted_gpu<unsigned short, int, unsigned char >, - addWeighted_gpu<unsigned short, int, signed char >, - addWeighted_gpu<unsigned short, int, unsigned short>, - addWeighted_gpu<unsigned short, int, short >, - addWeighted_gpu<unsigned short, int, int >, - addWeighted_gpu<unsigned short, int, float >, - addWeighted_gpu<unsigned short, int, double> + arithm::addWeighted<unsigned short, int, unsigned char >, + arithm::addWeighted<unsigned short, int, signed char >, + arithm::addWeighted<unsigned short, int, unsigned short>, + arithm::addWeighted<unsigned short, int, short >, + arithm::addWeighted<unsigned short, int, int >, + arithm::addWeighted<unsigned short, int, float >, + arithm::addWeighted<unsigned short, int, double> }, { - addWeighted_gpu<unsigned short, float, unsigned char >, - addWeighted_gpu<unsigned short, float, signed char >, - addWeighted_gpu<unsigned short, float, unsigned short>, - addWeighted_gpu<unsigned short, float, short >, - addWeighted_gpu<unsigned short, float, int >, - addWeighted_gpu<unsigned 
short, float, float >, - addWeighted_gpu<unsigned short, float, double> + arithm::addWeighted<unsigned short, float, unsigned char >, + arithm::addWeighted<unsigned short, float, signed char >, + arithm::addWeighted<unsigned short, float, unsigned short>, + arithm::addWeighted<unsigned short, float, short >, + arithm::addWeighted<unsigned short, float, int >, + arithm::addWeighted<unsigned short, float, float >, + arithm::addWeighted<unsigned short, float, double> }, { - addWeighted_gpu<unsigned short, double, unsigned char >, - addWeighted_gpu<unsigned short, double, signed char >, - addWeighted_gpu<unsigned short, double, unsigned short>, - addWeighted_gpu<unsigned short, double, short >, - addWeighted_gpu<unsigned short, double, int >, - addWeighted_gpu<unsigned short, double, float >, - addWeighted_gpu<unsigned short, double, double> + arithm::addWeighted<unsigned short, double, unsigned char >, + arithm::addWeighted<unsigned short, double, signed char >, + arithm::addWeighted<unsigned short, double, unsigned short>, + arithm::addWeighted<unsigned short, double, short >, + arithm::addWeighted<unsigned short, double, int >, + arithm::addWeighted<unsigned short, double, float >, + arithm::addWeighted<unsigned short, double, double> } }, { { - 0/*addWeighted_gpu<short, unsigned char, unsigned char >*/, - 0/*addWeighted_gpu<short, unsigned char, signed char >*/, - 0/*addWeighted_gpu<short, unsigned char, unsigned short>*/, - 0/*addWeighted_gpu<short, unsigned char, short >*/, - 0/*addWeighted_gpu<short, unsigned char, int >*/, - 0/*addWeighted_gpu<short, unsigned char, float >*/, - 0/*addWeighted_gpu<short, unsigned char, double>*/ + 0/*arithm::addWeighted<short, unsigned char, unsigned char >*/, + 0/*arithm::addWeighted<short, unsigned char, signed char >*/, + 0/*arithm::addWeighted<short, unsigned char, unsigned short>*/, + 0/*arithm::addWeighted<short, unsigned char, short >*/, + 0/*arithm::addWeighted<short, unsigned char, int >*/, + 
0/*arithm::addWeighted<short, unsigned char, float >*/, + 0/*arithm::addWeighted<short, unsigned char, double>*/ }, { - 0/*addWeighted_gpu<short, signed char, unsigned char >*/, - 0/*addWeighted_gpu<short, signed char, signed char >*/, - 0/*addWeighted_gpu<short, signed char, unsigned short>*/, - 0/*addWeighted_gpu<short, signed char, short >*/, - 0/*addWeighted_gpu<short, signed char, int >*/, - 0/*addWeighted_gpu<short, signed char, float >*/, - 0/*addWeighted_gpu<short, signed char, double>*/ + 0/*arithm::addWeighted<short, signed char, unsigned char >*/, + 0/*arithm::addWeighted<short, signed char, signed char >*/, + 0/*arithm::addWeighted<short, signed char, unsigned short>*/, + 0/*arithm::addWeighted<short, signed char, short >*/, + 0/*arithm::addWeighted<short, signed char, int >*/, + 0/*arithm::addWeighted<short, signed char, float >*/, + 0/*arithm::addWeighted<short, signed char, double>*/ }, { - 0/*addWeighted_gpu<short, unsigned short, unsigned char >*/, - 0/*addWeighted_gpu<short, unsigned short, signed char >*/, - 0/*addWeighted_gpu<short, unsigned short, unsigned short>*/, - 0/*addWeighted_gpu<short, unsigned short, short >*/, - 0/*addWeighted_gpu<short, unsigned short, int >*/, - 0/*addWeighted_gpu<short, unsigned short, float >*/, - 0/*addWeighted_gpu<short, unsigned short, double>*/ + 0/*arithm::addWeighted<short, unsigned short, unsigned char >*/, + 0/*arithm::addWeighted<short, unsigned short, signed char >*/, + 0/*arithm::addWeighted<short, unsigned short, unsigned short>*/, + 0/*arithm::addWeighted<short, unsigned short, short >*/, + 0/*arithm::addWeighted<short, unsigned short, int >*/, + 0/*arithm::addWeighted<short, unsigned short, float >*/, + 0/*arithm::addWeighted<short, unsigned short, double>*/ }, { - addWeighted_gpu<short, short, unsigned char >, - addWeighted_gpu<short, short, signed char >, - addWeighted_gpu<short, short, unsigned short>, - addWeighted_gpu<short, short, short >, - addWeighted_gpu<short, short, int >, - 
addWeighted_gpu<short, short, float >, - addWeighted_gpu<short, short, double> + arithm::addWeighted<short, short, unsigned char >, + arithm::addWeighted<short, short, signed char >, + arithm::addWeighted<short, short, unsigned short>, + arithm::addWeighted<short, short, short >, + arithm::addWeighted<short, short, int >, + arithm::addWeighted<short, short, float >, + arithm::addWeighted<short, short, double> }, { - addWeighted_gpu<short, int, unsigned char >, - addWeighted_gpu<short, int, signed char >, - addWeighted_gpu<short, int, unsigned short>, - addWeighted_gpu<short, int, short >, - addWeighted_gpu<short, int, int >, - addWeighted_gpu<short, int, float >, - addWeighted_gpu<short, int, double> + arithm::addWeighted<short, int, unsigned char >, + arithm::addWeighted<short, int, signed char >, + arithm::addWeighted<short, int, unsigned short>, + arithm::addWeighted<short, int, short >, + arithm::addWeighted<short, int, int >, + arithm::addWeighted<short, int, float >, + arithm::addWeighted<short, int, double> }, { - addWeighted_gpu<short, float, unsigned char >, - addWeighted_gpu<short, float, signed char >, - addWeighted_gpu<short, float, unsigned short>, - addWeighted_gpu<short, float, short >, - addWeighted_gpu<short, float, int >, - addWeighted_gpu<short, float, float >, - addWeighted_gpu<short, float, double> + arithm::addWeighted<short, float, unsigned char >, + arithm::addWeighted<short, float, signed char >, + arithm::addWeighted<short, float, unsigned short>, + arithm::addWeighted<short, float, short >, + arithm::addWeighted<short, float, int >, + arithm::addWeighted<short, float, float >, + arithm::addWeighted<short, float, double> }, { - addWeighted_gpu<short, double, unsigned char >, - addWeighted_gpu<short, double, signed char >, - addWeighted_gpu<short, double, unsigned short>, - addWeighted_gpu<short, double, short >, - addWeighted_gpu<short, double, int >, - addWeighted_gpu<short, double, float >, - addWeighted_gpu<short, double, double> + 
arithm::addWeighted<short, double, unsigned char >, + arithm::addWeighted<short, double, signed char >, + arithm::addWeighted<short, double, unsigned short>, + arithm::addWeighted<short, double, short >, + arithm::addWeighted<short, double, int >, + arithm::addWeighted<short, double, float >, + arithm::addWeighted<short, double, double> } }, { { - 0/*addWeighted_gpu<int, unsigned char, unsigned char >*/, - 0/*addWeighted_gpu<int, unsigned char, signed char >*/, - 0/*addWeighted_gpu<int, unsigned char, unsigned short>*/, - 0/*addWeighted_gpu<int, unsigned char, short >*/, - 0/*addWeighted_gpu<int, unsigned char, int >*/, - 0/*addWeighted_gpu<int, unsigned char, float >*/, - 0/*addWeighted_gpu<int, unsigned char, double>*/ + 0/*arithm::addWeighted<int, unsigned char, unsigned char >*/, + 0/*arithm::addWeighted<int, unsigned char, signed char >*/, + 0/*arithm::addWeighted<int, unsigned char, unsigned short>*/, + 0/*arithm::addWeighted<int, unsigned char, short >*/, + 0/*arithm::addWeighted<int, unsigned char, int >*/, + 0/*arithm::addWeighted<int, unsigned char, float >*/, + 0/*arithm::addWeighted<int, unsigned char, double>*/ }, { - 0/*addWeighted_gpu<int, signed char, unsigned char >*/, - 0/*addWeighted_gpu<int, signed char, signed char >*/, - 0/*addWeighted_gpu<int, signed char, unsigned short>*/, - 0/*addWeighted_gpu<int, signed char, short >*/, - 0/*addWeighted_gpu<int, signed char, int >*/, - 0/*addWeighted_gpu<int, signed char, float >*/, - 0/*addWeighted_gpu<int, signed char, double>*/ + 0/*arithm::addWeighted<int, signed char, unsigned char >*/, + 0/*arithm::addWeighted<int, signed char, signed char >*/, + 0/*arithm::addWeighted<int, signed char, unsigned short>*/, + 0/*arithm::addWeighted<int, signed char, short >*/, + 0/*arithm::addWeighted<int, signed char, int >*/, + 0/*arithm::addWeighted<int, signed char, float >*/, + 0/*arithm::addWeighted<int, signed char, double>*/ }, { - 0/*addWeighted_gpu<int, unsigned short, unsigned char >*/, - 
0/*addWeighted_gpu<int, unsigned short, signed char >*/, - 0/*addWeighted_gpu<int, unsigned short, unsigned short>*/, - 0/*addWeighted_gpu<int, unsigned short, short >*/, - 0/*addWeighted_gpu<int, unsigned short, int >*/, - 0/*addWeighted_gpu<int, unsigned short, float >*/, - 0/*addWeighted_gpu<int, unsigned short, double>*/ + 0/*arithm::addWeighted<int, unsigned short, unsigned char >*/, + 0/*arithm::addWeighted<int, unsigned short, signed char >*/, + 0/*arithm::addWeighted<int, unsigned short, unsigned short>*/, + 0/*arithm::addWeighted<int, unsigned short, short >*/, + 0/*arithm::addWeighted<int, unsigned short, int >*/, + 0/*arithm::addWeighted<int, unsigned short, float >*/, + 0/*arithm::addWeighted<int, unsigned short, double>*/ }, { - 0/*addWeighted_gpu<int, short, unsigned char >*/, - 0/*addWeighted_gpu<int, short, signed char >*/, - 0/*addWeighted_gpu<int, short, unsigned short>*/, - 0/*addWeighted_gpu<int, short, short >*/, - 0/*addWeighted_gpu<int, short, int >*/, - 0/*addWeighted_gpu<int, short, float >*/, - 0/*addWeighted_gpu<int, short, double>*/ + 0/*arithm::addWeighted<int, short, unsigned char >*/, + 0/*arithm::addWeighted<int, short, signed char >*/, + 0/*arithm::addWeighted<int, short, unsigned short>*/, + 0/*arithm::addWeighted<int, short, short >*/, + 0/*arithm::addWeighted<int, short, int >*/, + 0/*arithm::addWeighted<int, short, float >*/, + 0/*arithm::addWeighted<int, short, double>*/ }, { - addWeighted_gpu<int, int, unsigned char >, - addWeighted_gpu<int, int, signed char >, - addWeighted_gpu<int, int, unsigned short>, - addWeighted_gpu<int, int, short >, - addWeighted_gpu<int, int, int >, - addWeighted_gpu<int, int, float >, - addWeighted_gpu<int, int, double> + arithm::addWeighted<int, int, unsigned char >, + arithm::addWeighted<int, int, signed char >, + arithm::addWeighted<int, int, unsigned short>, + arithm::addWeighted<int, int, short >, + arithm::addWeighted<int, int, int >, + arithm::addWeighted<int, int, float >, + 
arithm::addWeighted<int, int, double> }, { - addWeighted_gpu<int, float, unsigned char >, - addWeighted_gpu<int, float, signed char >, - addWeighted_gpu<int, float, unsigned short>, - addWeighted_gpu<int, float, short >, - addWeighted_gpu<int, float, int >, - addWeighted_gpu<int, float, float >, - addWeighted_gpu<int, float, double> + arithm::addWeighted<int, float, unsigned char >, + arithm::addWeighted<int, float, signed char >, + arithm::addWeighted<int, float, unsigned short>, + arithm::addWeighted<int, float, short >, + arithm::addWeighted<int, float, int >, + arithm::addWeighted<int, float, float >, + arithm::addWeighted<int, float, double> }, { - addWeighted_gpu<int, double, unsigned char >, - addWeighted_gpu<int, double, signed char >, - addWeighted_gpu<int, double, unsigned short>, - addWeighted_gpu<int, double, short >, - addWeighted_gpu<int, double, int >, - addWeighted_gpu<int, double, float >, - addWeighted_gpu<int, double, double> + arithm::addWeighted<int, double, unsigned char >, + arithm::addWeighted<int, double, signed char >, + arithm::addWeighted<int, double, unsigned short>, + arithm::addWeighted<int, double, short >, + arithm::addWeighted<int, double, int >, + arithm::addWeighted<int, double, float >, + arithm::addWeighted<int, double, double> } }, { { - 0/*addWeighted_gpu<float, unsigned char, unsigned char >*/, - 0/*addWeighted_gpu<float, unsigned char, signed char >*/, - 0/*addWeighted_gpu<float, unsigned char, unsigned short>*/, - 0/*addWeighted_gpu<float, unsigned char, short >*/, - 0/*addWeighted_gpu<float, unsigned char, int >*/, - 0/*addWeighted_gpu<float, unsigned char, float >*/, - 0/*addWeighted_gpu<float, unsigned char, double>*/ + 0/*arithm::addWeighted<float, unsigned char, unsigned char >*/, + 0/*arithm::addWeighted<float, unsigned char, signed char >*/, + 0/*arithm::addWeighted<float, unsigned char, unsigned short>*/, + 0/*arithm::addWeighted<float, unsigned char, short >*/, + 0/*arithm::addWeighted<float, unsigned char, int 
>*/, + 0/*arithm::addWeighted<float, unsigned char, float >*/, + 0/*arithm::addWeighted<float, unsigned char, double>*/ }, { - 0/*addWeighted_gpu<float, signed char, unsigned char >*/, - 0/*addWeighted_gpu<float, signed char, signed char >*/, - 0/*addWeighted_gpu<float, signed char, unsigned short>*/, - 0/*addWeighted_gpu<float, signed char, short >*/, - 0/*addWeighted_gpu<float, signed char, int >*/, - 0/*addWeighted_gpu<float, signed char, float >*/, - 0/*addWeighted_gpu<float, signed char, double>*/ + 0/*arithm::addWeighted<float, signed char, unsigned char >*/, + 0/*arithm::addWeighted<float, signed char, signed char >*/, + 0/*arithm::addWeighted<float, signed char, unsigned short>*/, + 0/*arithm::addWeighted<float, signed char, short >*/, + 0/*arithm::addWeighted<float, signed char, int >*/, + 0/*arithm::addWeighted<float, signed char, float >*/, + 0/*arithm::addWeighted<float, signed char, double>*/ }, { - 0/*addWeighted_gpu<float, unsigned short, unsigned char >*/, - 0/*addWeighted_gpu<float, unsigned short, signed char >*/, - 0/*addWeighted_gpu<float, unsigned short, unsigned short>*/, - 0/*addWeighted_gpu<float, unsigned short, short >*/, - 0/*addWeighted_gpu<float, unsigned short, int >*/, - 0/*addWeighted_gpu<float, unsigned short, float >*/, - 0/*addWeighted_gpu<float, unsigned short, double>*/ + 0/*arithm::addWeighted<float, unsigned short, unsigned char >*/, + 0/*arithm::addWeighted<float, unsigned short, signed char >*/, + 0/*arithm::addWeighted<float, unsigned short, unsigned short>*/, + 0/*arithm::addWeighted<float, unsigned short, short >*/, + 0/*arithm::addWeighted<float, unsigned short, int >*/, + 0/*arithm::addWeighted<float, unsigned short, float >*/, + 0/*arithm::addWeighted<float, unsigned short, double>*/ }, { - 0/*addWeighted_gpu<float, short, unsigned char >*/, - 0/*addWeighted_gpu<float, short, signed char >*/, - 0/*addWeighted_gpu<float, short, unsigned short>*/, - 0/*addWeighted_gpu<float, short, short >*/, - 0/*addWeighted_gpu<float, 
short, int >*/, - 0/*addWeighted_gpu<float, short, float >*/, - 0/*addWeighted_gpu<float, short, double>*/ + 0/*arithm::addWeighted<float, short, unsigned char >*/, + 0/*arithm::addWeighted<float, short, signed char >*/, + 0/*arithm::addWeighted<float, short, unsigned short>*/, + 0/*arithm::addWeighted<float, short, short >*/, + 0/*arithm::addWeighted<float, short, int >*/, + 0/*arithm::addWeighted<float, short, float >*/, + 0/*arithm::addWeighted<float, short, double>*/ }, { - 0/*addWeighted_gpu<float, int, unsigned char >*/, - 0/*addWeighted_gpu<float, int, signed char >*/, - 0/*addWeighted_gpu<float, int, unsigned short>*/, - 0/*addWeighted_gpu<float, int, short >*/, - 0/*addWeighted_gpu<float, int, int >*/, - 0/*addWeighted_gpu<float, int, float >*/, - 0/*addWeighted_gpu<float, int, double>*/ + 0/*arithm::addWeighted<float, int, unsigned char >*/, + 0/*arithm::addWeighted<float, int, signed char >*/, + 0/*arithm::addWeighted<float, int, unsigned short>*/, + 0/*arithm::addWeighted<float, int, short >*/, + 0/*arithm::addWeighted<float, int, int >*/, + 0/*arithm::addWeighted<float, int, float >*/, + 0/*arithm::addWeighted<float, int, double>*/ }, { - addWeighted_gpu<float, float, unsigned char >, - addWeighted_gpu<float, float, signed char >, - addWeighted_gpu<float, float, unsigned short>, - addWeighted_gpu<float, float, short >, - addWeighted_gpu<float, float, int >, - addWeighted_gpu<float, float, float >, - addWeighted_gpu<float, float, double> + arithm::addWeighted<float, float, unsigned char >, + arithm::addWeighted<float, float, signed char >, + arithm::addWeighted<float, float, unsigned short>, + arithm::addWeighted<float, float, short >, + arithm::addWeighted<float, float, int >, + arithm::addWeighted<float, float, float >, + arithm::addWeighted<float, float, double> }, { - addWeighted_gpu<float, double, unsigned char >, - addWeighted_gpu<float, double, signed char >, - addWeighted_gpu<float, double, unsigned short>, - addWeighted_gpu<float, double, short 
>, - addWeighted_gpu<float, double, int >, - addWeighted_gpu<float, double, float >, - addWeighted_gpu<float, double, double> + arithm::addWeighted<float, double, unsigned char >, + arithm::addWeighted<float, double, signed char >, + arithm::addWeighted<float, double, unsigned short>, + arithm::addWeighted<float, double, short >, + arithm::addWeighted<float, double, int >, + arithm::addWeighted<float, double, float >, + arithm::addWeighted<float, double, double> } }, { { - 0/*addWeighted_gpu<double, unsigned char, unsigned char >*/, - 0/*addWeighted_gpu<double, unsigned char, signed char >*/, - 0/*addWeighted_gpu<double, unsigned char, unsigned short>*/, - 0/*addWeighted_gpu<double, unsigned char, short >*/, - 0/*addWeighted_gpu<double, unsigned char, int >*/, - 0/*addWeighted_gpu<double, unsigned char, float >*/, - 0/*addWeighted_gpu<double, unsigned char, double>*/ + 0/*arithm::addWeighted<double, unsigned char, unsigned char >*/, + 0/*arithm::addWeighted<double, unsigned char, signed char >*/, + 0/*arithm::addWeighted<double, unsigned char, unsigned short>*/, + 0/*arithm::addWeighted<double, unsigned char, short >*/, + 0/*arithm::addWeighted<double, unsigned char, int >*/, + 0/*arithm::addWeighted<double, unsigned char, float >*/, + 0/*arithm::addWeighted<double, unsigned char, double>*/ }, { - 0/*addWeighted_gpu<double, signed char, unsigned char >*/, - 0/*addWeighted_gpu<double, signed char, signed char >*/, - 0/*addWeighted_gpu<double, signed char, unsigned short>*/, - 0/*addWeighted_gpu<double, signed char, short >*/, - 0/*addWeighted_gpu<double, signed char, int >*/, - 0/*addWeighted_gpu<double, signed char, float >*/, - 0/*addWeighted_gpu<double, signed char, double>*/ + 0/*arithm::addWeighted<double, signed char, unsigned char >*/, + 0/*arithm::addWeighted<double, signed char, signed char >*/, + 0/*arithm::addWeighted<double, signed char, unsigned short>*/, + 0/*arithm::addWeighted<double, signed char, short >*/, + 0/*arithm::addWeighted<double, signed 
char, int >*/, + 0/*arithm::addWeighted<double, signed char, float >*/, + 0/*arithm::addWeighted<double, signed char, double>*/ }, { - 0/*addWeighted_gpu<double, unsigned short, unsigned char >*/, - 0/*addWeighted_gpu<double, unsigned short, signed char >*/, - 0/*addWeighted_gpu<double, unsigned short, unsigned short>*/, - 0/*addWeighted_gpu<double, unsigned short, short >*/, - 0/*addWeighted_gpu<double, unsigned short, int >*/, - 0/*addWeighted_gpu<double, unsigned short, float >*/, - 0/*addWeighted_gpu<double, unsigned short, double>*/ + 0/*arithm::addWeighted<double, unsigned short, unsigned char >*/, + 0/*arithm::addWeighted<double, unsigned short, signed char >*/, + 0/*arithm::addWeighted<double, unsigned short, unsigned short>*/, + 0/*arithm::addWeighted<double, unsigned short, short >*/, + 0/*arithm::addWeighted<double, unsigned short, int >*/, + 0/*arithm::addWeighted<double, unsigned short, float >*/, + 0/*arithm::addWeighted<double, unsigned short, double>*/ }, { - 0/*addWeighted_gpu<double, short, unsigned char >*/, - 0/*addWeighted_gpu<double, short, signed char >*/, - 0/*addWeighted_gpu<double, short, unsigned short>*/, - 0/*addWeighted_gpu<double, short, short >*/, - 0/*addWeighted_gpu<double, short, int >*/, - 0/*addWeighted_gpu<double, short, float >*/, - 0/*addWeighted_gpu<double, short, double>*/ + 0/*arithm::addWeighted<double, short, unsigned char >*/, + 0/*arithm::addWeighted<double, short, signed char >*/, + 0/*arithm::addWeighted<double, short, unsigned short>*/, + 0/*arithm::addWeighted<double, short, short >*/, + 0/*arithm::addWeighted<double, short, int >*/, + 0/*arithm::addWeighted<double, short, float >*/, + 0/*arithm::addWeighted<double, short, double>*/ }, { - 0/*addWeighted_gpu<double, int, unsigned char >*/, - 0/*addWeighted_gpu<double, int, signed char >*/, - 0/*addWeighted_gpu<double, int, unsigned short>*/, - 0/*addWeighted_gpu<double, int, short >*/, - 0/*addWeighted_gpu<double, int, int >*/, - 0/*addWeighted_gpu<double, int, 
float >*/, - 0/*addWeighted_gpu<double, int, double>*/ + 0/*arithm::addWeighted<double, int, unsigned char >*/, + 0/*arithm::addWeighted<double, int, signed char >*/, + 0/*arithm::addWeighted<double, int, unsigned short>*/, + 0/*arithm::addWeighted<double, int, short >*/, + 0/*arithm::addWeighted<double, int, int >*/, + 0/*arithm::addWeighted<double, int, float >*/, + 0/*arithm::addWeighted<double, int, double>*/ }, { - 0/*addWeighted_gpu<double, float, unsigned char >*/, - 0/*addWeighted_gpu<double, float, signed char >*/, - 0/*addWeighted_gpu<double, float, unsigned short>*/, - 0/*addWeighted_gpu<double, float, short >*/, - 0/*addWeighted_gpu<double, float, int >*/, - 0/*addWeighted_gpu<double, float, float >*/, - 0/*addWeighted_gpu<double, float, double>*/ + 0/*arithm::addWeighted<double, float, unsigned char >*/, + 0/*arithm::addWeighted<double, float, signed char >*/, + 0/*arithm::addWeighted<double, float, unsigned short>*/, + 0/*arithm::addWeighted<double, float, short >*/, + 0/*arithm::addWeighted<double, float, int >*/, + 0/*arithm::addWeighted<double, float, float >*/, + 0/*arithm::addWeighted<double, float, double>*/ }, { - addWeighted_gpu<double, double, unsigned char >, - addWeighted_gpu<double, double, signed char >, - addWeighted_gpu<double, double, unsigned short>, - addWeighted_gpu<double, double, short >, - addWeighted_gpu<double, double, int >, - addWeighted_gpu<double, double, float >, - addWeighted_gpu<double, double, double> + arithm::addWeighted<double, double, unsigned char >, + arithm::addWeighted<double, double, signed char >, + arithm::addWeighted<double, double, unsigned short>, + arithm::addWeighted<double, double, short >, + arithm::addWeighted<double, double, int >, + arithm::addWeighted<double, double, float >, + arithm::addWeighted<double, double, double> } } }; - CV_Assert(src1.size() == src2.size()); - CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels())); + int sdepth1 = src1.depth(); + int 
sdepth2 = src2.depth(); + ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2); + const int cn = src1.channels(); - dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type(); + CV_Assert( src2.size() == src1.size() && src2.channels() == cn ); + CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F ); - CV_Assert(src1.depth() <= CV_64F && src2.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F); - - if (src1.depth() == CV_64F || src2.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F) + if (sdepth1 == CV_64F || sdepth2 == CV_64F || ddepth == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - dst.create(src1.size(), dtype); + dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn)); - const GpuMat* psrc1 = &src1; - const GpuMat* psrc2 = &src2; + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); - if (src1.depth() > src2.depth()) + if (sdepth1 > sdepth2) { - std::swap(psrc1, psrc2); + std::swap(src1_.data, src2_.data); + std::swap(src1_.step, src2_.step); std::swap(alpha, beta); + std::swap(sdepth1, sdepth2); } - const func_t func = funcs[psrc1->depth()][psrc2->depth()][dst.depth()]; + const func_t func = funcs[sdepth1][sdepth2][ddepth]; if (!func) CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream)); + func(src1_, alpha, src2_, beta, gamma, dst_, StreamAccessor::getStream(stream)); } #endif diff --git a/modules/gpu/test/test_core.cpp b/modules/gpu/test/test_core.cpp index abdcb0fa7..695ec9758 100644 --- a/modules/gpu/test/test_core.cpp +++ 
b/modules/gpu/test/test_core.cpp @@ -210,7 +210,6 @@ TEST_P(Add_Array, Accuracy) { cv::Mat mat1 = randomMat(size, stype); cv::Mat mat2 = randomMat(size, stype); - cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0); if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) { @@ -228,10 +227,10 @@ TEST_P(Add_Array, Accuracy) { cv::gpu::GpuMat dst = createMat(size, dtype, useRoi); dst.setTo(cv::Scalar::all(0)); - cv::gpu::add(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, channels == 1 ? loadMat(mask, useRoi) : cv::gpu::GpuMat(), depth.second); + cv::gpu::add(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, cv::gpu::GpuMat(), depth.second); cv::Mat dst_gold(size, dtype, cv::Scalar::all(0)); - cv::add(mat1, mat2, dst_gold, channels == 1 ? mask : cv::noArray(), depth.second); + cv::add(mat1, mat2, dst_gold, cv::noArray(), depth.second); EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); } @@ -244,6 +243,67 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Add_Array, testing::Combine( ALL_CHANNELS, WHOLE_SUBMAT)); +PARAM_TEST_CASE(Add_Array_Mask, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair<MatDepth, MatDepth> depth; + bool useRoi; + + int stype; + int dtype; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + + stype = CV_MAKE_TYPE(depth.first, 1); + dtype = CV_MAKE_TYPE(depth.second, 1); + } +}; + +TEST_P(Add_Array_Mask, Accuracy) +{ + cv::Mat mat1 = randomMat(size, stype); + cv::Mat mat2 = randomMat(size, stype); + cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); + + if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) + { + try + { + cv::gpu::GpuMat dst; + cv::gpu::add(loadMat(mat1), loadMat(mat2), dst, cv::gpu::GpuMat(), 
depth.second); + } + catch (const cv::Exception& e) + { + ASSERT_EQ(CV_StsUnsupportedFormat, e.code); + } + } + else + { + cv::gpu::GpuMat dst = createMat(size, dtype, useRoi); + dst.setTo(cv::Scalar::all(0)); + cv::gpu::add(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, loadMat(mask, useRoi), depth.second); + + cv::Mat dst_gold(size, dtype, cv::Scalar::all(0)); + cv::add(mat1, mat2, dst_gold, mask, depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); + } +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Add_Array_Mask, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + WHOLE_SUBMAT)); + //////////////////////////////////////////////////////////////////////////////// // Add_Scalar @@ -362,6 +422,67 @@ PARAM_TEST_CASE(Subtract_Array, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDept }; TEST_P(Subtract_Array, Accuracy) +{ + cv::Mat mat1 = randomMat(size, stype); + cv::Mat mat2 = randomMat(size, stype); + + if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) + { + try + { + cv::gpu::GpuMat dst; + cv::gpu::subtract(loadMat(mat1), loadMat(mat2), dst, cv::gpu::GpuMat(), depth.second); + } + catch (const cv::Exception& e) + { + ASSERT_EQ(CV_StsUnsupportedFormat, e.code); + } + } + else + { + cv::gpu::GpuMat dst = createMat(size, dtype, useRoi); + dst.setTo(cv::Scalar::all(0)); + cv::gpu::subtract(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, cv::gpu::GpuMat(), depth.second); + + cv::Mat dst_gold(size, dtype, cv::Scalar::all(0)); + cv::subtract(mat1, mat2, dst_gold, cv::noArray(), depth.second); + + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 
1e-4 : 0.0); + } +} + +INSTANTIATE_TEST_CASE_P(GPU_Core, Subtract_Array, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + DEPTH_PAIRS, + ALL_CHANNELS, + WHOLE_SUBMAT)); + +PARAM_TEST_CASE(Subtract_Array_Mask, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi) +{ + cv::gpu::DeviceInfo devInfo; + cv::Size size; + std::pair<MatDepth, MatDepth> depth; + bool useRoi; + + int stype; + int dtype; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + depth = GET_PARAM(2); + useRoi = GET_PARAM(3); + + cv::gpu::setDevice(devInfo.deviceID()); + + stype = CV_MAKE_TYPE(depth.first, 1); + dtype = CV_MAKE_TYPE(depth.second, 1); + } +}; + +TEST_P(Subtract_Array_Mask, Accuracy) { cv::Mat mat1 = randomMat(size, stype); cv::Mat mat2 = randomMat(size, stype); @@ -383,20 +504,19 @@ TEST_P(Subtract_Array, Accuracy) { cv::gpu::GpuMat dst = createMat(size, dtype, useRoi); dst.setTo(cv::Scalar::all(0)); - cv::gpu::subtract(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, channels == 1 ? loadMat(mask, useRoi) : cv::gpu::GpuMat(), depth.second); + cv::gpu::subtract(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, loadMat(mask, useRoi), depth.second); cv::Mat dst_gold(size, dtype, cv::Scalar::all(0)); - cv::subtract(mat1, mat2, dst_gold, channels == 1 ? mask : cv::noArray(), depth.second); + cv::subtract(mat1, mat2, dst_gold, mask, depth.second); EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 
1e-4 : 0.0); } } -INSTANTIATE_TEST_CASE_P(GPU_Core, Subtract_Array, testing::Combine( +INSTANTIATE_TEST_CASE_P(GPU_Core, Subtract_Array_Mask, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, DEPTH_PAIRS, - ALL_CHANNELS, WHOLE_SUBMAT)); //////////////////////////////////////////////////////////////////////////////// @@ -541,7 +661,7 @@ TEST_P(Multiply_Array, WithOutScale) cv::Mat dst_gold; cv::multiply(mat1, mat2, dst_gold, 1, depth.second); - EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-2 : 0.0); } } @@ -571,7 +691,7 @@ TEST_P(Multiply_Array, WithScale) cv::Mat dst_gold; cv::multiply(mat1, mat2, dst_gold, scale, depth.second); - EXPECT_MAT_NEAR(dst_gold, dst, 1.0); + EXPECT_MAT_NEAR(dst_gold, dst, 2.0); } } @@ -726,7 +846,7 @@ TEST_P(Multiply_Scalar, WithOutScale) cv::Mat dst_gold; cv::multiply(mat, val, dst_gold, 1, depth.second); - EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-2 : 0.0); + EXPECT_MAT_NEAR(dst_gold, dst, 1.0); } } @@ -757,7 +877,7 @@ TEST_P(Multiply_Scalar, WithScale) cv::Mat dst_gold; cv::multiply(mat, val, dst_gold, scale, depth.second); - EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); + EXPECT_MAT_NEAR(dst_gold, dst, 1.0); } } @@ -1037,7 +1157,7 @@ TEST_P(Divide_Scalar, WithScale) cv::Mat dst_gold; cv::divide(mat, val, dst_gold, scale, depth.second); - EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); + EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-2 : 0.0); } }