From 11c6eb63051d9160865436db3cdf69987bc5fe06 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 21 Nov 2012 12:46:11 +0400 Subject: [PATCH] element operations --- .../include/opencv2/gpu/device/functional.hpp | 34 + modules/gpu/src/cuda/element_operations.cu | 4366 +++++++++++------ modules/gpu/src/element_operations.cpp | 3494 ++++++++----- modules/gpu/test/test_core.cpp | 144 +- 4 files changed, 5066 insertions(+), 2972 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp index 6e0471e9a..cd63c3ac9 100644 --- a/modules/gpu/include/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp @@ -357,6 +357,9 @@ namespace cv { namespace gpu { namespace device { return abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -364,6 +367,9 @@ namespace cv { namespace gpu { namespace device { return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -371,6 +377,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -378,6 +387,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -385,6 +397,9 @@ namespace cv { namespace gpu { namespace device { return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -392,6 +407,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -399,6 +417,9 @@ namespace cv { namespace gpu { namespace device { return x; } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -406,6 +427,9 @@ namespace cv { namespace gpu { namespace device { return ::abs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -413,6 +437,9 @@ namespace cv { namespace gpu { namespace device { return ::fabsf(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; template <> struct abs_func : unary_function { @@ -420,6 +447,9 @@ namespace cv { namespace gpu { namespace device { return ::fabs(x); } + + __device__ __forceinline__ abs_func() {} + __device__ __forceinline__ abs_func(const abs_func&) {} }; #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \ @@ -429,6 +459,8 @@ namespace cv { namespace gpu { namespace device { \ return func ## f(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + __device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; \ template <> struct name ## _func : unary_function \ { \ @@ -436,6 +468,8 @@ namespace cv { namespace gpu { namespace device { \ return func(v); \ } \ + __device__ __forceinline__ name ## _func() {} \ + __device__ __forceinline__ name ## _func(const name ## _func&) {} \ }; #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \ diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index c61601d4f..eaf577bac 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -42,405 +42,833 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { namespace device -{ - ////////////////////////////////////////////////////////////////////////// - // add +using namespace cv::gpu; +using namespace cv::gpu::device; - template struct Add : binary_function +////////////////////////////////////////////////////////////////////////// +// addMat + +namespace +{ + template struct VAdd4; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + + //////////////////////////////////// + + template struct VAdd2; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + + //////////////////////////////////// + + template struct AddMat : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a + b); } + + __device__ __forceinline__ AddMat() {} + __device__ __forceinline__ AddMat(const AddMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< VAdd4 > : DefaultTransformFunctorTraits< VAdd4 > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + //////////////////////////////////// + + template struct TransformFunctorTraits< VAdd2 > : DefaultTransformFunctorTraits< VAdd4 > { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + enum { smart_shift = 2 }; }; - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + //////////////////////////////////// + + template <> struct TransformFunctorTraits< AddMat > : DefaultTransformFunctorTraits< AddMat > { - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddMat > : DefaultTransformFunctorTraits< AddMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddMat > : DefaultTransformFunctorTraits< AddMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AddMat > : DefaultTransformFunctorTraits< AddMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template + void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd4(), WithOutMask(), stream); } - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct AddScalar : unary_function + template + void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - AddScalar(double val_) : val(val_) {} + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd2(), WithOutMask(), stream); + } + + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), WithOutMask(), stream); + } + + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// addScalar + +namespace +{ + template struct AddScalar : unary_function + { + S val; + + explicit AddScalar(S val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { return saturate_cast(a + val); } - const double val; }; +} - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) +namespace arithm +{ + template + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AddScalar op(val); + AddScalar op(static_cast(val)); + if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////// - // subtract +////////////////////////////////////////////////////////////////////////// +// subMat - template struct Subtract : binary_function +namespace +{ + template struct VSub4; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + + //////////////////////////////////// + + template struct VSub2; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + + //////////////////////////////////// + + template struct SubMat : binary_function { __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(a - b); } + + __device__ __forceinline__ SubMat() {} + __device__ __forceinline__ SubMat(const SubMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< VSub4 > : DefaultTransformFunctorTraits< VSub4 > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + //////////////////////////////////// + + template struct TransformFunctorTraits< VSub2 > : DefaultTransformFunctorTraits< VSub2 > { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + enum { smart_shift = 2 }; }; - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + //////////////////////////////////// + + template <> struct TransformFunctorTraits< SubMat > : DefaultTransformFunctorTraits< SubMat > { - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubMat > : DefaultTransformFunctorTraits< SubMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubMat > : DefaultTransformFunctorTraits< SubMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< SubMat > : DefaultTransformFunctorTraits< SubMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template + void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub4(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct SubtractScalar : unary_function + template + void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - SubtractScalar(double val_) : val(val_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return saturate_cast(a - val); - } - const double val; - }; - - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - SubtractScalar op(val); - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub2(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), WithOutMask(), stream); + } - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - ////////////////////////////////////////////////////////////////////////// - // multiply + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); - struct multiply_8uc4_32f : binary_function + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subScalar + +namespace arithm +{ + template + void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) + { + AddScalar op(-static_cast(val)); + + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulMat + +namespace +{ + struct Mul_8uc4_32f : binary_function { __device__ __forceinline__ uint operator ()(uint a, float b) const { @@ -453,301 +881,316 @@ namespace cv { namespace gpu { namespace device return res; } + + __device__ __forceinline__ Mul_8uc4_32f() {} + __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f) - { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; - - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_8uc4_32f(), WithOutMask(), stream); - } - - struct multiply_16sc4_32f : binary_function + struct Mul_16sc4_32f : binary_function { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), saturate_cast(a.z * b), saturate_cast(a.w * b)); } + + __device__ __forceinline__ Mul_16sc4_32f() {} + __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f) + template struct Mul : binary_function { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a * b); + } + + __device__ __forceinline__ Mul() {} + __device__ __forceinline__ Mul(const Mul& other) {} }; - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template struct MulScale : binary_function { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_16sc4_32f(), WithOutMask(), stream); - } + S scale; + + explicit MulScale(S scale_) : scale(scale_) {} - template struct Multiply : binary_function - { - Multiply(float scale_) : scale(scale_) {} __device__ __forceinline__ D operator ()(T a, T b) const { return saturate_cast(scale * a * b); } - const float scale; - }; - template struct Multiply : binary_function - { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ double operator ()(T a, T b) const - { - return scale * a * b; - } - const double scale; - }; - template <> struct Multiply : binary_function - { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ int operator ()(int a, int b) const - { - return saturate_cast(scale * a * b); - } - const double scale; }; +} - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template struct MultiplyCaller - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - Multiply op(static_cast(scale)); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } - }; - template struct MultiplyCaller - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } - }; - template <> struct MultiplyCaller - { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); - } - }; - - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - MultiplyCaller::call(src1, src2, dst, scale, stream); - } - - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct MultiplyScalar : unary_function - { - MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return saturate_cast(scale * a * val); - } - const double val; - const double scale; - }; - - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - MultiplyScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); - } - - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // divide - - struct divide_8uc4_32f : binary_function - { - __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const - { - return b != 0 ? make_uchar4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) - : make_uchar4(0,0,0,0); - } - }; - - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f) +namespace cv { namespace gpu { namespace device +{ + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Mul_8uc4_32f) { enum { smart_block_dim_x = 8 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 8 }; }; - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template <> struct TransformFunctorTraits< Mul > : DefaultTransformFunctorTraits< Mul > { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_8uc4_32f(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Mul > : DefaultTransformFunctorTraits< Mul > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Mul > : DefaultTransformFunctorTraits< Mul > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Mul > : DefaultTransformFunctorTraits< Mul > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template <> struct TransformFunctorTraits< MulScale > : DefaultTransformFunctorTraits< MulScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale > : DefaultTransformFunctorTraits< MulScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale > : DefaultTransformFunctorTraits< MulScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScale > : DefaultTransformFunctorTraits< MulScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) + { + transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream); } + void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) + { + transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream); + } - struct divide_16sc4_32f : binary_function + template + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) + { + Mul op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + else + { + MulScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + } + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// mulScalar + +namespace +{ + template struct MulScalar : unary_function + { + S val; + + explicit MulScalar(S val_) : val(val_) {} + + __device__ __forceinline__ D operator ()(T a) const + { + return saturate_cast(a * val); + } + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< MulScalar > : DefaultTransformFunctorTraits< MulScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar > : DefaultTransformFunctorTraits< MulScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar > : DefaultTransformFunctorTraits< MulScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< MulScalar > : DefaultTransformFunctorTraits< MulScalar > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template + void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) + { + MulScalar op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divMat + +namespace +{ + struct Div_8uc4_32f : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, float b) const + { + uint res = 0; + + if (b != 0) + { + b = 1.0f / b; + res |= (saturate_cast((0xffu & (a )) * b) ); + res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); + res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); + res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + } + + return res; + } + }; + + struct Div_16sc4_32f : binary_function { __device__ __forceinline__ short4 operator ()(short4 a, float b) const { @@ -757,425 +1200,905 @@ namespace cv { namespace gpu { namespace device } }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f) + template struct Div : binary_function + { + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? saturate_cast(a / b) : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template struct Div : binary_function + { + __device__ __forceinline__ float operator ()(T a, T b) const + { + return b != 0 ? static_cast(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + template struct Div : binary_function + { + __device__ __forceinline__ double operator ()(T a, T b) const + { + return b != 0 ? static_cast(a) / b : 0; + } + + __device__ __forceinline__ Div() {} + __device__ __forceinline__ Div(const Div& other) {} + }; + + template struct DivScale : binary_function + { + S scale; + + explicit DivScale(S scale_) : scale(scale_) {} + + __device__ __forceinline__ D operator ()(T a, T b) const + { + return b != 0 ? saturate_cast(scale * a / b) : 0; + } + }; +} + +namespace cv { namespace gpu { namespace device +{ + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Div_8uc4_32f) { enum { smart_block_dim_x = 8 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 8 }; }; - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template <> struct TransformFunctorTraits< Div > : DefaultTransformFunctorTraits< Div > { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_16sc4_32f(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div > : DefaultTransformFunctorTraits< Div > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div > : DefaultTransformFunctorTraits< Div > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< Div > : DefaultTransformFunctorTraits< Div > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + template <> struct TransformFunctorTraits< DivScale > : DefaultTransformFunctorTraits< DivScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale > : DefaultTransformFunctorTraits< DivScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale > : DefaultTransformFunctorTraits< DivScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< DivScale > : DefaultTransformFunctorTraits< DivScale > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) + { + transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream); } - template struct Divide : binary_function + void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - Divide(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a, T b) const + transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream); + } + + template + void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) { - return b != 0 ? saturate_cast(a * scale / b) : 0; + Div op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + else + { + DivScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); } - const double scale; - }; - - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Divide op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} - template struct DivideScalar : unary_function +////////////////////////////////////////////////////////////////////////// +// divScalar + +namespace arithm +{ + template + void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {} + MulScalar op(static_cast(1.0 / val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// divInv + +namespace +{ + template struct DivInv : unary_function + { + S val; + + explicit DivInv(double val_) : val(val_) {} + __device__ __forceinline__ D operator ()(T a) const { - return saturate_cast(scale * a / val); + return a != 0 ? saturate_cast(val / a) : 0; } - const double val; - const double scale; }; +} - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< DivInv > : DefaultTransformFunctorTraits< DivInv > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + template <> struct TransformFunctorTraits< DivInv > : DefaultTransformFunctorTraits< DivInv > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + template <> struct TransformFunctorTraits< DivInv > : DefaultTransformFunctorTraits< DivInv > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + template <> struct TransformFunctorTraits< DivInv > : DefaultTransformFunctorTraits< DivInv > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - DivideScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + DivInv op(static_cast(val)); + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} - template struct Reciprocal : unary_function +////////////////////////////////////////////////////////////////////////// +// absDiffMat + +namespace +{ + template struct VAbsDiff4; + template <> struct VAbsDiff4 : binary_function { - Reciprocal(double scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - return a != 0 ? saturate_cast(scale / a) : 0; + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double scale; + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} + }; + template <> struct VAbsDiff4 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + //////////////////////////////////// + + template struct VAbsDiff2; + template <> struct VAbsDiff2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {} }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > + template <> struct VAbsDiff2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff2() {} + __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {} }; - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream) + //////////////////////////////////// + + __device__ __forceinline__ int _abs(int a) { - cudaSafeCall( cudaSetDoubleForDevice(&scalar) ); - Reciprocal op(scalar); - cv::gpu::device::transform((PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + return ::abs(a); + } + __device__ __forceinline__ float _abs(float a) + { + return ::fabsf(a); + } + __device__ __forceinline__ double _abs(double a) + { + return ::fabs(a); } - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // absdiff - - template struct Absdiff : binary_function + template struct AbsDiffMat : binary_function { - static __device__ __forceinline__ int abs(int a) - { - return ::abs(a); - } - static __device__ __forceinline__ float abs(float a) - { - return ::fabsf(a); - } - static __device__ __forceinline__ double abs(double a) - { - return ::fabs(a); - } - __device__ __forceinline__ T operator ()(T a, T b) const { - return saturate_cast(::abs(a - b)); + return saturate_cast(_abs(a - b)); } + + __device__ __forceinline__ AbsDiffMat() {} + __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< VAbsDiff4 > : DefaultTransformFunctorTraits< VAbsDiff4 > + { + enum { smart_shift = 2 }; }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > + //////////////////////////////////// + + template struct TransformFunctorTraits< VAbsDiff2 > : DefaultTransformFunctorTraits< VAbsDiff4 > { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + enum { smart_shift = 2 }; }; - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + //////////////////////////////////// + + template <> struct TransformFunctorTraits< AbsDiffMat > : DefaultTransformFunctorTraits< AbsDiffMat > { - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Absdiff(), WithOutMask(), stream); + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsDiffMat > : DefaultTransformFunctorTraits< AbsDiffMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsDiffMat > : DefaultTransformFunctorTraits< AbsDiffMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits< AbsDiffMat > : DefaultTransformFunctorTraits< AbsDiffMat > + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; +}}} + +namespace arithm +{ + template + void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff4(), WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template struct AbsdiffScalar : unary_function + template + void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - AbsdiffScalar(double val_) : val(val_) {} + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAbsDiff2(), WithOutMask(), stream); + } + + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AbsDiffMat(), WithOutMask(), stream); + } + + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// absDiffScalar + +namespace +{ + template struct AbsDiffScalar : unary_function + { + S val; + + explicit AbsDiffScalar(S val_) : val(val_) {} + __device__ __forceinline__ T operator ()(T a) const { - return saturate_cast(::fabs(a - val)); + abs_func f; + return saturate_cast(f(a - val)); } - double val; }; +} - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits< AbsDiffScalar > : DefaultTransformFunctorTraits< AbsDiffScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + template <> struct TransformFunctorTraits< AbsDiffScalar > : DefaultTransformFunctorTraits< AbsDiffScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + template <> struct TransformFunctorTraits< AbsDiffScalar > : DefaultTransformFunctorTraits< AbsDiffScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar > + template <> struct TransformFunctorTraits< AbsDiffScalar > : DefaultTransformFunctorTraits< AbsDiffScalar > { enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +}}} - template void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) +namespace arithm +{ + template + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AbsdiffScalar op(val); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + AbsDiffScalar op(static_cast(val)); + + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + template void absDiffScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} - ////////////////////////////////////////////////////////////////////////////////////// - // Compare +////////////////////////////////////////////////////////////////////////// +// absMat - template