diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index bbd5d37e9..d301ea0f1 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -607,68 +607,59 @@ namespace cv { namespace gpu { namespace device
     //////////////////////////////////////////////////////////////////////////
     // multiply
 
-    // TODO implement more efficient version
-    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
-    void __global__ multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols,
-                                   PtrStep dst)
+    struct multiply_8uc4_32f : binary_function<uint, float, uint>
     {
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x < cols && y < rows)
+        __device__ __forceinline__ uint operator ()(uint a, float b) const
         {
-            ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);
+            uint res = 0;
+
+            res |= (saturate_cast<uchar>((0xffu & (a      )) * b)      );
+            res |= (saturate_cast<uchar>((0xffu & (a >>  8)) * b) <<  8);
+            res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
+            res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
+
+            return res;
         }
-    }
+    };
 
-
-    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
-    void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)
+    template <> struct TransformFunctorTraits<multiply_8uc4_32f> : DefaultTransformFunctorTraits<multiply_8uc4_32f>
     {
-        dim3 threads(32, 8);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+        enum { smart_block_dim_x = 8 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 8 };
+    };
 
-        multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads, 0, stream>>>(src1, src2, rows, cols, dst);
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
+    void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)
+    {
+        transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);
     }
-
-    template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
-
-
     //////////////////////////////////////////////////////////////////////////
     // multiply (by scalar)
 
-    // TODO implement more efficient version
-    template <typename TSrc, typename TDst>
-    void __global__ multiplyScalarKernel(const PtrStep src1, float scale, int rows, int cols, PtrStep dst)
+    template <typename T, typename S, typename D> struct MultiplyScalar : unary_function<T, D>
     {
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
+        __host__ __device__ __forceinline__ MultiplyScalar(typename TypeTraits<S>::ParameterType scale_) : scale(scale_) {}
 
-        if (x < cols && y < rows)
+        __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType a) const
         {
-            ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src1.ptr(y))[x] * scale);
+            return saturate_cast<D>(a * scale);
         }
-    }
 
+        const S scale;
+    };
 
-    template <typename TSrc, typename TDst>
-    void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)
+    template <> struct TransformFunctorTraits< MultiplyScalar<uchar, float, uchar> > : DefaultTransformFunctorTraits< MultiplyScalar<uchar, float, uchar> >
     {
-        dim3 threads(32, 8);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 8 };
+    };
 
-        multiplyScalarKernel<TSrc, TDst><<<grid, threads, 0, stream>>>(src, scale, rows, cols, dst);
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
+    template <typename T, typename D>
+    void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream)
+    {
+        transform(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), MultiplyScalar<T, float, D>(scale), stream);
     }
-
-    template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);
+    template void multiplyScalar_gpu<uchar, uchar>(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);
 }}}
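Note on the new 8UC4-by-32F path above: rather than running a hand-written kernel over individual channels, the patch routes the operation through the gpu module's generic transform framework. multiply_8uc4_32f reads each uchar4 pixel as one packed 32-bit uint, pulls out each byte with a shift and an 0xff mask, scales it by the float operand, saturates the product back to [0, 255], and ORs it into its original byte lane; the TransformFunctorTraits specialization then tunes the framework's block shape and per-thread workload for this functor. A minimal host-side sketch of the same unpack-scale-repack logic (plain C++; mul_packed_8uc4_32f and saturate_u8 are illustrative names, the latter standing in for the device-side saturate_cast<uchar>):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Stand-in for saturate_cast<uchar>: clamp to [0, 255]. Products here are
    // non-negative, so rounding by adding 0.5 before truncation is adequate.
    static std::uint32_t saturate_u8(float v)
    {
        return static_cast<std::uint32_t>(std::min(std::max(v + 0.5f, 0.0f), 255.0f));
    }

    // Unpack each byte of a packed 8UC4 pixel, scale it, saturate, and OR it
    // back into its original byte lane.
    static std::uint32_t mul_packed_8uc4_32f(std::uint32_t a, float b)
    {
        std::uint32_t res = 0;
        for (int shift = 0; shift < 32; shift += 8)
            res |= saturate_u8((0xffu & (a >> shift)) * b) << shift;
        return res;
    }

    int main()
    {
        assert(mul_packed_8uc4_32f(0x01020304u, 2.0f) == 0x02040608u); // doubles each byte
        assert(mul_packed_8uc4_32f(0xff000001u, 2.0f) == 0xff000002u); // top byte saturates
        return 0;
    }

Masking with 0xff before the shift (not after) matters: shifting the saturated byte left and then masking the result with 0xffu would zero out every lane except the lowest.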
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 57b8f2c80..c6b74257b 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -199,22 +199,21 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
 namespace cv { namespace gpu { namespace device
 {
-    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
-    void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
+    void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
 
-    template <typename TSrc, typename TDst>
-    void multiplyScalarCaller(const PtrStep src, float scalar, int rows, int cols, PtrStep dst, cudaStream_t stream);
+    template <typename T, typename D>
+    void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);
 }}}
 
 void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    if (src1.type() == CV_8UC4 && src2.type() == CV_32F)
+    if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
     {
         CV_Assert(src1.size() == src2.size());
+
         dst.create(src1.size(), src1.type());
-        device::multiplyCaller<uchar, float, uchar, 4>(static_cast<DevMem2D>(src1), static_cast<DevMem2D>(src2),
-                                                       src1.rows, src1.cols * 4, static_cast<DevMem2D>(dst),
-                                                       StreamAccessor::getStream(stream));
+
+        device::multiply_gpu(src1, src2, dst, StreamAccessor::getStream(stream));
     }
     else
         nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));
@@ -225,8 +224,8 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
     if (src.depth() == CV_8U)
     {
         dst.create(src.size(), src.type());
-        device::multiplyScalarCaller<uchar, uchar>(static_cast<DevMem2D>(src), (float)(sc[0]), src.rows, src.cols * src.channels(),
-                                                   static_cast<DevMem2D>(dst), StreamAccessor::getStream(stream));
+
+        device::multiplyScalar_gpu<uchar, uchar>(src.reshape(1), (float)(sc[0]), dst, StreamAccessor::getStream(stream));
     }
     else
     {
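For context, the host-side dispatch after this change: CV_8UC4 × CV_32FC1 pairs take the specialized kernel, everything else falls back to NPP, and 8-bit scalar multiplies go through multiplyScalar_gpu on a single-channel reshape of the source. (CV_32F and CV_32FC1 share the same numeric value, so the rewritten condition is equivalent; the C1 spelling just states the single-channel requirement explicitly.) A usage sketch under the OpenCV 2.x gpu API of this era; scaleRgbaByMask and its arguments are illustrative only:

    #include <opencv2/gpu/gpu.hpp>

    using namespace cv;

    // Weight an RGBA image by a per-pixel float mask, then halve it, on the GPU.
    void scaleRgbaByMask(const gpu::GpuMat& rgba,  // CV_8UC4
                         const gpu::GpuMat& mask,  // CV_32FC1, same size
                         gpu::GpuMat& dst)
    {
        gpu::Stream stream;

        // Hits the new specialized path: one packed 32-bit load/store per
        // pixel instead of the generic NPP per-channel multiply.
        gpu::multiply(rgba, mask, dst, stream);

        // Scalar overload: 8-bit sources now go through multiplyScalar_gpu
        // on a single-channel view of the data.
        gpu::multiply(dst, Scalar::all(0.5), dst, stream);

        stream.waitForCompletion();
    }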