added support of 8UC4*32FC1 multiply into GPU module
This commit is contained in:
		@@ -602,4 +602,71 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
    template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
 | 
			
		||||
    template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
 | 
			
		||||
    template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // multiply
 | 
			
		||||
 | 
			
		||||
    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
 | 
			
		||||
    void __global__ multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols,
 | 
			
		||||
                                   PtrStep dst)
 | 
			
		||||
    {
 | 
			
		||||
        int x = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
        int y = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
 | 
			
		||||
        if (x < cols && y < rows)
 | 
			
		||||
        {
 | 
			
		||||
            ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
 | 
			
		||||
    void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)
 | 
			
		||||
    {
 | 
			
		||||
        dim3 threads(32, 8);
 | 
			
		||||
        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
 | 
			
		||||
 | 
			
		||||
        multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads>>>(src1, src2, rows, cols, dst);
 | 
			
		||||
        cudaSafeCall(cudaGetLastError());
 | 
			
		||||
 | 
			
		||||
        if (stream == 0)
 | 
			
		||||
            cudaSafeCall(cudaDeviceSynchronize());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // multiply (by scalar)
 | 
			
		||||
 | 
			
		||||
    template <typename TSrc, typename TDst>
 | 
			
		||||
    void __global__ multiplyScalarKernel(const PtrStep src1, float scale, int rows, int cols, PtrStep dst)
 | 
			
		||||
    {
 | 
			
		||||
        int x = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
        int y = blockIdx.y * blockDim.y + threadIdx.y;
 | 
			
		||||
 | 
			
		||||
        if (x < cols && y < rows)
 | 
			
		||||
        {
 | 
			
		||||
            ((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src1.ptr(y))[x] * scale);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template <typename TSrc, typename TDst>
 | 
			
		||||
    void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)
 | 
			
		||||
    {
 | 
			
		||||
        dim3 threads(32, 8);
 | 
			
		||||
        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
 | 
			
		||||
 | 
			
		||||
        multiplyScalarKernel<TSrc, TDst><<<grid, threads>>>(src, scale, rows, cols, dst);
 | 
			
		||||
        cudaSafeCall(cudaGetLastError());
 | 
			
		||||
 | 
			
		||||
        if (stream == 0)
 | 
			
		||||
            cudaSafeCall(cudaDeviceSynchronize());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);
 | 
			
		||||
}}}
 | 
			
		||||
 
 | 
			
		||||
@@ -197,11 +197,59 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
 | 
			
		||||
        nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
namespace cv { namespace gpu { namespace device
 | 
			
		||||
{
 | 
			
		||||
    template <typename TSrc1, typename TSrc2, typename TDst, int cn>
 | 
			
		||||
    void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
 | 
			
		||||
 | 
			
		||||
    template <typename TSrc, typename TDst>
 | 
			
		||||
    void multiplyScalarCaller(const PtrStep src, float scalar, int rows, int cols, PtrStep dst, cudaStream_t stream);
 | 
			
		||||
}}}
 | 
			
		||||
 | 
			
		||||
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 | 
			
		||||
{
 | 
			
		||||
    nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));
 | 
			
		||||
    if (src1.type() == CV_8UC4 && src2.type() == CV_32F)
 | 
			
		||||
    {
 | 
			
		||||
        CV_Assert(src1.size() == src2.size());
 | 
			
		||||
        dst.create(src1.size(), src1.type());
 | 
			
		||||
        device::multiplyCaller<uchar, float, uchar, 4>(static_cast<DevMem2D>(src1), static_cast<DevMem2D>(src2),
 | 
			
		||||
                                                       src1.rows, src1.cols * 4, static_cast<DevMem2D>(dst),
 | 
			
		||||
                                                       StreamAccessor::getStream(stream));
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
        nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 | 
			
		||||
{
 | 
			
		||||
    if (src.depth() == CV_8U)
 | 
			
		||||
    {
 | 
			
		||||
        dst.create(src.size(), src.type());
 | 
			
		||||
        device::multiplyScalarCaller<uchar, uchar>(static_cast<DevMem2D>(src), (float)(sc[0]), src.rows, src.cols * src.channels(),
 | 
			
		||||
                                                   static_cast<DevMem2D>(dst), StreamAccessor::getStream(stream));
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        CV_Assert(src.type() == CV_32FC1);
 | 
			
		||||
 | 
			
		||||
        dst.create(src.size(), src.type());
 | 
			
		||||
 | 
			
		||||
        NppiSize sz;
 | 
			
		||||
        sz.width  = src.cols;
 | 
			
		||||
        sz.height = src.rows;
 | 
			
		||||
 | 
			
		||||
        cudaStream_t cudaStream = StreamAccessor::getStream(stream);
 | 
			
		||||
 | 
			
		||||
        NppStreamHandler h(cudaStream);
 | 
			
		||||
 | 
			
		||||
        nppSafeCall( nppiMulC_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
 | 
			
		||||
 | 
			
		||||
        if (cudaStream == 0)
 | 
			
		||||
            cudaSafeCall( cudaDeviceSynchronize() );
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 | 
			
		||||
{
 | 
			
		||||
    nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R, StreamAccessor::getStream(stream));
 | 
			
		||||
@@ -227,26 +275,6 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
 | 
			
		||||
    callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 | 
			
		||||
{
 | 
			
		||||
    CV_Assert(src.type() == CV_32FC1);
 | 
			
		||||
 | 
			
		||||
    dst.create(src.size(), src.type());
 | 
			
		||||
 | 
			
		||||
    NppiSize sz;
 | 
			
		||||
    sz.width  = src.cols;
 | 
			
		||||
    sz.height = src.rows;
 | 
			
		||||
 | 
			
		||||
    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
 | 
			
		||||
 | 
			
		||||
    NppStreamHandler h(cudaStream);
 | 
			
		||||
 | 
			
		||||
    nppSafeCall( nppiMulC_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
 | 
			
		||||
 | 
			
		||||
    if (cudaStream == 0)
 | 
			
		||||
        cudaSafeCall( cudaDeviceSynchronize() );
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 | 
			
		||||
{
 | 
			
		||||
    CV_Assert(src.type() == CV_32FC1);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user