optimized gpu::multiply
This commit is contained in:
parent
6763bd6d01
commit
acac27d8fe
modules/gpu/src
@ -607,68 +607,59 @@ namespace cv { namespace gpu { namespace device
|
|||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// multiply
|
// multiply
|
||||||
|
|
||||||
// TODO implement more efficient version
|
struct multiply_8uc4_32f : binary_function<uint, float, uint>
|
||||||
template <typename TSrc1, typename TSrc2, typename TDst, int cn>
|
|
||||||
void __global__ multiplyKernel(const PtrStep src1, const PtrStep src2, int rows, int cols,
|
|
||||||
PtrStep dst)
|
|
||||||
{
|
{
|
||||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
__device__ __forceinline__ uint operator ()(uint a, float b) const
|
||||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
|
||||||
{
|
{
|
||||||
((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc1*)src1.ptr(y))[x] * ((TSrc2*)src2.ptr(y))[x / cn]);
|
uint res = 0;
|
||||||
|
|
||||||
|
res |= 0xffu & (saturate_cast<uchar>((0xffu & (a )) * b) );
|
||||||
|
res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
|
||||||
|
res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
|
||||||
|
res |= 0xffu & (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
|
|
||||||
|
template <> struct TransformFunctorTraits<multiply_8uc4_32f> : DefaultTransformFunctorTraits<multiply_8uc4_32f>
|
||||||
template <typename TSrc1, typename TSrc2, typename TDst, int cn>
|
|
||||||
void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream)
|
|
||||||
{
|
{
|
||||||
dim3 threads(32, 8);
|
enum { smart_block_dim_x = 8 };
|
||||||
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
|
enum { smart_block_dim_y = 8 };
|
||||||
|
enum { smart_shift = 8 };
|
||||||
|
};
|
||||||
|
|
||||||
multiplyKernel<TSrc1, TSrc2, TDst, cn><<<grid, threads>>>(src1, src2, rows, cols, dst);
|
void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream)
|
||||||
cudaSafeCall(cudaGetLastError());
|
{
|
||||||
|
transform(static_cast< DevMem2D_<uint> >(src1), src2, static_cast< DevMem2D_<uint> >(dst), multiply_8uc4_32f(), stream);
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall(cudaDeviceSynchronize());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template void multiplyCaller<uchar, float, uchar, 4>(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
|
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// multiply (by scalar)
|
// multiply (by scalar)
|
||||||
|
|
||||||
// TODO implement more efficient version
|
template <typename T, typename D, typename S> struct MultiplyScalar : unary_function<T, D>
|
||||||
template <typename TSrc, typename TDst>
|
|
||||||
void __global__ multiplyScalarKernel(const PtrStep src1, float scale, int rows, int cols, PtrStep dst)
|
|
||||||
{
|
{
|
||||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
__host__ __device__ __forceinline__ MultiplyScalar(typename TypeTraits<S>::ParameterType scale_) : scale(scale_) {}
|
||||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
|
|
||||||
if (x < cols && y < rows)
|
__device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||||
{
|
{
|
||||||
((TDst*)dst.ptr(y))[x] = saturate_cast<TDst>(((TSrc*)src1.ptr(y))[x] * scale);
|
return saturate_cast<D>(a * scale);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
const S scale;
|
||||||
|
};
|
||||||
|
|
||||||
template <typename TSrc, typename TDst>
|
template <> struct TransformFunctorTraits< MultiplyScalar<uchar, uchar, float> > : DefaultTransformFunctorTraits< MultiplyScalar<uchar, uchar, float> >
|
||||||
void multiplyScalarCaller(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream)
|
|
||||||
{
|
{
|
||||||
dim3 threads(32, 8);
|
enum { smart_block_dim_y = 8 };
|
||||||
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
|
enum { smart_shift = 8 };
|
||||||
|
};
|
||||||
|
|
||||||
multiplyScalarKernel<TSrc, TDst><<<grid, threads>>>(src, scale, rows, cols, dst);
|
template <typename T, typename D>
|
||||||
cudaSafeCall(cudaGetLastError());
|
void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream)
|
||||||
|
{
|
||||||
if (stream == 0)
|
transform(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), MultiplyScalar<T, D, float>(scale), stream);
|
||||||
cudaSafeCall(cudaDeviceSynchronize());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template void multiplyScalar_gpu<uchar, uchar>(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);
|
||||||
template void multiplyScalarCaller<uchar, uchar>(const PtrStep src, float scale, int rows, int cols, PtrStep dst, cudaStream_t stream);
|
|
||||||
}}}
|
}}}
|
||||||
|
@ -199,22 +199,21 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
|
|||||||
|
|
||||||
namespace cv { namespace gpu { namespace device
|
namespace cv { namespace gpu { namespace device
|
||||||
{
|
{
|
||||||
template <typename TSrc1, typename TSrc2, typename TDst, int cn>
|
void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
|
||||||
void multiplyCaller(const PtrStep src1, const PtrStep src2, int rows, int cols, PtrStep dst, cudaStream_t stream);
|
|
||||||
|
|
||||||
template <typename TSrc, typename TDst>
|
template <typename T, typename D>
|
||||||
void multiplyScalarCaller(const PtrStep src, float scalar, int rows, int cols, PtrStep dst, cudaStream_t stream);
|
void multiplyScalar_gpu(const DevMem2D& src, float scale, const DevMem2D& dst, cudaStream_t stream);
|
||||||
}}}
|
}}}
|
||||||
|
|
||||||
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
|
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
|
||||||
{
|
{
|
||||||
if (src1.type() == CV_8UC4 && src2.type() == CV_32F)
|
if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
|
||||||
{
|
{
|
||||||
CV_Assert(src1.size() == src2.size());
|
CV_Assert(src1.size() == src2.size());
|
||||||
|
|
||||||
dst.create(src1.size(), src1.type());
|
dst.create(src1.size(), src1.type());
|
||||||
device::multiplyCaller<uchar, float, uchar, 4>(static_cast<DevMem2D>(src1), static_cast<DevMem2D>(src2),
|
|
||||||
src1.rows, src1.cols * 4, static_cast<DevMem2D>(dst),
|
device::multiply_gpu(src1, src2, dst, StreamAccessor::getStream(stream));
|
||||||
StreamAccessor::getStream(stream));
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));
|
nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));
|
||||||
@ -225,8 +224,8 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
|
|||||||
if (src.depth() == CV_8U)
|
if (src.depth() == CV_8U)
|
||||||
{
|
{
|
||||||
dst.create(src.size(), src.type());
|
dst.create(src.size(), src.type());
|
||||||
device::multiplyScalarCaller<uchar, uchar>(static_cast<DevMem2D>(src), (float)(sc[0]), src.rows, src.cols * src.channels(),
|
|
||||||
static_cast<DevMem2D>(dst), StreamAccessor::getStream(stream));
|
device::multiplyScalar_gpu<uchar, uchar>(src.reshape(1), (float)(sc[0]), dst, StreamAccessor::getStream(stream));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user