multiple rows per work-item
This commit is contained in:
parent
ab2749d648
commit
5ee398bfd6
@ -53,19 +53,22 @@
|
|||||||
|
|
||||||
__kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
|
__kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
|
||||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||||
WT alpha, WT beta)
|
WT alpha, WT beta, int rowsPerWI)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y0 = get_global_id(1) * rowsPerWI;
|
||||||
|
|
||||||
if (x < dst_cols && y < dst_rows)
|
if (x < dst_cols)
|
||||||
{
|
{
|
||||||
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT), src_offset));
|
int src_index = mad24(y0, src_step, mad24(x, (int)sizeof(srcT), src_offset));
|
||||||
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));
|
int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));
|
||||||
|
|
||||||
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
|
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index += dst_step)
|
||||||
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
|
{
|
||||||
|
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
|
||||||
|
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
|
||||||
|
|
||||||
dst[0] = convertToDT(mad(convertToWT(src[0]), alpha, beta));
|
dst[0] = convertToDT(fma(convertToWT(src[0]), alpha, beta));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -721,7 +721,7 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
|
|||||||
if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
|
if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
|
||||||
((needDouble && doubleSupport) || !needDouble) )
|
((needDouble && doubleSupport) || !needDouble) )
|
||||||
{
|
{
|
||||||
int wdepth = std::max(CV_32F, sdepth);
|
int wdepth = std::max(CV_32F, sdepth), rowsPerWI = 4;
|
||||||
|
|
||||||
char cvt[2][40];
|
char cvt[2][40];
|
||||||
ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
|
ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
|
||||||
@ -741,11 +741,11 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
|
|||||||
dstarg = ocl::KernelArg::WriteOnly(dst, cn);
|
dstarg = ocl::KernelArg::WriteOnly(dst, cn);
|
||||||
|
|
||||||
if (wdepth == CV_32F)
|
if (wdepth == CV_32F)
|
||||||
k.args(srcarg, dstarg, alphaf, betaf);
|
k.args(srcarg, dstarg, alphaf, betaf, rowsPerWI);
|
||||||
else
|
else
|
||||||
k.args(srcarg, dstarg, alpha, beta);
|
k.args(srcarg, dstarg, alpha, beta, rowsPerWI);
|
||||||
|
|
||||||
size_t globalsize[2] = { dst.cols * cn, dst.rows };
|
size_t globalsize[2] = { dst.cols * cn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
|
||||||
if (k.run(2, globalsize, NULL, false))
|
if (k.run(2, globalsize, NULL, false))
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user