From 5ee398bfd65fd6cba016e72f9cd70ab41630d31d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 14 May 2014 13:55:39 +0400 Subject: [PATCH] multiple rows per work-item --- modules/core/src/opencl/convert.cl | 19 +++++++++++-------- modules/core/src/umatrix.cpp | 8 ++++---- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/modules/core/src/opencl/convert.cl b/modules/core/src/opencl/convert.cl index b80140947..e0e7bd83a 100644 --- a/modules/core/src/opencl/convert.cl +++ b/modules/core/src/opencl/convert.cl @@ -53,19 +53,22 @@ __kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - WT alpha, WT beta) + WT alpha, WT beta, int rowsPerWI) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT), src_offset)); - int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset)); + int src_index = mad24(y0, src_step, mad24(x, (int)sizeof(srcT), src_offset)); + int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(dstT), dst_offset)); - __global const srcT * src = (__global const srcT *)(srcptr + src_index); - __global dstT * dst = (__global dstT *)(dstptr + dst_index); + for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index += dst_step) + { + __global const srcT * src = (__global const srcT *)(srcptr + src_index); + __global dstT * dst = (__global dstT *)(dstptr + dst_index); - dst[0] = convertToDT(mad(convertToWT(src[0]), alpha, beta)); + dst[0] = convertToDT(fma(convertToWT(src[0]), alpha, beta)); + } } } diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 006049254..29d3b8a46 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -721,7 +721,7 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() && ((needDouble && doubleSupport) || !needDouble) ) { - int wdepth = std::max(CV_32F, sdepth); + int wdepth = std::max(CV_32F, sdepth), rowsPerWI = 4; char cvt[2][40]; ocl::Kernel k("convertTo", ocl::core::convert_oclsrc, @@ -741,11 +741,11 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con dstarg = ocl::KernelArg::WriteOnly(dst, cn); if (wdepth == CV_32F) - k.args(srcarg, dstarg, alphaf, betaf); + k.args(srcarg, dstarg, alphaf, betaf, rowsPerWI); else - k.args(srcarg, dstarg, alpha, beta); + k.args(srcarg, dstarg, alpha, beta, rowsPerWI); - size_t globalsize[2] = { dst.cols * cn, dst.rows }; + size_t globalsize[2] = { dst.cols * cn, (dst.rows + rowsPerWI - 1) / rowsPerWI }; if (k.run(2, globalsize, NULL, false)) return; }