From 54e4ef657c41a5bb794ebd2a34932cae16a8507b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 27 Jun 2014 14:03:05 +0400 Subject: [PATCH] optimized cv::transpose inplace --- modules/core/src/matrix.cpp | 16 ++++++++++++---- modules/core/src/opencl/transpose.cl | 20 +++++++++++++------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index c27c96107..f199cb253 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -2973,8 +2973,10 @@ static inline int divUp(int a, int b) static bool ocl_transpose( InputArray _src, OutputArray _dst ) { + const ocl::Device & dev = ocl::Device::getDefault(); const int TILE_DIM = 32, BLOCK_ROWS = 8; - int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); + int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type), + rowsPerWI = dev.isIntel() ? 4 : 1; UMat src = _src.getUMat(); _dst.create(src.cols, src.rows, type); @@ -2990,9 +2992,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst ) } ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc, - format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d", + format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d", ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), - cn, TILE_DIM, BLOCK_ROWS)); + cn, TILE_DIM, BLOCK_ROWS, rowsPerWI)); if (k.empty()) return false; @@ -3003,7 +3005,13 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst ) ocl::KernelArg::WriteOnlyNoSize(dst)); size_t localsize[2] = { TILE_DIM, BLOCK_ROWS }; - size_t globalsize[2] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS }; + size_t globalsize[2] = { src.cols, inplace ? (src.rows + rowsPerWI - 1) / rowsPerWI : (divUp(src.rows, TILE_DIM) * BLOCK_ROWS) }; + + if (inplace && dev.isIntel()) + { + localsize[0] = 16; + localsize[1] = dev.maxWorkGroupSize() / localsize[0]; + } return k.run(2, globalsize, localsize, false); } diff --git a/modules/core/src/opencl/transpose.cl b/modules/core/src/opencl/transpose.cl index d56e49909..cad3616b5 100644 --- a/modules/core/src/opencl/transpose.cl +++ b/modules/core/src/opencl/transpose.cl @@ -117,18 +117,24 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off __kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * rowsPerWI; - if (y < src_rows && x < y) + if (x < y + rowsPerWI) { int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset)); int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset)); + T tmp; - __global const uchar * src = srcptr + src_index; - __global uchar * dst = srcptr + dst_index; + #pragma unroll + for (int i = 0; i < rowsPerWI; ++i, ++y, src_index += src_step, dst_index += TSIZE) + if (y < src_rows && x < y) + { + __global uchar * src = srcptr + src_index; + __global uchar * dst = srcptr + dst_index; - T tmp = loadpix(dst); - storepix(loadpix(src), dst); - storepix(tmp, src); + tmp = loadpix(dst); + storepix(loadpix(src), dst); + storepix(tmp, src); + } } }