Enable the OpenCL transpose path for 3-channel matrices.

* perf_arithm.cpp: the Transpose perf test takes its types from
  OCL_TEST_TYPES_134 instead of OCL_TEST_TYPES.
* matrix.cpp (ocl_transpose): remove the early "return false" for cn == 3 and
  pass T1 (ocl::memopTypeToStr(depth)) and cn to the kernel build options.
* transpose.cl: add the loadpix/storepix/TSIZE macros. For cn == 3 they go
  through vload3/vstore3 on T1 with a pixel size of 3 * sizeof(T1); otherwise
  they are plain loads/stores of T with a pixel size of sizeof(T). The local
  array "title" is renamed to "tile".

Review note: transpose_inplace writes to both pixels through storepix, so
"src" is declared as a non-const __global pointer (it was declared const and
the macro silently cast the qualifier away). The blob hashes on the "index"
lines were not recomputed after this edit.

diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp
index ce4482579..98f650473 100644
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine(
 typedef Size_MatType TransposeFixture;
 
 OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
-                OCL_TEST_SIZES, OCL_TEST_TYPES))
+                OCL_TEST_SIZES, OCL_TEST_TYPES_134))
 {
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index a1d6044d5..db1ce760f 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2893,10 +2893,7 @@ static inline int divUp(int a, int b)
 static bool ocl_transpose( InputArray _src, OutputArray _dst )
 {
     const int TILE_DIM = 32, BLOCK_ROWS = 8;
-    int type = _src.type(), cn = CV_MAT_CN(type);
-
-    if (cn == 3)
-        return false;
+    int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
 
     UMat src = _src.getUMat();
     _dst.create(src.cols, src.rows, type);
@@ -2912,8 +2909,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
     }
 
     ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
-                  format("-D T=%s -D TILE_DIM=%d -D BLOCK_ROWS=%d",
-                         ocl::memopTypeToStr(type), TILE_DIM, BLOCK_ROWS));
+                  format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d",
+                         ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
+                         cn, TILE_DIM, BLOCK_ROWS));
     if (k.empty())
         return false;
 
diff --git a/modules/core/src/opencl/transpose.cl b/modules/core/src/opencl/transpose.cl
index 575cdab3d..b5ec4b6f9 100644
--- a/modules/core/src/opencl/transpose.cl
+++ b/modules/core/src/opencl/transpose.cl
@@ -43,6 +43,16 @@
 //
 //M*/
 
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr) *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#endif
+
 #define LDS_STEP TILE_DIM
 
 __kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
@@ -74,17 +84,16 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
     int x_index = mad24(groupId_y, TILE_DIM, lx);
     int y_index = mad24(groupId_x, TILE_DIM, ly);
 
-    __local T title[TILE_DIM * LDS_STEP];
+    __local T tile[TILE_DIM * LDS_STEP];
 
     if (x < src_cols && y < src_rows)
     {
-        int index_src = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
+        int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset));
 
         for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
             if (y + i < src_rows)
             {
-                __global const T * src = (__global const T *)(srcptr + index_src);
-                title[mad24(ly + i, LDS_STEP, lx)] = src[0];
+                tile[mad24(ly + i, LDS_STEP, lx)] = loadpix(srcptr + index_src);
                 index_src = mad24(BLOCK_ROWS, src_step, index_src);
             }
     }
@@ -92,13 +101,12 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
 
     if (x_index < src_rows && y_index < src_cols)
     {
-        int index_dst = mad24(y_index, dst_step, mad24(x_index, (int)sizeof(T), dst_offset));
+        int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset));
 
         for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
             if ((y_index + i) < src_cols)
             {
-                __global T * dst = (__global T *)(dstptr + index_dst);
-                dst[0] = title[mad24(lx, LDS_STEP, ly + i)];
+                storepix(tile[mad24(lx, LDS_STEP, ly + i)], dstptr + index_dst);
                 index_dst = mad24(BLOCK_ROWS, dst_step, index_dst);
             }
     }
@@ -111,14 +119,14 @@ __kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_o
 
     if (y < src_rows && x < y)
     {
-        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
-        int dst_index = mad24(x, src_step, mad24(y, (int)sizeof(T), src_offset));
+        int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
+        int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset));
 
-        __global T * src = (__global T *)(srcptr + src_index);
-        __global T * dst = (__global T *)(srcptr + dst_index);
+        __global uchar * src = srcptr + src_index;
+        __global uchar * dst = srcptr + dst_index;
 
-        T tmp = dst[0];
-        dst[0] = src[0];
-        src[0] = tmp;
+        T tmp = loadpix(dst);
+        storepix(loadpix(src), dst);
+        storepix(tmp, src);
     }
 }