Merge pull request #2466 from ilya-lavrenov:tapi_transpose_3cn
This commit is contained in:
commit
a6c40abfb2
@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine(
|
|||||||
typedef Size_MatType TransposeFixture;
|
typedef Size_MatType TransposeFixture;
|
||||||
|
|
||||||
OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
|
OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
|
||||||
OCL_TEST_SIZES, OCL_TEST_TYPES))
|
OCL_TEST_SIZES, OCL_TEST_TYPES_134))
|
||||||
{
|
{
|
||||||
const Size_MatType_t params = GetParam();
|
const Size_MatType_t params = GetParam();
|
||||||
const Size srcSize = get<0>(params);
|
const Size srcSize = get<0>(params);
|
||||||
|
@ -2893,10 +2893,7 @@ static inline int divUp(int a, int b)
|
|||||||
static bool ocl_transpose( InputArray _src, OutputArray _dst )
|
static bool ocl_transpose( InputArray _src, OutputArray _dst )
|
||||||
{
|
{
|
||||||
const int TILE_DIM = 32, BLOCK_ROWS = 8;
|
const int TILE_DIM = 32, BLOCK_ROWS = 8;
|
||||||
int type = _src.type(), cn = CV_MAT_CN(type);
|
int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
|
||||||
|
|
||||||
if (cn == 3)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
UMat src = _src.getUMat();
|
UMat src = _src.getUMat();
|
||||||
_dst.create(src.cols, src.rows, type);
|
_dst.create(src.cols, src.rows, type);
|
||||||
@ -2912,8 +2909,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
|
|||||||
}
|
}
|
||||||
|
|
||||||
ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
|
ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
|
||||||
format("-D T=%s -D TILE_DIM=%d -D BLOCK_ROWS=%d",
|
format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d",
|
||||||
ocl::memopTypeToStr(type), TILE_DIM, BLOCK_ROWS));
|
ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
|
||||||
|
cn, TILE_DIM, BLOCK_ROWS));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
@ -43,6 +43,16 @@
|
|||||||
//
|
//
|
||||||
//M*/
|
//M*/
|
||||||
|
|
||||||
|
#if cn != 3
|
||||||
|
#define loadpix(addr) *(__global const T *)(addr)
|
||||||
|
#define storepix(val, addr) *(__global T *)(addr) = val
|
||||||
|
#define TSIZE (int)sizeof(T)
|
||||||
|
#else
|
||||||
|
#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
|
||||||
|
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
|
||||||
|
#define TSIZE ((int)sizeof(T1)*3)
|
||||||
|
#endif
|
||||||
|
|
||||||
#define LDS_STEP TILE_DIM
|
#define LDS_STEP TILE_DIM
|
||||||
|
|
||||||
__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
|
||||||
@ -74,17 +84,16 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
|
|||||||
int x_index = mad24(groupId_y, TILE_DIM, lx);
|
int x_index = mad24(groupId_y, TILE_DIM, lx);
|
||||||
int y_index = mad24(groupId_x, TILE_DIM, ly);
|
int y_index = mad24(groupId_x, TILE_DIM, ly);
|
||||||
|
|
||||||
__local T title[TILE_DIM * LDS_STEP];
|
__local T tile[TILE_DIM * LDS_STEP];
|
||||||
|
|
||||||
if (x < src_cols && y < src_rows)
|
if (x < src_cols && y < src_rows)
|
||||||
{
|
{
|
||||||
int index_src = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
|
int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset));
|
||||||
|
|
||||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
|
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
|
||||||
if (y + i < src_rows)
|
if (y + i < src_rows)
|
||||||
{
|
{
|
||||||
__global const T * src = (__global const T *)(srcptr + index_src);
|
tile[mad24(ly + i, LDS_STEP, lx)] = loadpix(srcptr + index_src);
|
||||||
title[mad24(ly + i, LDS_STEP, lx)] = src[0];
|
|
||||||
index_src = mad24(BLOCK_ROWS, src_step, index_src);
|
index_src = mad24(BLOCK_ROWS, src_step, index_src);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -92,13 +101,12 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
|
|||||||
|
|
||||||
if (x_index < src_rows && y_index < src_cols)
|
if (x_index < src_rows && y_index < src_cols)
|
||||||
{
|
{
|
||||||
int index_dst = mad24(y_index, dst_step, mad24(x_index, (int)sizeof(T), dst_offset));
|
int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset));
|
||||||
|
|
||||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
|
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
|
||||||
if ((y_index + i) < src_cols)
|
if ((y_index + i) < src_cols)
|
||||||
{
|
{
|
||||||
__global T * dst = (__global T *)(dstptr + index_dst);
|
storepix(tile[mad24(lx, LDS_STEP, ly + i)], dstptr + index_dst);
|
||||||
dst[0] = title[mad24(lx, LDS_STEP, ly + i)];
|
|
||||||
index_dst = mad24(BLOCK_ROWS, dst_step, index_dst);
|
index_dst = mad24(BLOCK_ROWS, dst_step, index_dst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -111,14 +119,14 @@ __kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_o
|
|||||||
|
|
||||||
if (y < src_rows && x < y)
|
if (y < src_rows && x < y)
|
||||||
{
|
{
|
||||||
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
|
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
|
||||||
int dst_index = mad24(x, src_step, mad24(y, (int)sizeof(T), src_offset));
|
int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset));
|
||||||
|
|
||||||
__global T * src = (__global T *)(srcptr + src_index);
|
__global const uchar * src = srcptr + src_index;
|
||||||
__global T * dst = (__global T *)(srcptr + dst_index);
|
__global uchar * dst = srcptr + dst_index;
|
||||||
|
|
||||||
T tmp = dst[0];
|
T tmp = loadpix(dst);
|
||||||
dst[0] = src[0];
|
storepix(loadpix(src), dst);
|
||||||
src[0] = tmp;
|
storepix(tmp, src);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user