Merge pull request #2466 from ilya-lavrenov:tapi_transpose_3cn

2014-03-13 16:56:01 +04:00
parent 0764a23eb2 89b7c0dd93
commit a6c40abfb2
3 changed files with 27 additions and 21 deletions
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine(
 typedef Size_MatType TransposeFixture;
 OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
-                OCL_TEST_SIZES, OCL_TEST_TYPES))
+                OCL_TEST_SIZES, OCL_TEST_TYPES_134))
 {
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2893,10 +2893,7 @@ static inline int divUp(int a, int b)
 static bool ocl_transpose( InputArray _src, OutputArray _dst )
 {
    const int TILE_DIM = 32, BLOCK_ROWS = 8;
-    int type = _src.type(), cn = CV_MAT_CN(type);
+    int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
-    if (cn == 3)
-        return false;
    UMat src = _src.getUMat();
    _dst.create(src.cols, src.rows, type);
@@ -2912,8 +2909,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
    }
    ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
-                  format("-D T=%s -D TILE_DIM=%d -D BLOCK_ROWS=%d",
+                  format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d",
-                         ocl::memopTypeToStr(type), TILE_DIM, BLOCK_ROWS));
+                         ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
+                         cn, TILE_DIM, BLOCK_ROWS));
    if (k.empty())
        return false;
--- a/modules/core/src/opencl/transpose.cl
+++ b/modules/core/src/opencl/transpose.cl
@@ -43,6 +43,16 @@
 //
 //M*/
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#endif
 #define LDS_STEP      TILE_DIM
 __kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
@@ -74,17 +84,16 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
    int x_index = mad24(groupId_y, TILE_DIM, lx);
    int y_index = mad24(groupId_x, TILE_DIM, ly);
-    __local T title[TILE_DIM * LDS_STEP];
+    __local T tile[TILE_DIM * LDS_STEP];
    if (x < src_cols && y < src_rows)
    {
-        int index_src = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
+        int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset));
        for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (y + i < src_rows)
            {
-                __global const T * src = (__global const T *)(srcptr + index_src);
+                tile[mad24(ly + i, LDS_STEP, lx)] = loadpix(srcptr + index_src);
-                title[mad24(ly + i, LDS_STEP, lx)] = src[0];
                index_src = mad24(BLOCK_ROWS, src_step, index_src);
            }
    }
@@ -92,13 +101,12 @@ __kernel void transpose(__global const uchar * srcptr, int src_step, int src_off
    if (x_index < src_rows && y_index < src_cols)
    {
-        int index_dst = mad24(y_index, dst_step, mad24(x_index, (int)sizeof(T), dst_offset));
+        int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset));
        for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((y_index + i) < src_cols)
            {
-                __global T * dst = (__global T *)(dstptr + index_dst);
+                storepix(tile[mad24(lx, LDS_STEP, ly + i)], dstptr + index_dst);
-                dst[0] = title[mad24(lx, LDS_STEP, ly + i)];
                index_dst = mad24(BLOCK_ROWS, dst_step, index_dst);
            }
    }
@@ -111,14 +119,14 @@ __kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_o
    if (y < src_rows && x < y)
    {
-        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
+        int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
-        int dst_index = mad24(x, src_step, mad24(y, (int)sizeof(T), src_offset));
+        int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset));
-        __global T * src = (__global T *)(srcptr + src_index);
+        __global const uchar * src = srcptr + src_index;
-        __global T * dst = (__global T *)(srcptr + dst_index);
+        __global uchar * dst = srcptr + dst_index;
-        T tmp = dst[0];
+        T tmp = loadpix(dst);
-        dst[0] = src[0];
+        storepix(loadpix(src), dst);
-        src[0] = tmp;
+        storepix(tmp, src);
    }
 }