reverted and generalized original ocl::transpose

2013-09-24 22:54:13 +04:00
parent c87d2d414d
commit 544c02407e
4 changed files with 72 additions and 14 deletions
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -64,7 +64,6 @@ namespace cv
    {
        //////////////////////////////// OpenCL kernel strings /////////////////////
        extern const char *transpose_kernel;
        extern const char *arithm_nonzero;
        extern const char *arithm_sum;
        extern const char *arithm_sum_3;
@@ -1265,9 +1264,8 @@ int cv::ocl::countNonZero(const oclMat &src)
        CV_Error(CV_GpuNotSupported, "select device don't support double");
    }
    CV_Assert(groupnum != 0);
-    groupnum = groupnum * 2;
+//    groupnum = groupnum * 2;
    int vlen = 8 , dbsize = groupnum * vlen;
    //cl_ulong start, end;
    Context *clCxt = src.clCxt;
    string kernelName = "arithm_op_nonzero";
    int *p = new int[dbsize], nonzero = 0;
@@ -1529,7 +1527,7 @@ oclMatExpr::operator oclMat() const
 #define TILE_DIM      (32)
 #define BLOCK_ROWS    (256/TILE_DIM)
-static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
+static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false)
 {
    Context  *clCxt = src.clCxt;
    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
@@ -1544,7 +1542,7 @@ static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
                                      channelsString[src.channels()]);
    size_t localThreads[3]  = { TILE_DIM, BLOCK_ROWS, 1 };
-    size_t globalThreads[3] = { src.cols, src.rows, 1 };
+    size_t globalThreads[3] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 };
    int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize();
    int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
@@ -1568,8 +1566,8 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)
    CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
    if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset
-            && dst.rows == dst.cols && src.cols == dst.cols)
+         && dst.size() == src.size())
-        transpose_run( src, dst, "transpose_inplace");
+        transpose_run( src, dst, "transpose_inplace", true);
    else
    {
        dst.create(src.cols, src.rows, src.type());
--- a/modules/ocl/src/opencl/arithm_minMax.cl
+++ b/modules/ocl/src/opencl/arithm_minMax.cl
@@ -44,9 +44,14 @@
 //M*/
 /**************************************PUBLICFUNC*************************************/
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_amd_fp64
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
 #endif
 #if defined (DEPTH_0)
 #define VEC_TYPE uchar8
--- a/modules/ocl/src/opencl/arithm_transpose.cl
+++ b/modules/ocl/src/opencl/arithm_transpose.cl
@@ -44,23 +44,78 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_amd_fp64
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
 #endif
 #define TILE_DIM      32
 #define BLOCK_ROWS    8
 #define LDS_STEP      TILE_DIM
 // Out-of-place transpose kernel: each work-group stages a tile of up to TILE_DIM rows of
 // src in the local buffer `title`, then writes it to dst with row and column swapped
 // (a source element at (row, col) ends up at dst (col, row)).
 //
 // src, dst               : element buffers of type T (T is not defined in this hunk;
 //                          presumably injected through build options, TODO confirm).
 // src_cols, src_rows     : source extent; every bounds check below is against these.
 // src_step, dst_step     : row pitch counted in T elements (they index the T* directly).
 // src_offset, dst_offset : start offset of each buffer, also counted in T elements.
 //
 // NOTE(review): the loops that advance by BLOCK_ROWS up to TILE_DIM assume a work-group of
 // TILE_DIM x BLOCK_ROWS work-items, each handling TILE_DIM / BLOCK_ROWS rows of the tile;
 // verify against the host-side launch configuration.
 __kernel void transpose(__global const T* src, __global T* dst,
    int src_cols, int src_rows,
    int src_step, int dst_step,
    int src_offset, int dst_offset)
 {
    // The new code works from work-group coordinates (gp_*) and the grid size in
    // work-groups (gs_*) instead of one global id per element.
-    int x = get_global_id(0);
+    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
-    int y = get_global_id(1);
+    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
    // Remap the launch coordinates (gp_x, gp_y) to the tile (groupId_x, groupId_y) that this
    // work-group actually processes; square and non-square sources use different mappings.
    // NOTE(review): this looks like the diagonal block reordering used to spread global
    // memory traffic across partitions; confirm the intent with the author.
    int groupId_x, groupId_y;
    if(src_rows == src_cols)
    {
        groupId_y = gp_x;
        groupId_x = (gp_x + gp_y) % gs_x;
    }
    else
    {
        // Linearize the group id, then split it again with gs_y as the fastest dimension.
        int bid = gp_x + gs_x * gp_y;
        groupId_y =  bid % gs_y;
        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
    }
    int lx = get_local_id(0);
    int ly = get_local_id(1);
    // (x, y) is the source column/row this work-item starts reading from.
    int x = groupId_x * TILE_DIM + lx;
    int y = groupId_y * TILE_DIM + ly;
    // (x_index, y_index) is the destination column/row this work-item starts writing to.
    int x_index = groupId_y * TILE_DIM + lx;
    int y_index = groupId_x * TILE_DIM + ly;
    // Local staging tile; consecutive tile rows are LDS_STEP elements apart.
    __local T title[TILE_DIM * LDS_STEP];
    if (x < src_cols && y < src_rows)
    {
-        int srcIdx = mad24(y, src_step, src_offset + x);
+        int index_src = mad24(y, src_step, x);
        // NOTE(review): dstIdx is only consumed by the removed line right below; in the new
        // code path shown here it appears to be unused. Confirm it is dropped upstream.
        int dstIdx = mad24(x, dst_step, dst_offset + y);
-        dst[dstIdx] = src[srcIdx];
+        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            // Load source rows y, y + BLOCK_ROWS, ... into the tile, skipping rows past the
            // bottom edge of the source.
            if (y + i < src_rows)
            {
                title[(ly + i) * LDS_STEP + lx] = src[src_offset + index_src];
                // Advance the source index by BLOCK_ROWS rows.
                index_src = mad24(BLOCK_ROWS, src_step, index_src);
            }
        }
    }
    // The barrier sits outside both bounds checks, so every work-item of the group reaches
    // it; local-memory writes to `title` are fenced before the read-back below.
    barrier(CLK_LOCAL_MEM_FENCE);
    if (x_index < src_rows && y_index < src_cols)
    {
        int index_dst = mad24(y_index, dst_step, x_index);
        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        {
            // dst has src_cols rows, so destination rows are checked against src_cols.
            if ((y_index + i) < src_cols)
            {
                // Reading the tile with swapped indices (lx selects the tile row, ly + i the
                // tile column) is what performs the transpose.
                dst[dst_offset + index_dst] = title[lx * LDS_STEP + ly + i];
                index_dst +=  dst_step * BLOCK_ROWS;
            }
        }
    }
 }
@@ -72,7 +127,7 @@ __kernel void transpose_inplace(__global T* src, __global T* dst,
    int x = get_global_id(0);
    int y = get_global_id(1);
-    if (x < src_cols && y < src_rows && x < y)
+    if (y < src_rows && x < y)
    {
        int srcIdx = mad24(y, src_step, src_offset + x);
        int dstIdx = mad24(x, dst_step, dst_offset + y);
--- a/modules/ocl/test/utility.hpp
+++ b/modules/ocl/test/utility.hpp
@@ -48,7 +48,7 @@
 #define MHEIGHT 256
 #define MIN_VALUE 171
-#define MAX_VALUE 351
+#define MAX_VALUE 357
 //#define RANDOMROI
 int randomInt(int minVal, int maxVal);