diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 035cea781..c1147cb41 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -64,7 +64,6 @@ namespace cv { //////////////////////////////// OpenCL kernel strings ///////////////////// - extern const char *transpose_kernel; extern const char *arithm_nonzero; extern const char *arithm_sum; extern const char *arithm_sum_3; @@ -1265,9 +1264,8 @@ int cv::ocl::countNonZero(const oclMat &src) CV_Error(CV_GpuNotSupported, "select device don't support double"); } CV_Assert(groupnum != 0); - groupnum = groupnum * 2; +// groupnum = groupnum * 2; int vlen = 8 , dbsize = groupnum * vlen; - //cl_ulong start, end; Context *clCxt = src.clCxt; string kernelName = "arithm_op_nonzero"; int *p = new int[dbsize], nonzero = 0; @@ -1529,7 +1527,7 @@ oclMatExpr::operator oclMat() const #define TILE_DIM (32) #define BLOCK_ROWS (256/TILE_DIM) -static void transpose_run(const oclMat &src, oclMat &dst, string kernelName) +static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false) { Context *clCxt = src.clCxt; if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F) @@ -1544,7 +1542,7 @@ static void transpose_run(const oclMat &src, oclMat &dst, string kernelName) channelsString[src.channels()]); size_t localThreads[3] = { TILE_DIM, BLOCK_ROWS, 1 }; - size_t globalThreads[3] = { src.cols, src.rows, 1 }; + size_t globalThreads[3] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 }; int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize(); int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); @@ -1568,8 +1566,8 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst) CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset - && dst.rows == dst.cols && src.cols == dst.cols) - transpose_run( src, dst, "transpose_inplace"); + && dst.size() == src.size()) + transpose_run( src, dst, "transpose_inplace", true); else { dst.create(src.cols, src.rows, src.type()); diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl index 1dcb138eb..23b293306 100644 --- a/modules/ocl/src/opencl/arithm_minMax.cl +++ b/modules/ocl/src/opencl/arithm_minMax.cl @@ -44,9 +44,14 @@ //M*/ /**************************************PUBLICFUNC*************************************/ + #if defined (DOUBLE_SUPPORT) +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif +#endif #if defined (DEPTH_0) #define VEC_TYPE uchar8 diff --git a/modules/ocl/src/opencl/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl index 57f7f1b9d..5328d1f1b 100644 --- a/modules/ocl/src/opencl/arithm_transpose.cl +++ b/modules/ocl/src/opencl/arithm_transpose.cl @@ -44,23 +44,78 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined (cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif +#endif + +#define TILE_DIM 32 +#define BLOCK_ROWS 8 +#define LDS_STEP TILE_DIM __kernel void transpose(__global const T* src, __global T* dst, int src_cols, int src_rows, int src_step, int dst_step, int src_offset, int dst_offset) { - int x = get_global_id(0); - int y = get_global_id(1); + int gp_x = get_group_id(0), gp_y = get_group_id(1); + int gs_x = get_num_groups(0), gs_y = get_num_groups(1); + + int groupId_x, groupId_y; + + if(src_rows == src_cols) + { + groupId_y = gp_x; + groupId_x = (gp_x + gp_y) % gs_x; + } + else + { + int bid = gp_x + gs_x * gp_y; + groupId_y = bid % gs_y; + groupId_x = ((bid / gs_y) + groupId_y) % gs_x; + } + + int lx = get_local_id(0); + int ly = get_local_id(1); + + int x = groupId_x * TILE_DIM + lx; + int y = groupId_y * TILE_DIM + ly; + + int x_index = groupId_y * TILE_DIM + lx; + int y_index = groupId_x * TILE_DIM + ly; + + __local T title[TILE_DIM * LDS_STEP]; if (x < src_cols && y < src_rows) { - int srcIdx = mad24(y, src_step, src_offset + x); - int dstIdx = mad24(x, dst_step, dst_offset + y); + int index_src = mad24(y, src_step, x); - dst[dstIdx] = src[srcIdx]; + for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) + { + if (y + i < src_rows) + { + title[(ly + i) * LDS_STEP + lx] = src[src_offset + index_src]; + index_src = mad24(BLOCK_ROWS, src_step, index_src); + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x_index < src_rows && y_index < src_cols) + { + int index_dst = mad24(y_index, dst_step, x_index); + + for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS) + { + if ((y_index + i) < src_cols) + { + dst[dst_offset + index_dst] = title[lx * LDS_STEP + ly + i]; + index_dst += dst_step * BLOCK_ROWS; + } + } } } @@ -72,7 +127,7 @@ __kernel void transpose_inplace(__global T* src, __global T* dst, int x = get_global_id(0); int y = get_global_id(1); - if (x < src_cols && y < src_rows && x < y) + if (y < src_rows && x < y) { int srcIdx = mad24(y, src_step, src_offset + x); int dstIdx = mad24(x, dst_step, dst_offset + y); diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp index 48c8bbcd9..7c491916f 100644 --- a/modules/ocl/test/utility.hpp +++ b/modules/ocl/test/utility.hpp @@ -48,7 +48,7 @@ #define MHEIGHT 256 #define MIN_VALUE 171 -#define MAX_VALUE 351 +#define MAX_VALUE 357 //#define RANDOMROI int randomInt(int minVal, int maxVal);