Reverted and generalized the original ocl::transpose
This commit is contained in:
parent
c87d2d414d
commit
544c02407e
@ -64,7 +64,6 @@ namespace cv
|
|||||||
{
|
{
|
||||||
//////////////////////////////// OpenCL kernel strings /////////////////////
|
//////////////////////////////// OpenCL kernel strings /////////////////////
|
||||||
|
|
||||||
extern const char *transpose_kernel;
|
|
||||||
extern const char *arithm_nonzero;
|
extern const char *arithm_nonzero;
|
||||||
extern const char *arithm_sum;
|
extern const char *arithm_sum;
|
||||||
extern const char *arithm_sum_3;
|
extern const char *arithm_sum_3;
|
||||||
@ -1265,9 +1264,8 @@ int cv::ocl::countNonZero(const oclMat &src)
|
|||||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||||
}
|
}
|
||||||
CV_Assert(groupnum != 0);
|
CV_Assert(groupnum != 0);
|
||||||
groupnum = groupnum * 2;
|
// groupnum = groupnum * 2;
|
||||||
int vlen = 8 , dbsize = groupnum * vlen;
|
int vlen = 8 , dbsize = groupnum * vlen;
|
||||||
//cl_ulong start, end;
|
|
||||||
Context *clCxt = src.clCxt;
|
Context *clCxt = src.clCxt;
|
||||||
string kernelName = "arithm_op_nonzero";
|
string kernelName = "arithm_op_nonzero";
|
||||||
int *p = new int[dbsize], nonzero = 0;
|
int *p = new int[dbsize], nonzero = 0;
|
||||||
@ -1529,7 +1527,7 @@ oclMatExpr::operator oclMat() const
|
|||||||
#define TILE_DIM (32)
|
#define TILE_DIM (32)
|
||||||
#define BLOCK_ROWS (256/TILE_DIM)
|
#define BLOCK_ROWS (256/TILE_DIM)
|
||||||
|
|
||||||
static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
|
static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false)
|
||||||
{
|
{
|
||||||
Context *clCxt = src.clCxt;
|
Context *clCxt = src.clCxt;
|
||||||
if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||||
@ -1544,7 +1542,7 @@ static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
|
|||||||
channelsString[src.channels()]);
|
channelsString[src.channels()]);
|
||||||
|
|
||||||
size_t localThreads[3] = { TILE_DIM, BLOCK_ROWS, 1 };
|
size_t localThreads[3] = { TILE_DIM, BLOCK_ROWS, 1 };
|
||||||
size_t globalThreads[3] = { src.cols, src.rows, 1 };
|
size_t globalThreads[3] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 };
|
||||||
|
|
||||||
int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize();
|
int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize();
|
||||||
int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
|
int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
|
||||||
@ -1568,8 +1566,8 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)
|
|||||||
CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
|
CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
|
||||||
|
|
||||||
if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset
|
if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset
|
||||||
&& dst.rows == dst.cols && src.cols == dst.cols)
|
&& dst.size() == src.size())
|
||||||
transpose_run( src, dst, "transpose_inplace");
|
transpose_run( src, dst, "transpose_inplace", true);
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
dst.create(src.cols, src.rows, src.type());
|
dst.create(src.cols, src.rows, src.type());
|
||||||
|
@ -44,9 +44,14 @@
|
|||||||
//M*/
|
//M*/
|
||||||
|
|
||||||
/**************************************PUBLICFUNC*************************************/
|
/**************************************PUBLICFUNC*************************************/
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
#if defined (DOUBLE_SUPPORT)
|
||||||
|
#ifdef cl_amd_fp64
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||||
|
#elif defined (cl_khr_fp64)
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined (DEPTH_0)
|
#if defined (DEPTH_0)
|
||||||
#define VEC_TYPE uchar8
|
#define VEC_TYPE uchar8
|
||||||
|
@ -44,23 +44,78 @@
|
|||||||
//M*/
|
//M*/
|
||||||
|
|
||||||
#if defined (DOUBLE_SUPPORT)
|
#if defined (DOUBLE_SUPPORT)
|
||||||
|
#ifdef cl_amd_fp64
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||||
|
#elif defined (cl_khr_fp64)
|
||||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define TILE_DIM 32
|
||||||
|
#define BLOCK_ROWS 8
|
||||||
|
#define LDS_STEP TILE_DIM
|
||||||
|
|
||||||
__kernel void transpose(__global const T* src, __global T* dst,
|
__kernel void transpose(__global const T* src, __global T* dst,
|
||||||
int src_cols, int src_rows,
|
int src_cols, int src_rows,
|
||||||
int src_step, int dst_step,
|
int src_step, int dst_step,
|
||||||
int src_offset, int dst_offset)
|
int src_offset, int dst_offset)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int gp_x = get_group_id(0), gp_y = get_group_id(1);
|
||||||
int y = get_global_id(1);
|
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
|
||||||
|
|
||||||
|
int groupId_x, groupId_y;
|
||||||
|
|
||||||
|
if(src_rows == src_cols)
|
||||||
|
{
|
||||||
|
groupId_y = gp_x;
|
||||||
|
groupId_x = (gp_x + gp_y) % gs_x;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int bid = gp_x + gs_x * gp_y;
|
||||||
|
groupId_y = bid % gs_y;
|
||||||
|
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
|
||||||
|
}
|
||||||
|
|
||||||
|
int lx = get_local_id(0);
|
||||||
|
int ly = get_local_id(1);
|
||||||
|
|
||||||
|
int x = groupId_x * TILE_DIM + lx;
|
||||||
|
int y = groupId_y * TILE_DIM + ly;
|
||||||
|
|
||||||
|
int x_index = groupId_y * TILE_DIM + lx;
|
||||||
|
int y_index = groupId_x * TILE_DIM + ly;
|
||||||
|
|
||||||
|
__local T title[TILE_DIM * LDS_STEP];
|
||||||
|
|
||||||
if (x < src_cols && y < src_rows)
|
if (x < src_cols && y < src_rows)
|
||||||
{
|
{
|
||||||
int srcIdx = mad24(y, src_step, src_offset + x);
|
int index_src = mad24(y, src_step, x);
|
||||||
int dstIdx = mad24(x, dst_step, dst_offset + y);
|
|
||||||
|
|
||||||
dst[dstIdx] = src[srcIdx];
|
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
|
||||||
|
{
|
||||||
|
if (y + i < src_rows)
|
||||||
|
{
|
||||||
|
title[(ly + i) * LDS_STEP + lx] = src[src_offset + index_src];
|
||||||
|
index_src = mad24(BLOCK_ROWS, src_step, index_src);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (x_index < src_rows && y_index < src_cols)
|
||||||
|
{
|
||||||
|
int index_dst = mad24(y_index, dst_step, x_index);
|
||||||
|
|
||||||
|
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
|
||||||
|
{
|
||||||
|
if ((y_index + i) < src_cols)
|
||||||
|
{
|
||||||
|
dst[dst_offset + index_dst] = title[lx * LDS_STEP + ly + i];
|
||||||
|
index_dst += dst_step * BLOCK_ROWS;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,7 +127,7 @@ __kernel void transpose_inplace(__global T* src, __global T* dst,
|
|||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y = get_global_id(1);
|
||||||
|
|
||||||
if (x < src_cols && y < src_rows && x < y)
|
if (y < src_rows && x < y)
|
||||||
{
|
{
|
||||||
int srcIdx = mad24(y, src_step, src_offset + x);
|
int srcIdx = mad24(y, src_step, src_offset + x);
|
||||||
int dstIdx = mad24(x, dst_step, dst_offset + y);
|
int dstIdx = mad24(x, dst_step, dst_offset + y);
|
||||||
|
@ -48,7 +48,7 @@
|
|||||||
#define MHEIGHT 256
|
#define MHEIGHT 256
|
||||||
|
|
||||||
#define MIN_VALUE 171
|
#define MIN_VALUE 171
|
||||||
#define MAX_VALUE 351
|
#define MAX_VALUE 357
|
||||||
|
|
||||||
//#define RANDOMROI
|
//#define RANDOMROI
|
||||||
int randomInt(int minVal, int maxVal);
|
int randomInt(int minVal, int maxVal);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user