Merge pull request #1555 from ilya-lavrenov:ocl_convertC3C4

This commit is contained in:
Roman Donchenko 2013-10-03 16:52:10 +04:00 committed by OpenCV Buildbot
commit 8e75947a7d
3 changed files with 59 additions and 136 deletions

View File

@ -58,12 +58,13 @@ using namespace std;
//////////////////////////////// oclMat //////////////////////////////// //////////////////////////////// oclMat ////////////////////////////////
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
//helper routines // helper routines
namespace cv namespace cv
{ {
namespace ocl namespace ocl
{ {
///////////////////////////OpenCL kernel strings/////////////////////////// /////////////////////////// OpenCL kernel strings ///////////////////////////
extern const char *operator_copyToM; extern const char *operator_copyToM;
extern const char *operator_convertTo; extern const char *operator_convertTo;
extern const char *operator_setTo; extern const char *operator_setTo;
@ -74,42 +75,18 @@ namespace cv
} }
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// convert_C3C4 // convert_C3C4
static void convert_C3C4(const cl_mem &src, oclMat &dst) static void convert_C3C4(const cl_mem &src, oclMat &dst)
{ {
int dstStep_in_pixel = dst.step1() / dst.oclchannels();
int pixel_end = dst.wholecols * dst.wholerows - 1;
Context *clCxt = dst.clCxt; Context *clCxt = dst.clCxt;
string kernelName = "convertC3C4"; int pixel_end = dst.wholecols * dst.wholerows - 1;
char compile_option[32]; int dstStep_in_pixel = dst.step1() / dst.oclchannels();
switch(dst.depth())
{ const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
case 0: std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[dst.depth()]);
sprintf(compile_option, "-D GENTYPE4=uchar4");
break;
case 1:
sprintf(compile_option, "-D GENTYPE4=char4");
break;
case 2:
sprintf(compile_option, "-D GENTYPE4=ushort4");
break;
case 3:
sprintf(compile_option, "-D GENTYPE4=short4");
break;
case 4:
sprintf(compile_option, "-D GENTYPE4=int4");
break;
case 5:
sprintf(compile_option, "-D GENTYPE4=float4");
break;
case 6:
sprintf(compile_option, "-D GENTYPE4=double4");
break;
default:
CV_Error(CV_StsUnsupportedFormat, "unknown depth");
}
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src)); args.push_back( make_pair( sizeof(cl_mem), (void *)&src));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
@ -118,46 +95,24 @@ static void convert_C3C4(const cl_mem &src, oclMat &dst)
args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel)); args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
size_t globalThreads[3] = {((dst.wholecols * dst.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1}; size_t globalThreads[3] = { divUp(dst.wholecols * dst.wholerows, 4), 1, 1 };
size_t localThreads[3] = {256, 1, 1}; size_t localThreads[3] = { 256, 1, 1 };
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, localThreads,
args, -1, -1, buildOptions.c_str());
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// convert_C4C3 // convert_C4C3
static void convert_C4C3(const oclMat &src, cl_mem &dst) static void convert_C4C3(const oclMat &src, cl_mem &dst)
{ {
int srcStep_in_pixel = src.step1() / src.oclchannels(); int srcStep_in_pixel = src.step1() / src.oclchannels();
int pixel_end = src.wholecols * src.wholerows - 1; int pixel_end = src.wholecols * src.wholerows - 1;
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
string kernelName = "convertC4C3";
char compile_option[32]; const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
switch(src.depth()) std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[src.depth()]);
{
case 0:
sprintf(compile_option, "-D GENTYPE4=uchar4");
break;
case 1:
sprintf(compile_option, "-D GENTYPE4=char4");
break;
case 2:
sprintf(compile_option, "-D GENTYPE4=ushort4");
break;
case 3:
sprintf(compile_option, "-D GENTYPE4=short4");
break;
case 4:
sprintf(compile_option, "-D GENTYPE4=int4");
break;
case 5:
sprintf(compile_option, "-D GENTYPE4=float4");
break;
case 6:
sprintf(compile_option, "-D GENTYPE4=double4");
break;
default:
CV_Error(CV_StsUnsupportedFormat, "unknown depth");
}
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
@ -167,10 +122,10 @@ static void convert_C4C3(const oclMat &src, cl_mem &dst)
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel)); args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
size_t globalThreads[3] = {((src.wholecols * src.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1}; size_t globalThreads[3] = { divUp(src.wholecols * src.wholerows, 4), 1, 1};
size_t localThreads[3] = {256, 1, 1}; size_t localThreads[3] = { 256, 1, 1 };
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
} }
void cv::ocl::oclMat::upload(const Mat &m) void cv::ocl::oclMat::upload(const Mat &m)
@ -179,14 +134,10 @@ void cv::ocl::oclMat::upload(const Mat &m)
Size wholeSize; Size wholeSize;
Point ofs; Point ofs;
m.locateROI(wholeSize, ofs); m.locateROI(wholeSize, ofs);
// int type = m.type();
// if(m.oclchannels() == 3)
//{
// type = CV_MAKETYPE(m.depth(), 4);
//}
create(wholeSize, m.type()); create(wholeSize, m.type());
if(m.channels() == 3) if (m.channels() == 3)
{ {
int pitch = wholeSize.width * 3 * m.elemSize1(); int pitch = wholeSize.width * 3 * m.elemSize1();
int tail_padding = m.elemSize1() * 3072; int tail_padding = m.elemSize1() * 3072;
@ -197,35 +148,15 @@ void cv::ocl::oclMat::upload(const Mat &m)
openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3); openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
convert_C3C4(temp, *this); convert_C3C4(temp, *this);
//int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
// 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL));
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
//for(int i=0;i<wholeSize.height;i++)
//{
// int *a = cputemp+i*wholeSize.width * 3,*b = cpudata + i*this->step/sizeof(int);
// for(int j=0;j<wholeSize.width;j++)
// {
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
// }
//}
//delete []cputemp;
//delete []cpudata;
openCLSafeCall(clReleaseMemObject(temp)); openCLSafeCall(clReleaseMemObject(temp));
} }
else else
{
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice); openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
}
rows = m.rows; rows = m.rows;
cols = m.cols; cols = m.cols;
offset = ofs.y * step + ofs.x * elemSize(); offset = ofs.y * step + ofs.x * elemSize();
//download_channels = m.channels();
} }
cv::ocl::oclMat::operator cv::_InputArray() cv::ocl::oclMat::operator cv::_InputArray()
@ -259,11 +190,6 @@ cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
void cv::ocl::oclMat::download(cv::Mat &m) const void cv::ocl::oclMat::download(cv::Mat &m) const
{ {
CV_DbgAssert(!this->empty()); CV_DbgAssert(!this->empty());
// int t = type();
// if(download_channels == 3)
//{
// t = CV_MAKETYPE(depth(), 3);
//}
m.create(wholerows, wholecols, type()); m.create(wholerows, wholecols, type());
if(m.channels() == 3) if(m.channels() == 3)
@ -277,30 +203,14 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
convert_C4C3(*this, temp); convert_C4C3(*this, temp);
openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3); openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
//int* cputemp=new int[wholecols*wholerows * 3];
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
// 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL));
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
//for(int i=0;i<wholerows;i++)
//{
// int *a = cputemp+i*wholecols * 3,*b = cpudata + i*this->step/sizeof(int);
// for(int j=0;j<wholecols;j++)
// {
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
// }
//}
//delete []cputemp;
//delete []cpudata;
openCLSafeCall(clReleaseMemObject(temp)); openCLSafeCall(clReleaseMemObject(temp));
} }
else else
{ {
openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost); openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
} }
Size wholesize; Size wholesize;
Point ofs; Point ofs;
locateROI(wholesize, ofs); locateROI(wholesize, ofs);
@ -323,6 +233,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
{"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"}, {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
{"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"} {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
}; };
char compile_option[32]; char compile_option[32];
sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str()); sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
size_t localThreads[3] = {16, 16, 1}; size_t localThreads[3] = {16, 16, 1};
@ -366,6 +277,7 @@ void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo //////////////////////////////// //////////////////////////////// ConvertTo ////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta) static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
{ {
string kernelName = "convert_to"; string kernelName = "convert_to";
@ -404,6 +316,7 @@ static void convert_run(const oclMat &src, oclMat &dst, double alpha, double bet
openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads, openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
localThreads, args, -1, -1, buildOptions); localThreads, args, -1, -1, buildOptions);
} }
void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
{ {
if (!clCxt->supportsFeature(Context::CL_DOUBLE) && if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&

View File

@ -32,23 +32,23 @@
// the use of this software, even if advised of the possibility of such damage. // the use of this software, even if advised of the possibility of such damage.
// //
// //
//#pragma OPENCL EXTENSION cl_amd_printf : enable
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif #endif
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows, __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end) int dstStep_in_piexl,int pixel_end)
{ {
int id = get_global_id(0); int id = get_global_id(0);
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2)); int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end); pixelid = clamp(pixelid,0,pixel_end);
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3; GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
pixel0 = src[pixelid.x]; pixel0 = src[pixelid.x];
pixel1 = src[pixelid.y]; pixel1 = src[pixelid.y];
pixel2 = src[pixelid.z]; pixel2 = src[pixelid.z];
outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0); outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0); outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0); outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
@ -56,17 +56,19 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY
int4 outy = (id<<2)/cols; int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols; int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2; outx += (int4)(0, 1, 2, 3);
outx.w+=3; outy = select(outy, outy+1, outx>=cols);
outy = select(outy,outy+1,outx>=cols); outx = select(outx, outx-cols, outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows)); // when cols == 1
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows)); outy = select(outy, outy + 1, outx >= cols);
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows)); outx = select(outx, outx-cols, outx >= cols);
//outx = select(outx,(int4)outx.x,outy>=rows); outy = select(outy, outy + 1, outx >= cols);
//outy = select(outy,(int4)outy.x,outy>=rows); outx = select(outx, outx-cols, outx >= cols);
int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx); int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows) if(outx.w<cols && outy.w<rows)
{ {
dst[addr.x] = outpix0; dst[addr.x] = outpix0;
@ -91,20 +93,26 @@ __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTY
} }
} }
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows, __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep_in_pixel,int pixel_end) int srcStep_in_pixel,int pixel_end)
{ {
int id = get_global_id(0)<<2; int id = get_global_id(0)<<2;
int y = id / cols; int y = id / cols;
int x = id % cols; int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3); int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols); int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols); x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
// when cols == 1
y4 = select(y4, y4 + 1,x4>=(int4)cols);
x4 = select(x4, x4 - (int4)cols,x4>=(int4)cols);
y4 = select(y4, y4 + 1,x4>=(int4)cols);
x4 = select(x4, x4-(int4)cols,x4>=(int4)cols);
y4=clamp(y4,(int4)0,(int4)(rows-1));
int4 addr = mad24(y4, (int4)srcStep_in_pixel, x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2; GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
pixel0 = src[addr.x]; pixel0 = src[addr.x];
pixel1 = src[addr.y]; pixel1 = src[addr.y];
@ -120,9 +128,11 @@ __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTY
outpixel2.y = pixel3.x; outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y; outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z; outpixel2.w = pixel3.z;
int4 outaddr = mul24(id>>2 , 3); int4 outaddr = mul24(id>>2 , 3);
outaddr.y++; outaddr.y++;
outaddr.z+=2; outaddr.z+=2;
if(outaddr.z <= pixel_end) if(outaddr.z <= pixel_end)
{ {
dst[outaddr.x] = pixel0; dst[outaddr.x] = pixel0;

View File

@ -402,7 +402,7 @@ PARAM_TEST_CASE(convertC3C4, MatType, bool)
int type = CV_MAKE_TYPE(depth, 3); int type = CV_MAKE_TYPE(depth, 3);
cv::RNG &rng = TS::ptr()->get_rng(); cv::RNG &rng = TS::ptr()->get_rng();
src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 0, 40, false); src = randomMat(rng, randomSize(1, MAX_VALUE), type, 0, 40, false);
} }
void random_roi() void random_roi()