extended ocl::convertTo

This commit is contained in:
Ilya Lavrenov
2013-09-11 13:35:39 +04:00
parent 6a21eca7e7
commit f20cc2bce8
4 changed files with 203 additions and 538 deletions

View File

@@ -382,40 +382,50 @@ void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
///////////////////////////////////////////////////////////////////////////
static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
{
string kernelName = "convert_to_S";
stringstream idxStr;
idxStr << src.depth();
kernelName += idxStr.str();
string kernelName = "convert_to";
float alpha_f = alpha, beta_f = beta;
int sdepth = src.depth(), ddepth = dst.depth();
int sstep1 = (int)src.step1(), dstep1 = (int)dst.step1();
int cols1 = src.cols * src.oclchannels();
char buildOptions[150], convertString[50];
const char * typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
sprintf(convertString, "convert_%s_sat_rte", typeMap[ddepth]);
sprintf(buildOptions, "-D srcT=%s -D dstT=%s -D convertToDstType=%s", typeMap[sdepth],
typeMap[ddepth], CV_32F == ddepth || ddepth == CV_64F ? "" : convertString);
CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
vector<pair<size_t , const void *> > args;
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3];
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
globalThreads[2] = 1;
int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
if(dst.type() == CV_8UC1)
{
globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
}
size_t localThreads[3] = { 16, 16, 1 };
size_t globalThreads[3] = { divUp(cols1, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1], 1 };
int doffset1 = dst.offset / dst.elemSize1();
int soffset1 = src.offset / src.elemSize1();
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&cols1 ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&sstep1 ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&soffset1 ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dstep1 ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&doffset1 ));
args.push_back( make_pair( sizeof(cl_float) , (void *)&alpha_f ));
args.push_back( make_pair( sizeof(cl_float) , (void *)&beta_f ));
openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
localThreads, args, dst.oclchannels(), dst.depth());
localThreads, args, -1, -1, buildOptions);
}
void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
{
//cout << "cv::ocl::oclMat::convertTo()" << endl;
if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&
(depth() == CV_64F || dst.depth() == CV_64F))
{
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
return;
}
bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
&& fabs(beta) < std::numeric_limits<double>::epsilon();
@@ -425,7 +435,6 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
else
rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
//int scn = channels();
int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
if( sdepth == ddepth && noScale )
{
@@ -447,7 +456,6 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
///////////////////////////////////////////////////////////////////////////
oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
{
//cout << "cv::ocl::oclMat::=" << endl;
setTo(s);
return *this;
}

View File

@@ -33,352 +33,28 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
#define F float
#define F2 float2
#define F4 float4
__kernel void convert_to_S4_C1_D0(
__global const int* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
}
}
}
__kernel void convert_to_S4_C4_D0(
__global const int4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
}
#ifdef DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel void convert_to_S5_C1_D0(
__global const float* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
__kernel void convert_to(
__global const srcT* restrict srcMat,
__global dstT* dstMat,
int cols1, int rows,
int sstep1, int soffset1,
int dstep1, int doffset1,
float alpha, float beta)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if(x+3<cols && y<rows && off_src==0)
{
float4 temp_src = vload4(0,srcMat+srcidx);
*(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
}
else
{
if(x+3<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
dstMat[dstidx+3] = temp_dst.w;
}
else if(x+2<cols && y<rows)
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
dstMat[dstidx+2] = temp_dst.z;
}
else if(x+1<cols && y<rows)
{
float2 temp_src = vload2(0,srcMat+srcidx);
uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
dstMat[dstidx] = temp_dst.x;
dstMat[dstidx+1] = temp_dst.y;
}
else if(x<cols && y<rows)
{
dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
}
}
}
__kernel void convert_to_S5_C4_D0(
__global const float4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
}
int x = get_global_id(0);
int y = get_global_id(1);
__kernel void convert_to_S0_C1_D4(
__global const uchar* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
int srcidx = mad24(y, sstep1, x + soffset1);
int dstidx = mad24(y, dstep1, x + doffset1);
if ( (x < cols1) && (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C1_D4(
__global const float* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C4_D4(
__global const uchar4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C4_D4(
__global const float4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C1_D5(
__global const uchar* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S4_C1_D5(
__global const int* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S0_C4_D5(
__global const uchar4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S4_C4_D5(
__global const int4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
dstMat[dstidx] = convertToDstType(temp_src*alpha+beta);
}
}