performance and bug fix for addWeighted cartToPolar div exp log resize setTo
add channel 3 support add fast way Between CPU and GPU for the data which is aligned
This commit is contained in:
@@ -455,13 +455,12 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
|
||||
}
|
||||
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support ==0)
|
||||
{
|
||||
CV_Error(-217,"Selected device don't support double\r\n");
|
||||
return;
|
||||
}
|
||||
|
||||
arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
|
||||
if(src1.clCxt -> impl -> double_support !=0)
|
||||
arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
|
||||
else
|
||||
arithmetic_run<float>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
|
||||
|
||||
}
|
||||
template <typename WT ,typename CL_WT>
|
||||
void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar)
|
||||
@@ -579,7 +578,14 @@ void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, co
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
|
||||
|
||||
if(src.clCxt -> impl -> double_support !=0)
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
|
||||
else
|
||||
{
|
||||
float f_scalar = (float)scalar;
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar));
|
||||
}
|
||||
|
||||
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
|
||||
}
|
||||
@@ -670,9 +676,9 @@ void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string ker
|
||||
int cols = divUp(dst.cols + offset_cols, vector_length);
|
||||
size_t localThreads[3] = { 64, 4, 1 };
|
||||
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
|
||||
divUp(dst.rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
divUp(dst.rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
int dst_step1 = dst.cols * dst.elemSize();
|
||||
vector<pair<size_t , const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
|
||||
@@ -1253,7 +1259,11 @@ void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, c
|
||||
CV_Assert( src.type() == CV_32F || src.type() == CV_64F);
|
||||
|
||||
Context *clCxt = src.clCxt;
|
||||
|
||||
if(clCxt -> impl -> double_support ==0 && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(-217,"Selected device don't support double\r\n");
|
||||
return;
|
||||
}
|
||||
//int channels = dst.channels();
|
||||
int depth = dst.depth();
|
||||
|
||||
@@ -2193,56 +2203,46 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
|
||||
|
||||
size_t localThreads[3] = { 256, 1, 1 };
|
||||
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
|
||||
divUp(dst.rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
divUp(dst.rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
|
||||
int dst_step1 = dst.cols * dst.elemSize();
|
||||
vector<pair<size_t , const void *> > args;
|
||||
if(sizeof(double) == 8)
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
|
||||
|
||||
if(src1.clCxt -> impl -> double_support != 0)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&alpha ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&beta ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&gama ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
}
|
||||
}
|
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
|
||||
openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth);
|
||||
}
|
||||
|
||||
void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst)
|
||||
{
|
||||
CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() &&
|
||||
(src1.depth() == CV_32F ));
|
||||
(src1.depth() == CV_32F ));
|
||||
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
@@ -2265,9 +2265,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst)
|
||||
|
||||
size_t localThreads[3] = { 256, 1, 1 };
|
||||
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
|
||||
divUp(dst.rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
divUp(dst.rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
|
||||
int dst_step1 = dst.cols * dst.elemSize();
|
||||
vector<pair<size_t , const void *> > args;
|
||||
@@ -2313,9 +2313,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, oclMat &dst)
|
||||
|
||||
size_t localThreads[3] = { 256, 1, 1 };
|
||||
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
|
||||
divUp(dst.rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
divUp(dst.rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
|
||||
int dst_step1 = dst.cols * dst.elemSize();
|
||||
vector<pair<size_t , const void *> > args;
|
||||
@@ -2348,9 +2348,9 @@ void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernel
|
||||
|
||||
size_t localThreads[3] = { 64, 4, 1 };
|
||||
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
|
||||
divUp(rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
divUp(rows, localThreads[1]) * localThreads[1],
|
||||
1
|
||||
};
|
||||
|
||||
int dst_step1 = dst.cols * dst.elemSize();
|
||||
vector<pair<size_t , const void *> > args;
|
||||
|
||||
Reference in New Issue
Block a user