rewrote and generalized ocl::threshold
This commit is contained in:
@ -98,80 +98,66 @@ namespace cv
// threshold
// NOTE(review): statements from two functions (threshold_8u and scalarToVector)
// appear interleaved in this span and their braces are not visible. Each comment
// below describes only the statement(s) that directly follow it.
// Pointer type for a threshold routine taking (src, dst, thresh, maxVal, type).
typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type);
// Launcher that hands uchar threshold/max values to the "threshold" kernel.
static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
// Packs a cv::Scalar into a raw byte buffer of `ocn` elements of the given depth:
// `cn` elements are written from the scalar, the remaining ones are zero-filled.
static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
// The threshold is floored and the maximum is rounded before narrowing to uchar.
uchar thresh_uchar = cvFloor(thresh);
uchar max_val = cvRound(maxVal);
// Accept equal channel counts, or a 3-channel value stored in 4 channels.
CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
// Work-items per row: ceil((cols + offset % 16) / 16)
// (presumably one work-item per 16 elements - TODO confirm against the kernel).
size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16;
size_t bSizeX = 16, bSizeY = 16;
// Round the X dimension up to a multiple of the work-group width.
size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
size_t gSizeY = dst.rows;
size_t globalThreads[3] = {gSizeX, gSizeY, 1};
size_t localThreads[3] = {bSizeX, bSizeY, 1};
// Size in bytes of one element; indexed by `depth` further down.
static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
// Kernel arguments as (size, pointer) pairs, pushed in call order.
vector< pair<size_t, const void *> > args;
// NOTE(review): the next two statements are truncated in this view (the pointer
// expression after '&' is missing).
args.push_back( make_pair(sizeof(cl_mem), &;
args.push_back( make_pair(sizeof(cl_mem), &;
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
args.push_back( make_pair(sizeof(cl_uchar), (void *)&thresh_uchar));
args.push_back( make_pair(sizeof(cl_uchar), (void *)&max_val));
args.push_back( make_pair(sizeof(cl_int), (void *)&type));
// Launch "threshold", passing the source channel count and depth to the executor.
openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth());
// Bytes per element for this depth, and the total buffer size for `ocn` elements.
int elemSize1 = sizeMap[depth];
int bufSize = elemSize1 * ocn;
std::vector<uchar> _buf(bufSize);
uchar * buf = &_buf[0];
// Write the first `cn` elements from the scalar ...
scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
// ... and zero the trailing (ocn - cn) elements.
memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
return _buf;
// NOTE(review): statements from two functions (threshold_32f and threshold_runner)
// appear interleaved in this span and their braces are not visible. Each comment
// below describes only the statement(s) that directly follow it.
// Launcher that hands float threshold/max values to the "threshold" kernel.
static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
// Launcher that selects the element type and threshold mode through build options.
static void threshold_runner(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
// Narrow the double arguments to float.
float thresh_f = thresh;
float max_val = maxVal;
// Offsets and steps divided by 4
// (presumably bytes -> float elements - TODO confirm).
int dst_offset = (dst.offset >> 2);
int dst_step = (dst.step >> 2);
int src_offset = (src.offset >> 2);
int src_step = (src.step >> 2);
// True for depths below CV_32F; the threshold is floored in that case.
bool ival = src.depth() < CV_32F;
// Threshold and maximum packed as raw bytes using dst's depth and channel counts.
std::vector<uchar> thresholdValue = scalarToVector(cv::Scalar::all(ival ? cvFloor(thresh) : thresh), dst.depth(),
dst.oclchannels(), dst.channels());
std::vector<uchar> maxValue = scalarToVector(cv::Scalar::all(maxVal), dst.depth(), dst.oclchannels(), dst.channels());
// Work-items per row: ceil((cols + (dst_offset & 3)) / 4).
size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4;
size_t bSizeX = 16, bSizeY = 16;
// Round the X dimension up to a multiple of the work-group width.
size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
size_t gSizeY = dst.rows;
size_t globalThreads[3] = {gSizeX, gSizeY, 1};
size_t localThreads[3] = {bSizeX, bSizeY, 1};
// NOTE(review): localThreads/globalThreads are declared a second time here;
// presumably each pair belongs to a different one of the interleaved functions.
size_t localThreads[3] = { 16, 16, 1 };
size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
// NOTE(review): this initializer is cut off in this view.
const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC",
// Vector-type suffix indexed by channel count; 3 and 4 channels both map to "4".
const char * const channelMap[] = { "", "", "2", "4", "4" };
// Type name indexed by depth.
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
// Build options: define T as <type><suffix> plus one more macro.
// NOTE(review): the remaining format() arguments are cut off in this view.
std::string buildOptions = format("-D T=%s%s -D %s", typeMap[src.depth()], channelMap[src.channels()],
// Steps and offsets divided by the element size.
int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
// Kernel arguments as (size, pointer) pairs, pushed in call order.
vector< pair<size_t, const void *> > args;
// NOTE(review): the cl_mem arguments below are truncated in this view (the
// pointer expression after '&' is missing).
args.push_back( make_pair(sizeof(cl_mem), &;
args.push_back( make_pair(sizeof(cl_mem), &;
args.push_back( make_pair(sizeof(cl_mem), (void *)&;
args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
args.push_back( make_pair(sizeof(cl_mem), (void *)&;
args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
// NOTE(review): dst_step is pushed again here; presumably this statement and the
// earlier one belong to different interleaved functions - verify.
args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
args.push_back( make_pair(sizeof(cl_float), (void *)&thresh_f));
args.push_back( make_pair(sizeof(cl_float), (void *)&max_val));
args.push_back( make_pair(sizeof(cl_int), (void *)&type));
// Launch "threshold", passing the source channel count and depth to the executor.
openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth());
// Threshold and maximum are passed as raw bytes, sized by the packed vectors.
args.push_back( make_pair(thresholdValue.size(), (void *)&thresholdValue[0]));
args.push_back( make_pair(maxValue.size(), (void *)&maxValue[0]));
// Launch "threshold" with -1/-1 for channels/depth and the build options string.
openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args,
-1, -1, buildOptions.c_str());
// NOTE(review): two signatures and two sets of assertions/dispatch statements appear
// in this span (parameter named `type` vs `thresholdType`) and braces are not visible;
// presumably two versions of the same function are interleaved - verify.
// threshold: supports the 8UC1 and 32FC1 data types and five threshold types
double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
//TODO: These limitations shall be removed later.
// Only single-channel 8-bit unsigned and 32-bit float inputs pass this check.
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
// The threshold mode must be one of the five listed constants.
CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC
|| type == THRESH_TOZERO || type == THRESH_TOZERO_INV );
CV_Assert(thresholdType == THRESH_BINARY || thresholdType == THRESH_BINARY_INV || thresholdType == THRESH_TRUNC
|| thresholdType == THRESH_TOZERO || thresholdType == THRESH_TOZERO_INV);
// Dispatch table: index 0 -> threshold_8u, index 1 -> threshold_32f.
static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f};
// The destination gets the same size and type as the source.
dst.create( src.size(), src.type() );
// The comparison yields 0 or 1, which selects the caller from the table.
gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type);
dst.create(src.size(), src.type());
threshold_runner(src, dst, thresh, maxVal, thresholdType);
// The input threshold is returned unchanged.
return thresh;
@ -44,109 +44,37 @@
// When DOUBLE_SUPPORT is defined, enable a double-precision extension:
// cl_amd_fp64 if that macro is defined, otherwise cl_khr_fp64 if available.
// NOTE(review): the closing #endif lines are not visible in this view.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
// NOTE(review): three kernels (threshold_C1_D0, threshold_C1_D5 and the generic
// threshold) appear in this span with braces, `break`/`default` labels and some
// preprocessor lines not visible; statements of different kernels may be
// interleaved. Each comment below describes only the statement(s) that follow it.
// threshold type:
// uchar kernel: the threshold mode arrives as the runtime argument thresh_type.
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
uchar thresh, uchar max_val, int thresh_type
// Generic kernel over element type T (T is not defined in this span; presumably
// supplied as a build-time macro - TODO confirm).
__kernel void threshold(__global const T * restrict src, int src_offset, int src_step,
__global T * dst, int dst_offset, int dst_step,
int rows, int cols, T thresh, T max_val)
// Work-item coordinates: gx along dimension 0, gy along dimension 1.
int gx = get_global_id(0);
const int gy = get_global_id(1);
int gy = get_global_id(1);
// Low 4 bits of dst_offset; src_offset is moved back by the same amount and
// dstart is the first destination column of this work-item's 16-element group.
int offset = (dst_offset & 15);
src_offset -= offset;
int dstart = (gx << 4) - offset;
// Skip work-items that start beyond the last column or row.
if(dstart < dst_cols && gy < dst_rows)
if (gx < cols && gy < rows)
// Load 16 source elements for this work-item.
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
uchar16 ddata;
uchar16 zero = 0;
// Select the per-element result according to thresh_type.
switch (thresh_type)
// 0: max_val where sdata > thresh, else 0.
case 0:
ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0);
// 1: 0 where sdata > thresh, else max_val.
case 1:
ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val);
// 2: thresh where sdata > thresh, else sdata.
case 2:
ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata;
// 3: sdata where sdata > thresh, else 0.
case 3:
ddata = ((sdata > thresh)) ? sdata : zero;
// 4: 0 where sdata > thresh, else sdata.
case 4:
ddata = ((sdata > thresh)) ? zero : sdata;
// Pass-through assignment (presumably the default branch - its label is not visible).
ddata = sdata;
// Destination column index of each of the 16 lanes.
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
// Current destination contents, kept for lanes outside [0, dst_cols).
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
int16 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_uchar16(con != 0) ? ddata : dVal;
// Write the merged 16-element vector back.
if(dstart < dst_cols)
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
// float kernel: same scheme as the uchar kernel, with 4 elements per work-item.
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
float thresh, float max_val, int thresh_type
const int gx = get_global_id(0);
const int gy = get_global_id(1);
// Low 2 bits of dst_offset; dstart is the first destination column of this
// work-item's 4-element group.
int offset = (dst_offset & 3);
src_offset -= offset;
int dstart = (gx << 2) - offset;
if(dstart < dst_cols && gy < dst_rows)
// Load 4 source elements for this work-item.
float4 sdata = vload4(gx, src+src_offset+gy*src_step);
float4 ddata;
float4 zero = 0;
// Select the per-element result according to thresh_type (same five cases as above).
switch (thresh_type)
case 0:
ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f);
case 1:
ddata = sdata > thresh ? zero : (float4)max_val;
case 2:
ddata = sdata > thresh ? (float4)thresh : sdata;
case 3:
ddata = sdata > thresh ? sdata : (float4)(0.f);
case 4:
ddata = sdata > thresh ? (float4)(0.f) : sdata;
// Pass-through assignment (presumably the default branch - its label is not visible).
ddata = sdata;
// Destination column index of each of the 4 lanes.
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
// Current destination contents, kept for lanes outside [0, dst_cols).
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
if(dstart < dst_cols)
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
// Per-element path: index = gy * step + offset + gx for source and destination.
int src_index = mad24(gy, src_step, src_offset + gx);
int dst_index = mad24(gy, dst_step, dst_offset + gx);
T sdata = src[src_index], zero = (T)(0);
// Exactly one of the following stores is compiled, chosen by a THRESH_* macro.
// NOTE(review): the opening conditional before the first store and the closing
// #endif are not visible in this view.
dst[dst_index] = sdata > thresh ? max_val : zero;
#elif defined THRESH_BINARY_INV
dst[dst_index] = sdata > thresh ? zero : max_val;
#elif defined THRESH_TRUNC
dst[dst_index] = sdata > thresh ? thresh : sdata;
#elif defined THRESH_TOZERO
dst[dst_index] = sdata > thresh ? sdata : zero;
#elif defined THRESH_TOZERO_INV
dst[dst_index] = sdata > thresh ? zero : sdata;
@ -502,7 +502,9 @@ INSTANTIATE_TEST_CASE_P(Imgproc, Integral, Combine(
// Instantiates the Threshold test case under the Imgproc prefix over a set of
// matrix types.
// NOTE(review): two Values(...) type lists appear back to back and the Combine(
// call is not closed in this view; presumably the first list is a superseded
// version of the second - verify.
INSTANTIATE_TEST_CASE_P(Imgproc, Threshold, Combine(
Values(CV_8UC1, CV_32FC1),
// 1- to 4-channel variants of 8-bit unsigned, 16-bit signed and 32-bit float.
Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4,
CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
Reference in New Issue
Block a user