diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp index 5c89988b8..c57950ff1 100644 --- a/modules/ocl/perf/perf_imgproc.cpp +++ b/modules/ocl/perf/perf_imgproc.cpp @@ -366,21 +366,23 @@ PERF_TEST_P(resizeFixture, resize, ///////////// threshold//////////////////////// -CV_ENUM(ThreshType, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV) +CV_ENUM(ThreshType, THRESH_BINARY, THRESH_TOZERO_INV) -typedef tuple ThreshParams; +typedef tuple ThreshParams; typedef TestBaseWithParam ThreshFixture; PERF_TEST_P(ThreshFixture, threshold, ::testing::Combine(OCL_TYPICAL_MAT_SIZES, + OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC4, CV_32FC1), ThreshType::all())) { const ThreshParams params = GetParam(); const Size srcSize = get<0>(params); - const int threshType = get<1>(params); + const int srcType = get<1>(params); + const int threshType = get<2>(params); const double maxValue = 220.0, threshold = 50; - Mat src(srcSize, CV_8U), dst(srcSize, CV_8U); + Mat src(srcSize, srcType), dst(srcSize, srcType); randu(src, 0, 100); declare.in(src).out(dst); diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 88c2ca873..3539dfaf1 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -118,22 +118,20 @@ namespace cv static void threshold_runner(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType) { bool ival = src.depth() < CV_32F; + int cn = src.channels(), vecSize = 4, depth = src.depth(); std::vector thresholdValue = scalarToVector(cv::Scalar::all(ival ? cvFloor(thresh) : thresh), dst.depth(), dst.oclchannels(), dst.channels()); std::vector maxValue = scalarToVector(cv::Scalar::all(maxVal), dst.depth(), dst.oclchannels(), dst.channels()); - size_t localThreads[3] = { 16, 16, 1 }; - size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; - const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC", "THRESH_TOZERO", "THRESH_TOZERO_INV" }; const char * const channelMap[] = { "", "", "2", "4", "4" }; const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; - std::string buildOptions = format("-D T=%s%s -D %s", typeMap[src.depth()], channelMap[src.channels()], - thresholdMap[thresholdType]); + std::string buildOptions = format("-D T=%s%s -D %s", typeMap[depth], channelMap[cn], thresholdMap[thresholdType]); - int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize(); - int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize(); + int elemSize = src.elemSize(); + int src_step = src.step / elemSize, src_offset = src.offset / elemSize; + int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize; vector< pair > args; args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); @@ -142,11 +140,32 @@ namespace cv args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair(thresholdValue.size(), (void *)&thresholdValue[0])); args.push_back( make_pair(maxValue.size(), (void *)&maxValue[0])); + int max_index = dst.cols, cols = dst.cols; + if (cn == 1 && vecSize > 1) + { + CV_Assert(((vecSize - 1) & vecSize) == 0 && vecSize <= 16); + cols = divUp(cols, vecSize); + buildOptions += format(" -D VECTORIZED -D VT=%s%d -D VLOADN=vload%d -D VECSIZE=%d -D VSTOREN=vstore%d", + typeMap[depth], vecSize, vecSize, vecSize, vecSize); + + int vecSizeBytes = vecSize * dst.elemSize1(); + if ((dst.offset % dst.step) % vecSizeBytes == 0 && dst.step % vecSizeBytes == 0) + buildOptions += " -D DST_ALIGNED"; + if ((src.offset % src.step) % vecSizeBytes == 0 && src.step % vecSizeBytes == 0) + buildOptions += " -D SRC_ALIGNED"; + + args.push_back( make_pair(sizeof(cl_int), (void *)&max_index)); + } + + args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); + args.push_back( make_pair(sizeof(cl_int), (void *)&cols)); + + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; + openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, -1, -1, buildOptions.c_str()); } diff --git a/modules/ocl/src/opencl/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl index 81f2a7400..6b847c83f 100644 --- a/modules/ocl/src/opencl/imgproc_threshold.cl +++ b/modules/ocl/src/opencl/imgproc_threshold.cl @@ -51,9 +51,63 @@ #endif #endif +#ifdef VECTORIZED + __kernel void threshold(__global const T * restrict src, int src_offset, int src_step, __global T * dst, int dst_offset, int dst_step, - int rows, int cols, T thresh, T max_val) + T thresh, T max_val, int max_index, int rows, int cols) +{ + int gx = get_global_id(0); + int gy = get_global_id(1); + + if (gx < cols && gy < rows) + { + gx *= VECSIZE; + int src_index = mad24(gy, src_step, src_offset + gx); + int dst_index = mad24(gy, dst_step, dst_offset + gx); + +#ifdef SRC_ALIGNED + VT sdata = *((__global VT *)(src + src_index)); +#else + VT sdata = VLOADN(0, src + src_index); +#endif + VT vthresh = (VT)(thresh), zero = (VT)(0); + +#ifdef THRESH_BINARY + VT vecValue = sdata > vthresh ? max_val : zero; +#elif defined THRESH_BINARY_INV + VT vecValue = sdata > vthresh ? zero : max_val; +#elif defined THRESH_TRUNC + VT vecValue = sdata > vthresh ? thresh : sdata; +#elif defined THRESH_TOZERO + VT vecValue = sdata > vthresh ? sdata : zero; +#elif defined THRESH_TOZERO_INV + VT vecValue = sdata > vthresh ? zero : sdata; +#endif + + if (gx + VECSIZE <= max_index) +#ifdef DST_ALIGNED + *(__global VT*)(dst + dst_index) = vecValue; +#else + VSTOREN(vecValue, 0, dst + dst_index); +#endif + else + { + T array[VECSIZE]; + VSTOREN(vecValue, 0, array); + #pragma unroll + for (int i = 0; i < VECSIZE; ++i) + if (gx + i < max_index) + dst[dst_index + i] = array[i]; + } + } +} + +#else + +__kernel void threshold(__global const T * restrict src, int src_offset, int src_step, + __global T * dst, int dst_offset, int dst_step, + T thresh, T max_val, int rows, int cols) { int gx = get_global_id(0); int gy = get_global_id(1); @@ -78,3 +132,5 @@ __kernel void threshold(__global const T * restrict src, int src_offset, int src #endif } } + +#endif