diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 311276679..e3805bcdc 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -489,6 +489,7 @@ public: bool runTask(bool sync, const Queue& q=Queue()); size_t workGroupSize() const; + size_t preferedWorkGroupSizeMultiple() const; bool compileWorkGroupSize(size_t wsz[]) const; size_t localMemSize() const; diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 9b7564250..cf3b1dcab 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -2817,6 +2817,16 @@ size_t Kernel::workGroupSize() const sizeof(val), &val, &retsz) >= 0 ? val : 0; } +size_t Kernel::preferedWorkGroupSizeMultiple() const +{ + if(!p) + return 0; + size_t val = 0, retsz = 0; + cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); + return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, + sizeof(val), &val, &retsz) >= 0 ? val : 0; +} + bool Kernel::compileWorkGroupSize(size_t wsz[]) const { if(!p || !wsz) diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp index 89fb62bd0..079e635f9 100644 --- a/modules/imgproc/src/clahe.cpp +++ b/modules/imgproc/src/clahe.cpp @@ -40,10 +40,90 @@ //M*/ #include "precomp.hpp" +#include "opencl_kernels.hpp" // ---------------------------------------------------------------------- // CLAHE +namespace clahe +{ + static bool calcLut(cv::InputArray _src, cv::OutputArray _dst, + const int tilesX, const int tilesY, const cv::Size tileSize, + const int clipLimit, const float lutScale) + { + cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc); + + bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; + cv::String opts; + if(is_cpu) + opts = "-D CPU "; + else + opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple()); + + cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts); + if(k.empty()) + return false; + + cv::UMat src = _src.getUMat(); + _dst.create(tilesX * tilesY, 256, CV_8UC1); + cv::UMat dst = _dst.getUMat(); + + int tile_size[2]; + tile_size[0] = tileSize.width; + tile_size[1] = tileSize.height; + + size_t localThreads[3] = { 32, 8, 1 }; + size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 }; + + int idx = 0; + idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src)); + idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst)); + idx = k.set(idx, tile_size); + idx = k.set(idx, tilesX); + idx = k.set(idx, clipLimit); + idx = k.set(idx, lutScale); + + if (!k.run(2, globalThreads, localThreads, false)) + return false; + return true; + } + + static bool transform(const cv::InputArray _src, cv::OutputArray _dst, const cv::InputArray _lut, + const int tilesX, const int tilesY, const cv::Size & tileSize) + { + + cv::ocl::Kernel k("transform", cv::ocl::imgproc::clahe_oclsrc); + if(k.empty()) + return false; + + int tile_size[2]; + tile_size[0] = tileSize.width; + tile_size[1] = tileSize.height; + + cv::UMat src = _src.getUMat(); + _dst.create(src.size(), src.type()); + cv::UMat dst = _dst.getUMat(); + cv::UMat lut = _lut.getUMat(); + + size_t localThreads[3] = { 32, 8, 1 }; + size_t globalThreads[3] = { src.cols, src.rows, 1 }; + + int idx = 0; + idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src)); + idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst)); + idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(lut)); + idx = k.set(idx, src.cols); + idx = k.set(idx, src.rows); + idx = k.set(idx, tile_size); + idx = k.set(idx, tilesX); + idx = k.set(idx, tilesY); + + if (!k.run(2, globalThreads, localThreads, false)) + return false; + return true; + } +} + namespace { class CLAHE_CalcLut_Body : public cv::ParallelLoopBody @@ -241,7 +321,9 @@ namespace int tilesY_; cv::Mat srcExt_; + cv::UMat usrcExt_; cv::Mat lut_; + cv::UMat ulut_; }; CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) : @@ -256,31 +338,34 @@ namespace void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst) { - cv::Mat src = _src.getMat(); + CV_Assert( _src.type() == CV_8UC1 ); - CV_Assert( src.type() == CV_8UC1 ); - - _dst.create( src.size(), src.type() ); - cv::Mat dst = _dst.getMat(); + bool useOpenCL = cv::ocl::useOpenCL() && _src.isUMat() && _src.dims()<=2; const int histSize = 256; - lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1); - cv::Size tileSize; - cv::Mat srcForLut; + cv::_InputArray _srcForLut; - if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0) + if (_src.size().width % tilesX_ == 0 && _src.size().height % tilesY_ == 0) { - tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_); - srcForLut = src; + tileSize = cv::Size(_src.size().width / tilesX_, _src.size().height / tilesY_); + _srcForLut = _src; } else { - cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101); - - tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_); - srcForLut = srcExt_; + if(useOpenCL) + { + cv::copyMakeBorder(_src, usrcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101); + tileSize = cv::Size(usrcExt_.size().width / tilesX_, usrcExt_.size().height / tilesY_); + _srcForLut = usrcExt_; + } + else + { + cv::copyMakeBorder(_src, srcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101); + tileSize = cv::Size(srcExt_.size().width / tilesX_, srcExt_.size().height / tilesY_); + _srcForLut = srcExt_; + } } const int tileSizeTotal = tileSize.area(); @@ -293,6 +378,16 @@ namespace clipLimit = std::max(clipLimit, 1); } + if(useOpenCL && clahe::calcLut(_srcForLut, ulut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale) ) + if( clahe::transform(_src, _dst, ulut_, tilesX_, tilesY_, tileSize) ) + return; + + cv::Mat src = _src.getMat(); + _dst.create( src.size(), src.type() ); + cv::Mat dst = _dst.getMat(); + cv::Mat srcForLut = _srcForLut.getMat(); + lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1); + CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale); cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody); @@ -325,6 +420,8 @@ namespace { srcExt_.release(); lut_.release(); + usrcExt_.release(); + ulut_.release(); } } diff --git a/modules/imgproc/src/opencl/clahe.cl b/modules/imgproc/src/opencl/clahe.cl new file mode 100644 index 000000000..9f88b20bf --- /dev/null +++ b/modules/imgproc/src/opencl/clahe.cl @@ -0,0 +1,252 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Sen Liu, swjtuls1987@126.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif + +inline int calc_lut(__local int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid == 0) + for (int i = 1; i < 256; ++i) + smem[i] += smem[i - 1]; + barrier(CLK_LOCAL_MEM_FENCE); + + return smem[tid]; +} + +#ifdef CPU +inline void reduce(volatile __local int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + smem[tid] = val += smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + smem[tid] = val += smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + smem[tid] += smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + smem[tid] += smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + smem[tid] += smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + smem[tid] += smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + smem[tid] += smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + smem[256] = smem[tid] + smem[tid + 1]; + barrier(CLK_LOCAL_MEM_FENCE); +} + +#else + +inline void reduce(__local volatile int* smem, int val, int tid) +{ + smem[tid] = val; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + smem[tid] = val += smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + smem[tid] = val += smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + smem[tid] += smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { +#endif + smem[tid] += smem[tid + 16]; +#if WAVE_SIZE < 16 + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { +#endif + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } +} +#endif + +__kernel void calcLut(__global __const uchar * src, const int srcStep, + const int src_offset, __global uchar * lut, + const int dstStep, const int dst_offset, + const int2 tileSize, const int tilesX, + const int clipLimit, const float lutScale) +{ + __local int smem[512]; + + int tx = get_group_id(0); + int ty = get_group_id(1); + int tid = get_local_id(1) * get_local_size(0) + + get_local_id(0); + smem[tid] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) + { + __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset); + for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) + { + const int data = srcPtr[j]; + atomic_inc(&smem[data]); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + int tHistVal = smem[tid]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (clipLimit > 0) + { + // clip histogram bar + int clipped = 0; + if (tHistVal > clipLimit) + { + clipped = tHistVal - clipLimit; + tHistVal = clipLimit; + } + + // find number of overall clipped samples + reduce(smem, clipped, tid); + barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + clipped = smem[256]; +#else + clipped = smem[0]; +#endif + + // broadcast evaluated value + + __local int totalClipped; + + if (tid == 0) + totalClipped = clipped; + barrier(CLK_LOCAL_MEM_FENCE); + + // redistribute clipped samples evenly + + int redistBatch = totalClipped / 256; + tHistVal += redistBatch; + + int residual = totalClipped - redistBatch * 256; + if (tid < residual) + ++tHistVal; + } + + const int lutVal = calc_lut(smem, tHistVal, tid); + uint ires = (uint)convert_int_rte(lutScale * lutVal); + lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] = + convert_uchar(clamp(ires, (uint)0, (uint)255)); +} + +__kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset, + __global uchar * dst, const int dstStep, const int dst_offset, + __global uchar * lut, const int lutStep, int lut_offset, + const int cols, const int rows, + const int2 tileSize, + const int tilesX, const int tilesY) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= cols || y >= rows) + return; + + const float tyf = (convert_float(y) / tileSize.y) - 0.5f; + int ty1 = convert_int_rtn(tyf); + int ty2 = ty1 + 1; + const float ya = tyf - ty1; + ty1 = max(ty1, 0); + ty2 = min(ty2, tilesY - 1); + + const float txf = (convert_float(x) / tileSize.x) - 0.5f; + int tx1 = convert_int_rtn(txf); + int tx2 = tx1 + 1; + const float xa = txf - tx1; + tx1 = max(tx1, 0); + tx2 = min(tx2, tilesX - 1); + + const int srcVal = src[mad24(y, srcStep, x + src_offset)]; + + float res = 0; + + res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya)); + res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya)); + res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya)); + res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya)); + + uint ires = (uint)convert_int_rte(res); + dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); +}