From 60f9ba0c64cb10abb1c1bfc517e8319c6a2e618a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov <ilya.lavrenov@itseez.com> Date: Fri, 18 Oct 2013 16:31:52 +0400 Subject: [PATCH] added ROI support to ocl::CLAHE --- modules/ocl/src/imgproc.cpp | 12 +++-- modules/ocl/src/opencl/imgproc_clahe.cl | 64 +++++++++---------------- 2 files changed, 31 insertions(+), 45 deletions(-) diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 916d25ca9..98ec14f5f 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -1314,6 +1314,8 @@ namespace cv args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit )); args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset )); String kernelName = "calcLut"; size_t localThreads[3] = { 32, 8, 1 }; @@ -1333,7 +1335,7 @@ namespace cv } static void transform(const oclMat &src, oclMat &dst, const oclMat &lut, - const int tilesX, const int tilesY, const cv::Size tileSize) + const int tilesX, const int tilesY, const Size & tileSize) { cl_int2 tile_size; tile_size.s[0] = tileSize.width; @@ -1351,6 +1353,9 @@ namespace cv args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX )); args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.offset )); size_t localThreads[3] = { 32, 8, 1 }; size_t globalThreads[3] = { src.cols, src.rows, 1 }; @@ -1419,9 +1424,10 @@ namespace cv } else { - cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar()); + ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, + tilesX_ - (src.cols % tilesX_), BORDER_REFLECT_101, Scalar::all(0)); - tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_); + tileSize = Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_); srcForLut = srcExt_; } diff --git a/modules/ocl/src/opencl/imgproc_clahe.cl b/modules/ocl/src/opencl/imgproc_clahe.cl index 49c709692..55692ae3b 100644 --- a/modules/ocl/src/opencl/imgproc_clahe.cl +++ b/modules/ocl/src/opencl/imgproc_clahe.cl @@ -53,12 +53,8 @@ int calc_lut(__local int* smem, int val, int tid) barrier(CLK_LOCAL_MEM_FENCE); if (tid == 0) - { for (int i = 1; i < 256; ++i) - { smem[i] += smem[i - 1]; - } - } barrier(CLK_LOCAL_MEM_FENCE); return smem[tid]; @@ -71,69 +67,51 @@ void reduce(volatile __local int* smem, int val, int tid) barrier(CLK_LOCAL_MEM_FENCE); if (tid < 128) - { smem[tid] = val += smem[tid + 128]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 64) - { smem[tid] = val += smem[tid + 64]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 32) - { smem[tid] += smem[tid + 32]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) - { smem[tid] += smem[tid + 16]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) - { smem[tid] += smem[tid + 8]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 4) - { smem[tid] += smem[tid + 4]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 2) - { smem[tid] += smem[tid + 2]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 1) - { smem[256] = smem[tid] + smem[tid + 1]; - } barrier(CLK_LOCAL_MEM_FENCE); } + #else + void reduce(__local volatile int* smem, int val, int tid) { smem[tid] = val; barrier(CLK_LOCAL_MEM_FENCE); if (tid < 128) - { smem[tid] = val += smem[tid + 128]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 64) - { smem[tid] = val += smem[tid + 64]; - } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 32) @@ -141,12 +119,17 @@ void reduce(__local volatile int* smem, int val, int tid) smem[tid] += smem[tid + 32]; #if WAVE_SIZE < 32 } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { + + if (tid < 16) + { #endif smem[tid] += smem[tid + 16]; #if WAVE_SIZE < 16 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) { + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { #endif smem[tid] += smem[tid + 8]; smem[tid] += smem[tid + 4]; @@ -159,7 +142,8 @@ void reduce(__local volatile int* smem, int val, int tid) __kernel void calcLut(__global __const uchar * src, __global uchar * lut, const int srcStep, const int dstStep, const int2 tileSize, const int tilesX, - const int clipLimit, const float lutScale) + const int clipLimit, const float lutScale, + const int src_offset, const int dst_offset) { __local int smem[512]; @@ -173,25 +157,21 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) { - __global const uchar* srcPtr = src + mad24( ty * tileSize.y + i, - srcStep, tx * tileSize.x ); + __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset); for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) { const int data = srcPtr[j]; atomic_inc(&smem[data]); } } - barrier(CLK_LOCAL_MEM_FENCE); int tHistVal = smem[tid]; - barrier(CLK_LOCAL_MEM_FENCE); if (clipLimit > 0) { // clip histogram bar - int clipped = 0; if (tHistVal > clipLimit) { @@ -200,7 +180,6 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, } // find number of overall clipped samples - reduce(smem, clipped, tid); barrier(CLK_LOCAL_MEM_FENCE); #ifdef CPU @@ -229,7 +208,7 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut, const int lutVal = calc_lut(smem, tHistVal, tid); uint ires = (uint)convert_int_rte(lutScale * lutVal); - lut[(ty * tilesX + tx) * dstStep + tid] = + lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] = convert_uchar(clamp(ires, (uint)0, (uint)255)); } @@ -239,7 +218,8 @@ __kernel void transform(__global __const uchar * src, const int srcStep, const int dstStep, const int lutStep, const int cols, const int rows, const int2 tileSize, - const int tilesX, const int tilesY) + const int tilesX, const int tilesY, + const int src_offset, const int dst_offset, int lut_offset) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -261,15 +241,15 @@ __kernel void transform(__global __const uchar * src, tx1 = max(tx1, 0); tx2 = min(tx2, tilesX - 1); - const int srcVal = src[mad24(y, srcStep, x)]; + const int srcVal = src[mad24(y, srcStep, x + src_offset)]; float res = 0; - res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya)); - res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya)); - res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya)); - res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya)); + res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya)); + res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya)); + res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya)); + res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya)); uint ires = (uint)convert_int_rte(res); - dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); + dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); }