Merge pull request #2073 from KonstantinMatskevich:ocl_tapi_clahe
This commit is contained in:
		@@ -489,6 +489,7 @@ public:
 | 
			
		||||
    bool runTask(bool sync, const Queue& q=Queue());
 | 
			
		||||
 | 
			
		||||
    size_t workGroupSize() const;
 | 
			
		||||
    size_t preferedWorkGroupSizeMultiple() const;
 | 
			
		||||
    bool compileWorkGroupSize(size_t wsz[]) const;
 | 
			
		||||
    size_t localMemSize() const;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -2817,6 +2817,16 @@ size_t Kernel::workGroupSize() const
 | 
			
		||||
                                    sizeof(val), &val, &retsz) >= 0 ? val : 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
size_t Kernel::preferedWorkGroupSizeMultiple() const
 | 
			
		||||
{
 | 
			
		||||
    if(!p)
 | 
			
		||||
        return 0;
 | 
			
		||||
    size_t val = 0, retsz = 0;
 | 
			
		||||
    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
 | 
			
		||||
    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
 | 
			
		||||
                                    sizeof(val), &val, &retsz) >= 0 ? val : 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
bool Kernel::compileWorkGroupSize(size_t wsz[]) const
 | 
			
		||||
{
 | 
			
		||||
    if(!p || !wsz)
 | 
			
		||||
 
 | 
			
		||||
@@ -40,10 +40,90 @@
 | 
			
		||||
//M*/
 | 
			
		||||
 | 
			
		||||
#include "precomp.hpp"
 | 
			
		||||
#include "opencl_kernels.hpp"
 | 
			
		||||
 | 
			
		||||
// ----------------------------------------------------------------------
 | 
			
		||||
// CLAHE
 | 
			
		||||
 | 
			
		||||
namespace clahe
 | 
			
		||||
{
 | 
			
		||||
    static bool calcLut(cv::InputArray _src, cv::OutputArray _dst,
 | 
			
		||||
        const int tilesX, const int tilesY, const cv::Size tileSize,
 | 
			
		||||
        const int clipLimit, const float lutScale)
 | 
			
		||||
    {
 | 
			
		||||
        cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
 | 
			
		||||
 | 
			
		||||
        bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
 | 
			
		||||
        cv::String opts;
 | 
			
		||||
        if(is_cpu)
 | 
			
		||||
            opts = "-D CPU ";
 | 
			
		||||
        else
 | 
			
		||||
            opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
 | 
			
		||||
 | 
			
		||||
        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
 | 
			
		||||
        if(k.empty())
 | 
			
		||||
            return false;
 | 
			
		||||
 | 
			
		||||
        cv::UMat src = _src.getUMat();
 | 
			
		||||
        _dst.create(tilesX * tilesY, 256, CV_8UC1);
 | 
			
		||||
        cv::UMat dst = _dst.getUMat();
 | 
			
		||||
 | 
			
		||||
        int tile_size[2];
 | 
			
		||||
        tile_size[0] = tileSize.width;
 | 
			
		||||
        tile_size[1] = tileSize.height;
 | 
			
		||||
 | 
			
		||||
        size_t localThreads[3]  = { 32, 8, 1 };
 | 
			
		||||
        size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
 | 
			
		||||
 | 
			
		||||
        int idx = 0;
 | 
			
		||||
        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src));
 | 
			
		||||
        idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst));
 | 
			
		||||
        idx = k.set(idx, tile_size);
 | 
			
		||||
        idx = k.set(idx, tilesX);
 | 
			
		||||
        idx = k.set(idx, clipLimit);
 | 
			
		||||
        idx = k.set(idx, lutScale);
 | 
			
		||||
 | 
			
		||||
        if (!k.run(2, globalThreads, localThreads, false))
 | 
			
		||||
            return false;
 | 
			
		||||
        return true;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static bool transform(const cv::InputArray _src, cv::OutputArray _dst, const cv::InputArray _lut,
 | 
			
		||||
        const int tilesX, const int tilesY, const cv::Size & tileSize)
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        cv::ocl::Kernel k("transform", cv::ocl::imgproc::clahe_oclsrc);
 | 
			
		||||
        if(k.empty())
 | 
			
		||||
            return false;
 | 
			
		||||
 | 
			
		||||
        int tile_size[2];
 | 
			
		||||
        tile_size[0] = tileSize.width;
 | 
			
		||||
        tile_size[1] = tileSize.height;
 | 
			
		||||
 | 
			
		||||
        cv::UMat src = _src.getUMat();
 | 
			
		||||
        _dst.create(src.size(), src.type());
 | 
			
		||||
        cv::UMat dst = _dst.getUMat();
 | 
			
		||||
        cv::UMat lut = _lut.getUMat();
 | 
			
		||||
 | 
			
		||||
        size_t localThreads[3]  = { 32, 8, 1 };
 | 
			
		||||
        size_t globalThreads[3] = { src.cols, src.rows, 1 };
 | 
			
		||||
 | 
			
		||||
        int idx = 0;
 | 
			
		||||
        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src));
 | 
			
		||||
        idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst));
 | 
			
		||||
        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(lut));
 | 
			
		||||
        idx = k.set(idx, src.cols);
 | 
			
		||||
        idx = k.set(idx, src.rows);
 | 
			
		||||
        idx = k.set(idx, tile_size);
 | 
			
		||||
        idx = k.set(idx, tilesX);
 | 
			
		||||
        idx = k.set(idx, tilesY);
 | 
			
		||||
 | 
			
		||||
        if (!k.run(2, globalThreads, localThreads, false))
 | 
			
		||||
            return false;
 | 
			
		||||
        return true;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
namespace
 | 
			
		||||
{
 | 
			
		||||
    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
 | 
			
		||||
@@ -241,7 +321,9 @@ namespace
 | 
			
		||||
        int tilesY_;
 | 
			
		||||
 | 
			
		||||
        cv::Mat srcExt_;
 | 
			
		||||
        cv::UMat usrcExt_;
 | 
			
		||||
        cv::Mat lut_;
 | 
			
		||||
        cv::UMat ulut_;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
 | 
			
		||||
@@ -256,31 +338,34 @@ namespace
 | 
			
		||||
 | 
			
		||||
    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
 | 
			
		||||
    {
 | 
			
		||||
        cv::Mat src = _src.getMat();
 | 
			
		||||
        CV_Assert( _src.type() == CV_8UC1 );
 | 
			
		||||
 | 
			
		||||
        CV_Assert( src.type() == CV_8UC1 );
 | 
			
		||||
 | 
			
		||||
        _dst.create( src.size(), src.type() );
 | 
			
		||||
        cv::Mat dst = _dst.getMat();
 | 
			
		||||
        bool useOpenCL = cv::ocl::useOpenCL() && _src.isUMat() && _src.dims()<=2;
 | 
			
		||||
 | 
			
		||||
        const int histSize = 256;
 | 
			
		||||
 | 
			
		||||
        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
 | 
			
		||||
 | 
			
		||||
        cv::Size tileSize;
 | 
			
		||||
        cv::Mat srcForLut;
 | 
			
		||||
        cv::_InputArray _srcForLut;
 | 
			
		||||
 | 
			
		||||
        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
 | 
			
		||||
        if (_src.size().width % tilesX_ == 0 && _src.size().height % tilesY_ == 0)
 | 
			
		||||
        {
 | 
			
		||||
            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
 | 
			
		||||
            srcForLut = src;
 | 
			
		||||
            tileSize = cv::Size(_src.size().width / tilesX_, _src.size().height / tilesY_);
 | 
			
		||||
            _srcForLut = _src;
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
 | 
			
		||||
 | 
			
		||||
            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
 | 
			
		||||
            srcForLut = srcExt_;
 | 
			
		||||
            if(useOpenCL)
 | 
			
		||||
            {
 | 
			
		||||
                cv::copyMakeBorder(_src, usrcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
 | 
			
		||||
                tileSize = cv::Size(usrcExt_.size().width / tilesX_, usrcExt_.size().height / tilesY_);
 | 
			
		||||
                _srcForLut = usrcExt_;
 | 
			
		||||
            }
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
                cv::copyMakeBorder(_src, srcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
 | 
			
		||||
                tileSize = cv::Size(srcExt_.size().width / tilesX_, srcExt_.size().height / tilesY_);
 | 
			
		||||
                _srcForLut = srcExt_;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        const int tileSizeTotal = tileSize.area();
 | 
			
		||||
@@ -293,6 +378,16 @@ namespace
 | 
			
		||||
            clipLimit = std::max(clipLimit, 1);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(useOpenCL && clahe::calcLut(_srcForLut, ulut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale) )
 | 
			
		||||
            if( clahe::transform(_src, _dst, ulut_, tilesX_, tilesY_, tileSize) )
 | 
			
		||||
                return;
 | 
			
		||||
 | 
			
		||||
        cv::Mat src = _src.getMat();
 | 
			
		||||
        _dst.create( src.size(), src.type() );
 | 
			
		||||
        cv::Mat dst = _dst.getMat();
 | 
			
		||||
        cv::Mat srcForLut = _srcForLut.getMat();
 | 
			
		||||
        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
 | 
			
		||||
 | 
			
		||||
        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
 | 
			
		||||
        cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
 | 
			
		||||
 | 
			
		||||
@@ -325,6 +420,8 @@ namespace
 | 
			
		||||
    {
 | 
			
		||||
        srcExt_.release();
 | 
			
		||||
        lut_.release();
 | 
			
		||||
        usrcExt_.release();
 | 
			
		||||
        ulut_.release();
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										252
									
								
								modules/imgproc/src/opencl/clahe.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										252
									
								
								modules/imgproc/src/opencl/clahe.cl
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,252 @@
 | 
			
		||||
/*M///////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
//
 | 
			
		||||
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 | 
			
		||||
//
 | 
			
		||||
//  By downloading, copying, installing or using the software you agree to this license.
 | 
			
		||||
//  If you do not agree to this license, do not download, install,
 | 
			
		||||
//  copy or use the software.
 | 
			
		||||
//
 | 
			
		||||
//
 | 
			
		||||
//                           License Agreement
 | 
			
		||||
//                For Open Source Computer Vision Library
 | 
			
		||||
//
 | 
			
		||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 | 
			
		||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 | 
			
		||||
// Third party copyrights are property of their respective owners.
 | 
			
		||||
//
 | 
			
		||||
// @Authors
 | 
			
		||||
//    Sen Liu, swjtuls1987@126.com
 | 
			
		||||
//
 | 
			
		||||
// Redistribution and use in source and binary forms, with or without modification,
 | 
			
		||||
// are permitted provided that the following conditions are met:
 | 
			
		||||
//
 | 
			
		||||
//   * Redistribution's of source code must retain the above copyright notice,
 | 
			
		||||
//     this list of conditions and the following disclaimer.
 | 
			
		||||
//
 | 
			
		||||
//   * Redistribution's in binary form must reproduce the above copyright notice,
 | 
			
		||||
//     this list of conditions and the following disclaimer in the documentation
 | 
			
		||||
//     and/or other materials provided with the distribution.
 | 
			
		||||
//
 | 
			
		||||
//   * The name of the copyright holders may not be used to endorse or promote products
 | 
			
		||||
//     derived from this software without specific prior written permission.
 | 
			
		||||
//
 | 
			
		||||
// This software is provided by the copyright holders and contributors as is and
 | 
			
		||||
// any express or implied warranties, including, but not limited to, the implied
 | 
			
		||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
 | 
			
		||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
 | 
			
		||||
// indirect, incidental, special, exemplary, or consequential damages
 | 
			
		||||
// (including, but not limited to, procurement of substitute goods or services;
 | 
			
		||||
// loss of use, data, or profits; or business interruption) however caused
 | 
			
		||||
// and on any theory of liability, whether in contract, strict liability,
 | 
			
		||||
// or tort (including negligence or otherwise) arising in any way out of
 | 
			
		||||
// the use of this software, even if advised of the possibility of such damage.
 | 
			
		||||
//
 | 
			
		||||
//M*/
 | 
			
		||||
 | 
			
		||||
#ifndef WAVE_SIZE
 | 
			
		||||
#define WAVE_SIZE 1
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
inline int calc_lut(__local int* smem, int val, int tid)
 | 
			
		||||
{
 | 
			
		||||
    smem[tid] = val;
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid == 0)
 | 
			
		||||
        for (int i = 1; i < 256; ++i)
 | 
			
		||||
            smem[i] += smem[i - 1];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    return smem[tid];
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef CPU
 | 
			
		||||
inline void reduce(volatile __local int* smem, int val, int tid)
 | 
			
		||||
{
 | 
			
		||||
    smem[tid] = val;
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 128)
 | 
			
		||||
        smem[tid] = val += smem[tid + 128];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 64)
 | 
			
		||||
        smem[tid] = val += smem[tid + 64];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 32)
 | 
			
		||||
        smem[tid] += smem[tid + 32];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 16)
 | 
			
		||||
        smem[tid] += smem[tid + 16];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 8)
 | 
			
		||||
        smem[tid] += smem[tid + 8];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 4)
 | 
			
		||||
        smem[tid] += smem[tid + 4];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 2)
 | 
			
		||||
        smem[tid] += smem[tid + 2];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 1)
 | 
			
		||||
        smem[256] = smem[tid] + smem[tid + 1];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
inline void reduce(__local volatile int* smem, int val, int tid)
 | 
			
		||||
{
 | 
			
		||||
    smem[tid] = val;
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 128)
 | 
			
		||||
        smem[tid] = val += smem[tid + 128];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 64)
 | 
			
		||||
        smem[tid] = val += smem[tid + 64];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 32)
 | 
			
		||||
    {
 | 
			
		||||
        smem[tid] += smem[tid + 32];
 | 
			
		||||
#if WAVE_SIZE < 32
 | 
			
		||||
    } barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 16)
 | 
			
		||||
    {
 | 
			
		||||
#endif
 | 
			
		||||
        smem[tid] += smem[tid + 16];
 | 
			
		||||
#if WAVE_SIZE < 16
 | 
			
		||||
    }
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (tid < 8)
 | 
			
		||||
    {
 | 
			
		||||
#endif
 | 
			
		||||
        smem[tid] += smem[tid + 8];
 | 
			
		||||
        smem[tid] += smem[tid + 4];
 | 
			
		||||
        smem[tid] += smem[tid + 2];
 | 
			
		||||
        smem[tid] += smem[tid + 1];
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
__kernel void calcLut(__global __const uchar * src, const int srcStep,
 | 
			
		||||
                      const int src_offset, __global uchar * lut,
 | 
			
		||||
                      const int dstStep, const int dst_offset,
 | 
			
		||||
                      const int2 tileSize, const int tilesX,
 | 
			
		||||
                      const int clipLimit, const float lutScale)
 | 
			
		||||
{
 | 
			
		||||
    __local int smem[512];
 | 
			
		||||
 | 
			
		||||
    int tx = get_group_id(0);
 | 
			
		||||
    int ty = get_group_id(1);
 | 
			
		||||
    int tid = get_local_id(1) * get_local_size(0)
 | 
			
		||||
                             + get_local_id(0);
 | 
			
		||||
    smem[tid] = 0;
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
 | 
			
		||||
    {
 | 
			
		||||
        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
 | 
			
		||||
        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
 | 
			
		||||
        {
 | 
			
		||||
            const int data = srcPtr[j];
 | 
			
		||||
            atomic_inc(&smem[data]);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    int tHistVal = smem[tid];
 | 
			
		||||
    barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
    if (clipLimit > 0)
 | 
			
		||||
    {
 | 
			
		||||
        // clip histogram bar
 | 
			
		||||
        int clipped = 0;
 | 
			
		||||
        if (tHistVal > clipLimit)
 | 
			
		||||
        {
 | 
			
		||||
            clipped = tHistVal - clipLimit;
 | 
			
		||||
            tHistVal = clipLimit;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // find number of overall clipped samples
 | 
			
		||||
        reduce(smem, clipped, tid);
 | 
			
		||||
        barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
#ifdef CPU
 | 
			
		||||
        clipped = smem[256];
 | 
			
		||||
#else
 | 
			
		||||
        clipped = smem[0];
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        // broadcast evaluated value
 | 
			
		||||
 | 
			
		||||
        __local int totalClipped;
 | 
			
		||||
 | 
			
		||||
        if (tid == 0)
 | 
			
		||||
            totalClipped = clipped;
 | 
			
		||||
        barrier(CLK_LOCAL_MEM_FENCE);
 | 
			
		||||
 | 
			
		||||
        // redistribute clipped samples evenly
 | 
			
		||||
 | 
			
		||||
        int redistBatch = totalClipped / 256;
 | 
			
		||||
        tHistVal += redistBatch;
 | 
			
		||||
 | 
			
		||||
        int residual = totalClipped - redistBatch * 256;
 | 
			
		||||
        if (tid < residual)
 | 
			
		||||
            ++tHistVal;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int lutVal = calc_lut(smem, tHistVal, tid);
 | 
			
		||||
    uint ires = (uint)convert_int_rte(lutScale * lutVal);
 | 
			
		||||
    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
 | 
			
		||||
        convert_uchar(clamp(ires, (uint)0, (uint)255));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
__kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset,
 | 
			
		||||
                        __global uchar * dst, const int dstStep, const int dst_offset,
 | 
			
		||||
                        __global uchar * lut, const int lutStep, int lut_offset,
 | 
			
		||||
                        const int cols, const int rows,
 | 
			
		||||
                        const int2 tileSize,
 | 
			
		||||
                        const int tilesX, const int tilesY)
 | 
			
		||||
{
 | 
			
		||||
    const int x = get_global_id(0);
 | 
			
		||||
    const int y = get_global_id(1);
 | 
			
		||||
 | 
			
		||||
    if (x >= cols || y >= rows)
 | 
			
		||||
        return;
 | 
			
		||||
 | 
			
		||||
    const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
 | 
			
		||||
    int ty1 = convert_int_rtn(tyf);
 | 
			
		||||
    int ty2 = ty1 + 1;
 | 
			
		||||
    const float ya = tyf - ty1;
 | 
			
		||||
    ty1 = max(ty1, 0);
 | 
			
		||||
    ty2 = min(ty2, tilesY - 1);
 | 
			
		||||
 | 
			
		||||
    const float txf = (convert_float(x) / tileSize.x) - 0.5f;
 | 
			
		||||
    int tx1 = convert_int_rtn(txf);
 | 
			
		||||
    int tx2 = tx1 + 1;
 | 
			
		||||
    const float xa = txf - tx1;
 | 
			
		||||
    tx1 = max(tx1, 0);
 | 
			
		||||
    tx2 = min(tx2, tilesX - 1);
 | 
			
		||||
 | 
			
		||||
    const int srcVal = src[mad24(y, srcStep, x + src_offset)];
 | 
			
		||||
 | 
			
		||||
    float res = 0;
 | 
			
		||||
 | 
			
		||||
    res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
 | 
			
		||||
    res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
 | 
			
		||||
    res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
 | 
			
		||||
    res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
 | 
			
		||||
 | 
			
		||||
    uint ires = (uint)convert_int_rte(res);
 | 
			
		||||
    dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user