Optimizations to OpenCL bilateral filter.

2014-03-24 13:35:56 -07:00
parent a2dec6c34e
commit ede6d4482b
3 changed files with 104 additions and 27 deletions
--- a/modules/imgproc/src/opencl/bilateral.cl
+++ b/modules/imgproc/src/opencl/bilateral.cl
@@ -32,28 +32,79 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.

+
+
 __kernel void bilateral(__global const uchar * src, int src_step, int src_offset,
                        __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                        __constant float * color_weight, __constant float * space_weight, __constant int * space_ofs)
+                        __constant float * space_weight, __constant int * space_ofs)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
-
    if (y < dst_rows && x < dst_cols)
    {
        int src_index = mad24(y + radius, src_step, x + radius + src_offset);
        int dst_index = mad24(y, dst_step, x + dst_offset);
        float sum = 0.f, wsum = 0.f;
        int val0 = convert_int(src[src_index]);
-
        #pragma unroll
        for (int k = 0; k < maxk; k++ )
        {
            int val = convert_int(src[src_index + space_ofs[k]]);
-            float w = space_weight[k] * color_weight[abs(val - val0)];
+            float w = space_weight[k] * native_exp((float)((val - val0) * (val - val0) * gauss_color_coeff));
            sum += (float)(val) * w;
            wsum += w;
        }
        dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
    }
 }
+
+__kernel void bilateral_float(__global const uchar * src, int src_step, int src_offset,
+                              __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                              __constant float * space_weight, __constant int * space_ofs)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if (y < dst_rows && x < dst_cols)
+    {
+        int src_index = mad24(y + radius, src_step, x + radius + src_offset);
+        int dst_index = mad24(y, dst_step, x + dst_offset);
+        float sum = 0.f, wsum = 0.f;
+        float val0 = convert_float(src[src_index]);
+        #pragma unroll
+        for (int k = 0; k < maxk; k++ )
+        {
+            float val = convert_float(src[src_index + space_ofs[k]]);
+            float w = space_weight[k] * native_exp((val - val0) * (val - val0) * gauss_color_coeff);
+            sum += (float)(val) * w;
+            wsum += w;
+        }
+        dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
+    }
+}
+
+
+__kernel void bilateral_float4(__global const uchar * src, int src_step, int src_offset,
+                               __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                               __constant float * space_weight, __constant int * space_ofs)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    if (y < dst_rows && x < dst_cols / 4 )
+    {
+        int src_index = ((y + radius) * src_step) + x * 4  + (radius + src_offset);
+        int dst_index = (y  * dst_step) +  x * 4 + dst_offset ;
+        float4 sum = 0.f, wsum = 0.f;
+        float4 val0 = convert_float4(vload4(0, src + src_index));
+        #pragma unroll
+        for (int k = 0; k < maxk; k++ )
+        {
+            float4 val = convert_float4(vload4(0, src + src_index + space_ofs[k]));
+            float spacew = space_weight[k];
+            float4 w = spacew * native_exp((val - val0) * (val - val0) * gauss_color_coeff);
+            sum += val * w;
+            wsum += w;
+        }
+        sum = sum / wsum + .5f;
+        vstore4(convert_uchar4_rtz(sum), 0, dst + dst_index);
+    }
+}
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -2210,7 +2210,7 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
                                   double sigma_color, double sigma_space,
                                   int borderType)
 {
-    int type = _src.type(), cn = CV_MAT_CN(type);
+    int type = _src.type();
    int i, j, maxk, radius;

    if ( type != CV_8UC1 )
@@ -2237,19 +2237,14 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,

    copyMakeBorder(src, temp, radius, radius, radius, radius, borderType);

-    std::vector<float> _color_weight(cn * 256);
    std::vector<float> _space_weight(d * d);
    std::vector<int> _space_ofs(d * d);
-    float *color_weight = &_color_weight[0];
    float *space_weight = &_space_weight[0];
    int *space_ofs = &_space_ofs[0];

-    // initialize color-related bilateral filter coefficients
-    for( i = 0; i < 256 * cn; i++ )
-        color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
-
    // initialize space-related bilateral filter coefficients
    for( i = -radius, maxk = 0; i <= radius; i++ )
+    {
        for( j = -radius; j <= radius; j++ )
        {
            double r = std::sqrt((double)i * i + (double)j * j);
@@ -2258,26 +2253,43 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
            space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
            space_ofs[maxk++] = (int)(i * temp.step + j);
        }
+    }

-    ocl::Kernel k("bilateral", ocl::imgproc::bilateral_oclsrc,
-                  format("-D radius=%d -D maxk=%d", radius, maxk));
+    String kernelName("bilateral");
+    size_t sizeDiv = 1;
+
+    if ((ocl::Device::getDefault().isIntel()) &&
+        (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU))
+    {
+            //Intel GPU
+            if (dst.cols % 4 == 0)
+            {
+                    kernelName = "bilateral_float4";
+                    sizeDiv = 4;
+            }
+            else
+            {
+                    kernelName = "bilateral_float";
+            }
+    }
+    ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc,
+                  format("-D radius=%d -D maxk=%d "
+                         "-D gauss_color_coeff=%f", radius, maxk,
+                         (float)gauss_color_coeff));
    if (k.empty())
        return false;

-    Mat mcolor_weight(1, cn * 256, CV_32FC1, color_weight);
    Mat mspace_weight(1, d * d, CV_32FC1, space_weight);
    Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs);
-    UMat ucolor_weight, uspace_weight, uspace_ofs;
-    mcolor_weight.copyTo(ucolor_weight);
+    UMat uspace_weight, uspace_ofs;
    mspace_weight.copyTo(uspace_weight);
    mspace_ofs.copyTo(uspace_ofs);

    k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst),
-           ocl::KernelArg::PtrReadOnly(ucolor_weight),
           ocl::KernelArg::PtrReadOnly(uspace_weight),
           ocl::KernelArg::PtrReadOnly(uspace_ofs));

-    size_t globalsize[2] = { dst.cols, dst.rows };
+    size_t globalsize[2] = { dst.cols / sizeDiv, dst.rows };
    return k.run(2, globalsize, NULL, false);
 }