From a75807354d8b34947ee92e5a1c222f47875e46ea Mon Sep 17 00:00:00 2001
From: vbystricky <user@user-pc.(none)>
Date: Fri, 12 Sep 2014 15:32:36 +0400
Subject: [PATCH] Optimize OpenCL version function BackgroundSubstractionMOG2

---
 modules/video/src/bgfg_gaussmix2.cpp  |  64 ++++---
 modules/video/src/opencl/bgfg_mog2.cl | 260 +++++++++++++-------------
 2 files changed, 165 insertions(+), 159 deletions(-)

diff --git a/modules/video/src/bgfg_gaussmix2.cpp b/modules/video/src/bgfg_gaussmix2.cpp
index f7917f2e6..11adb74c5 100644
--- a/modules/video/src/bgfg_gaussmix2.cpp
+++ b/modules/video/src/bgfg_gaussmix2.cpp
@@ -188,10 +188,11 @@ public:
 
         int nchannels = CV_MAT_CN(frameType);
         CV_Assert( nchannels <= CV_CN_MAX );
+        CV_Assert( nmixtures <= 255);
 
         if (ocl::useOpenCL() && opencl_ON)
         {
-            kernel_apply.create("mog2_kernel", ocl::video::bgfg_mog2_oclsrc, format("-D CN=%d -D NMIXTURES=%d", nchannels, nmixtures));
+            create_ocl_apply_kernel();
             kernel_getBg.create("getBackgroundImage2_kernel", ocl::video::bgfg_mog2_oclsrc, format( "-D CN=%d -D NMIXTURES=%d", nchannels, nmixtures));
 
             if (kernel_apply.empty() || kernel_getBg.empty())
@@ -213,7 +214,7 @@ public:
             u_mean.setTo(Scalar::all(0));
 
             //make the array for keeping track of the used modes per pixel - all zeros at start
-            u_bgmodelUsedModes.create(frameSize, CV_32FC1);
+            u_bgmodelUsedModes.create(frameSize, CV_8UC1);
             u_bgmodelUsedModes.setTo(cv::Scalar::all(0));
         }
         else
@@ -259,7 +260,17 @@ public:
     virtual void setComplexityReductionThreshold(double ct) { fCT = (float)ct; }
 
     virtual bool getDetectShadows() const { return bShadowDetection; }
-    virtual void setDetectShadows(bool detectshadows) { bShadowDetection = detectshadows; }
+    virtual void setDetectShadows(bool detectshadows)
+    {
+        if ((bShadowDetection && detectshadows) || (!bShadowDetection && !detectshadows))
+            return;
+        bShadowDetection = detectshadows;
+        if (!kernel_apply.empty())
+        {
+            create_ocl_apply_kernel();
+            CV_Assert( !kernel_apply.empty() );
+        }
+    }
 
     virtual int getShadowValue() const { return nShadowDetection; }
     virtual void setShadowValue(int value) { nShadowDetection = (uchar)value; }
@@ -372,6 +383,7 @@ protected:
 
     bool ocl_getBackgroundImage(OutputArray backgroundImage) const;
     bool ocl_apply(InputArray _image, OutputArray _fgmask, double learningRate=-1);
+    void create_ocl_apply_kernel();
 };
 
 struct GaussBGStatModel2Params
@@ -745,16 +757,11 @@ bool BackgroundSubtractorMOG2Impl::ocl_apply(InputArray _image, OutputArray _fgm
     learningRate = learningRate >= 0 && nframes > 1 ? learningRate : 1./std::min( 2*nframes, history );
     CV_Assert(learningRate >= 0);
 
-    UMat fgmask(_image.size(), CV_32SC1);
-
-    fgmask.setTo(cv::Scalar::all(1));
+    _fgmask.create(_image.size(), CV_8U);
+    UMat fgmask = _fgmask.getUMat();
 
     const double alpha1 = 1.0f - learningRate;
 
-    int detectShadows_flag = 0;
-    if(bShadowDetection)
-        detectShadows_flag = 1;
-
     UMat frame = _image.getUMat();
 
     float varMax = MAX(fVarMin, fVarMax);
@@ -762,16 +769,15 @@ bool BackgroundSubtractorMOG2Impl::ocl_apply(InputArray _image, OutputArray _fgm
 
     int idxArg = 0;
     idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadOnly(frame));
-    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadWriteNoSize(u_bgmodelUsedModes));
-    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadWriteNoSize(u_weight));
-    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadWriteNoSize(u_mean));
-    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadWriteNoSize(u_variance));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_bgmodelUsedModes));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_weight));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_mean));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_variance));
     idxArg = kernel_apply.set(idxArg, ocl::KernelArg::WriteOnlyNoSize(fgmask));
 
     idxArg = kernel_apply.set(idxArg, (float)learningRate);        //alphaT
     idxArg = kernel_apply.set(idxArg, (float)alpha1);
     idxArg = kernel_apply.set(idxArg, (float)(-learningRate*fCT));   //prune
-    idxArg = kernel_apply.set(idxArg, detectShadows_flag);
 
     idxArg = kernel_apply.set(idxArg, (float)varThreshold); //c_Tb
     idxArg = kernel_apply.set(idxArg, backgroundRatio);     //c_TB
@@ -780,18 +786,11 @@ bool BackgroundSubtractorMOG2Impl::ocl_apply(InputArray _image, OutputArray _fgm
     idxArg = kernel_apply.set(idxArg, varMax);
     idxArg = kernel_apply.set(idxArg, fVarInit);
     idxArg = kernel_apply.set(idxArg, fTau);
-    kernel_apply.set(idxArg, nShadowDetection);
+    if (bShadowDetection)
+        kernel_apply.set(idxArg, nShadowDetection);
 
     size_t globalsize[] = {frame.cols, frame.rows, 1};
-
-    if (!(kernel_apply.run(2, globalsize, NULL, true)))
-        return false;
-
-    _fgmask.create(_image.size(),CV_8U);
-    UMat temp = _fgmask.getUMat();
-    fgmask.convertTo(temp, CV_8U);
-
-    return true;
+    return kernel_apply.run(2, globalsize, NULL, true);
 }
 
 bool BackgroundSubtractorMOG2Impl::ocl_getBackgroundImage(OutputArray _backgroundImage) const
@@ -802,10 +801,10 @@ bool BackgroundSubtractorMOG2Impl::ocl_getBackgroundImage(OutputArray _backgroun
     UMat dst = _backgroundImage.getUMat();
 
     int idxArg = 0;
-    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::ReadOnly(u_bgmodelUsedModes));
-    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::ReadOnlyNoSize(u_weight));
-    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::ReadOnlyNoSize(u_mean));
-    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::WriteOnlyNoSize(dst));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::PtrReadOnly(u_bgmodelUsedModes));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::PtrReadOnly(u_weight));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::PtrReadOnly(u_mean));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::WriteOnly(dst));
     kernel_getBg.set(idxArg, backgroundRatio);
 
     size_t globalsize[2] = {u_bgmodelUsedModes.cols, u_bgmodelUsedModes.rows};
@@ -815,6 +814,13 @@ bool BackgroundSubtractorMOG2Impl::ocl_getBackgroundImage(OutputArray _backgroun
 
 #endif
 
+void BackgroundSubtractorMOG2Impl::create_ocl_apply_kernel()
+{
+    int nchannels = CV_MAT_CN(frameType);
+    String opts = format("-D CN=%d -D NMIXTURES=%d%s", nchannels, nmixtures, bShadowDetection ? " -D SHADOW_DETECT" : "");
+    kernel_apply.create("mog2_kernel", ocl::video::bgfg_mog2_oclsrc, opts);
+}
+
 void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask, double learningRate)
 {
     bool needToInitialize = nframes == 0 || learningRate >= 1 || _image.size() != frameSize || _image.type() != frameType;
diff --git a/modules/video/src/opencl/bgfg_mog2.cl b/modules/video/src/opencl/bgfg_mog2.cl
index 9bc18b215..629f82d27 100644
--- a/modules/video/src/opencl/bgfg_mog2.cl
+++ b/modules/video/src/opencl/bgfg_mog2.cl
@@ -7,11 +7,6 @@
 #define frameToMean(a, b) (b) = *(a);
 #define meanToFrame(a, b) *b = convert_uchar_sat(a);
 
-inline float sqr(float val)
-{
-    return val * val;
-}
-
 inline float sum(float val)
 {
     return val;
@@ -34,63 +29,45 @@ inline float sum(float val)
     b.z = a[2]; \
     b.w = 0.0f;
 
-inline float sqr(const float4 val)
-{
-    return val.x * val.x + val.y * val.y + val.z * val.z;
-}
-
 inline float sum(const float4 val)
 {
     return (val.x + val.y + val.z);
 }
 
-inline void swap4(__global float4* ptr, int x, int y, int k, int rows, int ptr_step)
-{
-    float4 val = ptr[(k * rows + y) * ptr_step + x];
-    ptr[(k * rows + y) * ptr_step + x] = ptr[((k + 1) * rows + y) * ptr_step + x];
-    ptr[((k + 1) * rows + y) * ptr_step + x] = val;
-}
-
 #endif
 
-inline void swap(__global float* ptr, int x, int y, int k, int rows, int ptr_step)
-{
-    float val = ptr[(k * rows + y) * ptr_step + x];
-    ptr[(k * rows + y) * ptr_step + x] = ptr[((k + 1) * rows + y) * ptr_step + x];
-    ptr[((k + 1) * rows + y) * ptr_step + x] = val;
-}
-
-__kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col, //uchar || uchar3
-                          __global uchar* modesUsed, int modesUsed_step, int modesUsed_offset,                         //int
-                          __global uchar* weight, int weight_step, int weight_offset,                                  //float
-                          __global uchar* mean, int mean_step, int mean_offset,                                        //T_MEAN=float || float4
-                          __global uchar* variance, int var_step, int var_offset,                                      //float
-                          __global uchar* fgmask, int fgmask_step, int fgmask_offset,                                  //int
+__kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,  //uchar || uchar3
+                          __global uchar* modesUsed,                                                                    //uchar
+                          __global uchar* weight,                                                                       //float
+                          __global uchar* mean,                                                                         //T_MEAN=float || float4
+                          __global uchar* variance,                                                                     //float
+                          __global uchar* fgmask, int fgmask_step, int fgmask_offset,                                   //uchar
                           float alphaT, float alpha1, float prune,
-                          int detectShadows_flag,
-                          float c_Tb, float c_TB, float c_Tg, float c_varMin,                     //constants
-                          float c_varMax, float c_varInit, float c_tau, uchar c_shadowVal)
+                          float c_Tb, float c_TB, float c_Tg, float c_varMin,                                           //constants
+                          float c_varMax, float c_varInit, float c_tau
+#ifdef SHADOW_DETECT
+                          , uchar c_shadowVal
+#endif
+                          )
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    weight_step/= sizeof(float);
-    var_step   /= sizeof(float);
-    mean_step  /= (sizeof(float)*cnMode);
-
     if( x < frame_col && y < frame_row)
     {
-        __global const uchar* _frame = (frame + mad24( y, frame_step, x*CN + frame_offset));
+        __global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));
         T_MEAN pix;
         frameToMean(_frame, pix);
 
-        bool background = false; // true - the pixel classified as background
+        uchar foreground = 255; // 0 - the pixel classified as background
 
         bool fitsPDF = false; //if it remains zero a new GMM mode will be added
 
-        __global int* _modesUsed = (__global int*)(modesUsed + mad24( y, modesUsed_step, x*(int)(sizeof(int))));
-        int nmodes = _modesUsed[0];
-        int nNewModes = nmodes; //current number of modes in GMM
+        int pt_idx =  mad24(y, frame_col, x);
+        int idx_step = frame_row * frame_col;
+
+        __global uchar* _modesUsed = modesUsed + pt_idx;
+        uchar nmodes = _modesUsed[0];
 
         float totalWeight = 0.0f;
 
@@ -98,114 +75,130 @@ __kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame
         __global float* _variance = (__global float*)(variance);
         __global T_MEAN* _mean = (__global T_MEAN*)(mean);
 
-        for (int mode = 0; mode < nmodes; ++mode)
+        uchar mode = 0;
+        for (; mode < nmodes; ++mode)
         {
+            int mode_idx = mad24(mode, idx_step, pt_idx);
+            float c_weight = mad(alpha1, _weight[mode_idx], prune);
 
-            float c_weight = alpha1 * _weight[(mode * frame_row + y) * weight_step + x] + prune;
-            int swap_count = 0;
-            if (!fitsPDF)
+            float c_var = _variance[mode_idx];
+
+            T_MEAN c_mean = _mean[mode_idx];
+
+            T_MEAN diff = c_mean - pix;
+            float dist2 = dot(diff, diff);
+
+            if (totalWeight < c_TB && dist2 < c_Tb * c_var)
+                foreground = 0;
+
+            if (dist2 < c_Tg * c_var)
             {
-                float c_var = _variance[(mode * frame_row + y) * var_step + x];
+                fitsPDF = true;
+                c_weight += alphaT;
 
-                T_MEAN c_mean = _mean[(mode * frame_row + y) * mean_step + x];
+                float k = alphaT / c_weight;
+                T_MEAN mean_new = mad((T_MEAN)-k, diff, c_mean);
+                float variance_new  = clamp(mad(k, (dist2 - c_var), c_var), c_varMin, c_varMax);
 
-                T_MEAN diff = c_mean - pix;
-                float dist2 = sqr(diff);
-
-                if (totalWeight < c_TB && dist2 < c_Tb * c_var)
-                    background = true;
-
-                if (dist2 < c_Tg * c_var)
+                for (int i = mode; i > 0; --i)
                 {
-                    fitsPDF = true;
-                    c_weight += alphaT;
-                    float k = alphaT / c_weight;
+                    int prev_idx = mode_idx - idx_step;
+                    if (c_weight < _weight[prev_idx])
+                        break;
 
-                    _mean[(mode * frame_row + y) * mean_step + x] = c_mean - k * diff;
+                    _weight[mode_idx]   = _weight[prev_idx];
+                    _variance[mode_idx] = _variance[prev_idx];
+                    _mean[mode_idx]     = _mean[prev_idx];
 
-                    float varnew = c_var + k * (dist2 - c_var);
-                    varnew = fmax(varnew, c_varMin);
-                    varnew = fmin(varnew, c_varMax);
-
-                    _variance[(mode * frame_row + y) * var_step + x] = varnew;
-                    for (int i = mode; i > 0; --i)
-                    {
-                        if (c_weight < _weight[((i - 1) * frame_row + y) * weight_step + x])
-                            break;
-                        swap_count++;
-                        swap(_weight, x, y, i - 1, frame_row, weight_step);
-                        swap(_variance, x, y, i - 1, frame_row, var_step);
-                        #if (CN==1)
-                        swap(_mean, x, y, i - 1, frame_row, mean_step);
-                        #else
-                        swap4(_mean, x, y, i - 1, frame_row, mean_step);
-                        #endif
-                    }
+                    mode_idx = prev_idx;
                 }
-            } // !fitsPDF
+
+                _mean[mode_idx]     = mean_new;
+                _variance[mode_idx] = variance_new;
+                _weight[mode_idx]   = c_weight; //update weight by the calculated value
+
+                totalWeight += c_weight;
+
+                mode ++;
+
+                break;
+            }
+            if (c_weight < -prune)
+                c_weight = 0.0f;
+
+            _weight[mode_idx] = c_weight; //update weight by the calculated value
+            totalWeight += c_weight;
+        }
+
+        for (; mode < nmodes; ++mode)
+        {
+            int mode_idx = mad24(mode, idx_step, pt_idx);
+            float c_weight = mad(alpha1, _weight[mode_idx], prune);
 
             if (c_weight < -prune)
             {
                 c_weight = 0.0f;
-                nmodes--;
+                nmodes = mode;
+                break;
             }
-
-            _weight[((mode - swap_count) * frame_row + y) * weight_step + x] = c_weight; //update weight by the calculated value
+            _weight[mode_idx] = c_weight; //update weight by the calculated value
             totalWeight += c_weight;
         }
 
-        totalWeight = 1.f / totalWeight;
-        for (int mode = 0; mode < nmodes; ++mode)
-            _weight[(mode * frame_row + y) * weight_step + x] *= totalWeight;
-
-        nmodes = nNewModes;
+        if (0.f < totalWeight)
+        {
+            totalWeight = 1.f / totalWeight;
+            for (int mode = 0; mode < nmodes; ++mode)
+                _weight[mad24(mode, idx_step, pt_idx)] *= totalWeight;
+        }
 
         if (!fitsPDF)
         {
-            int mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;
+            uchar mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;
+            int mode_idx = mad24(mode, idx_step, pt_idx);
 
             if (nmodes == 1)
-                _weight[(mode * frame_row + y) * weight_step + x] = 1.f;
+                _weight[mode_idx] = 1.f;
             else
             {
-                _weight[(mode * frame_row + y) * weight_step + x] = alphaT;
+                _weight[mode_idx] = alphaT;
 
-                for (int i = 0; i < nmodes - 1; ++i)
-                    _weight[(i * frame_row + y) * weight_step + x] *= alpha1;
+                for (int i = pt_idx; i < mode_idx; i += idx_step)
+                    _weight[i] *= alpha1;
             }
 
-            _mean[(mode * frame_row + y) * mean_step + x] = pix;
-            _variance[(mode * frame_row + y) * var_step + x] = c_varInit;
-
             for (int i = nmodes - 1; i > 0; --i)
             {
-                if (alphaT < _weight[((i - 1) * frame_row + y) * weight_step + x])
+                int prev_idx = mode_idx - idx_step;
+                if (alphaT < _weight[prev_idx])
                     break;
 
-                swap(_weight, x, y, i - 1, frame_row, weight_step);
-                swap(_variance, x, y, i - 1, frame_row, var_step);
-                #if (CN==1)
-                swap(_mean, x, y, i - 1, frame_row, mean_step);
-                #else
-                swap4(_mean, x, y, i - 1, frame_row, mean_step);
-                #endif
+                _weight[mode_idx]   = _weight[prev_idx];
+                _variance[mode_idx] = _variance[prev_idx];
+                _mean[mode_idx]     = _mean[prev_idx];
+
+                mode_idx = prev_idx;
             }
+
+            _mean[mode_idx] = pix;
+            _variance[mode_idx] = c_varInit;
         }
 
         _modesUsed[0] = nmodes;
-        bool isShadow = false;
-        if (detectShadows_flag && !background)
+#ifdef SHADOW_DETECT
+        if (foreground)
         {
             float tWeight = 0.0f;
 
-            for (int mode = 0; mode < nmodes; ++mode)
+            for (uchar mode = 0; mode < nmodes; ++mode)
             {
-                T_MEAN c_mean = _mean[(mode * frame_row + y) * mean_step + x];
+                int mode_idx = mad24(mode, idx_step, pt_idx);
+                T_MEAN c_mean = _mean[mode_idx];
 
                 T_MEAN pix_mean = pix * c_mean;
 
                 float numerator = sum(pix_mean);
-                float denominator = sqr(c_mean);
+                float denominator = dot(c_mean, c_mean);
 
                 if (denominator == 0)
                     break;
@@ -214,60 +207,67 @@ __kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame
                 {
                     float a = numerator / denominator;
 
-                    T_MEAN dD = a * c_mean - pix;
+                    T_MEAN dD = mad(a, c_mean, -pix);
 
-                    if (sqr(dD) < c_Tb * _variance[(mode * frame_row + y) * var_step + x] * a * a)
+                    if (dot(dD, dD) < c_Tb * _variance[mode_idx] * a * a)
                     {
-                        isShadow = true;
+                        foreground = c_shadowVal;
                         break;
                     }
                 }
 
-                tWeight += _weight[(mode * frame_row + y) * weight_step + x];
+                tWeight += _weight[mode_idx];
                 if (tWeight > c_TB)
                     break;
             }
         }
-        __global int* _fgmask = (__global int*)(fgmask + mad24(y, fgmask_step, x*(int)(sizeof(int)) + fgmask_offset));
-        *_fgmask = background ? 0 : isShadow ? c_shadowVal : 255;
+#endif
+        __global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);
+        *_fgmask = (uchar)foreground;
     }
 }
 
-__kernel void getBackgroundImage2_kernel(__global const uchar* modesUsed, int modesUsed_step, int modesUsed_offset, int modesUsed_row, int modesUsed_col,
-                                         __global const uchar* weight, int weight_step, int weight_offset,
-                                         __global const uchar* mean, int mean_step, int mean_offset,
-                                         __global uchar* dst, int dst_step, int dst_offset,
+__kernel void getBackgroundImage2_kernel(__global const uchar* modesUsed,
+                                         __global const uchar* weight,
+                                         __global const uchar* mean,
+                                         __global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col,
                                          float c_TB)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < modesUsed_col && y < modesUsed_row)
+    if(x < dst_col && y < dst_row)
     {
-        __global int* _modesUsed = (__global int*)(modesUsed + mad24( y, modesUsed_step, x*(int)(sizeof(int))));
-        int nmodes = _modesUsed[0];
+        int pt_idx =  mad24(y, dst_col, x);
+
+        __global const uchar* _modesUsed = modesUsed + pt_idx;
+        uchar nmodes = _modesUsed[0];
 
         T_MEAN meanVal = (T_MEAN)F_ZERO;
 
         float totalWeight = 0.0f;
-
-        for (int mode = 0; mode < nmodes; ++mode)
+        __global const float* _weight = (__global const float*)weight;
+        __global const T_MEAN* _mean = (__global const T_MEAN*)(mean);
+        int idx_step = dst_row * dst_col;
+        for (uchar mode = 0; mode < nmodes; ++mode)
         {
-            __global const float* _weight = (__global const float*)(weight + mad24(mode * modesUsed_row + y, weight_step, x*(int)(sizeof(float))));
-            float c_weight = _weight[0];
+            int mode_idx = mad24(mode, idx_step, pt_idx);
+            float c_weight = _weight[mode_idx];
+            T_MEAN c_mean = _mean[mode_idx];
 
-            __global const T_MEAN* _mean = (__global const T_MEAN*)(mean + mad24(mode * modesUsed_row + y, mean_step, x*(int)(sizeof(float))*cnMode));
-            T_MEAN c_mean = _mean[0];
-            meanVal = meanVal + c_weight * c_mean;
+            meanVal = mad(c_weight, c_mean, meanVal);
 
             totalWeight += c_weight;
 
-            if(totalWeight > c_TB)
+            if (totalWeight > c_TB)
                 break;
         }
 
-        meanVal = meanVal * (1.f / totalWeight);
-        __global uchar* _dst = dst + y * dst_step + x*CN + dst_offset;
+        if (0.f < totalWeight)
+            meanVal = meanVal / totalWeight;
+        else
+            meanVal = (T_MEAN)(0.f);
+        __global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));
         meanToFrame(meanVal, _dst);
     }
 }
\ No newline at end of file