added gpu::FGDStatModel (Background/foreground segmentation)

2012-06-05 13:32:04 +00:00
parent 3c16c9c92d
commit bfb390e82a
13 changed files with 2200 additions and 28 deletions
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -3,7 +3,7 @@ if(ANDROID OR IOS)
 endif()

 set(the_description "GPU-accelerated Computer Vision")
-ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree)
+ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_legacy)

 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")

@@ -50,20 +50,20 @@ if (HAVE_CUDA)
  ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})
  #CUDA_BUILD_CLEAN_TARGET()

-  
+
  set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
-  
-  
+
+
  if(NOT APPLE)
    unset(CUDA_nvcuvid_LIBRARY CACHE)
    find_cuda_helper_libs(nvcuvid)
-	set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
+    set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
  endif()

  if(WIN32)
    unset(CUDA_nvcuvenc_LIBRARY CACHE)
    find_cuda_helper_libs(nvcuvenc)
-	set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
+    set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
  endif()

  if(NOT APPLE AND WITH_FFMPEG)
--- a/modules/gpu/doc/video.rst
+++ b/modules/gpu/doc/video.rst
@@ -294,6 +294,104 @@ Interpolates frames (images) using provided optical flow (displacement field).



+gpu::FGDStatModel
+-----------------
+.. ocv:class:: gpu::FGDStatModel
+
+Class used for background/foreground segmentation. ::
+
+    class FGDStatModel
+    {
+    public:
+        struct Params
+        {
+            ...
+        };
+
+        explicit FGDStatModel(int out_cn = 3);
+        explicit FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params = Params(), int out_cn = 3);
+
+        ~FGDStatModel();
+
+        void create(const cv::gpu::GpuMat& firstFrame, const Params& params = Params());
+        void release();
+
+        int update(const cv::gpu::GpuMat& curFrame);
+
+        //8UC3 or 8UC4 reference background image
+        cv::gpu::GpuMat background;
+
+        //8UC1 foreground image
+        cv::gpu::GpuMat foreground;
+
+        std::vector< std::vector<cv::Point> > foreground_regions;
+    };
+
+The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel that does not fit this model is deemed to be foreground. The class implements the algorithm described in [FGD2003]_.
+
+The results are available through the class fields:
+
+    .. ocv:member:: cv::gpu::GpuMat background
+
+        The output background image.
+
+    .. ocv:member:: cv::gpu::GpuMat foreground
+
+        The output foreground mask as an 8-bit binary image.
+
+    .. ocv:member:: std::vector< std::vector<cv::Point> > foreground_regions
+
+        The output foreground regions calculated by :ocv:func:`findContours`.
+
+
+
+gpu::FGDStatModel::FGDStatModel
+-------------------------------
+Constructors.
+
+.. ocv:function:: gpu::FGDStatModel::FGDStatModel(int out_cn = 3)
+.. ocv:function:: gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params = Params(), int out_cn = 3)
+
+    :param firstFrame: First frame from the video stream. Supports 3- and 4-channel input ( ``CV_8UC3`` and ``CV_8UC4`` ).
+
+    :param params: Algorithm's parameters. See [FGD2003]_ for explanation.
+
+    :param out_cn: Number of channels in the output result and the inner buffers. Can be 3 or 4. The 4-channel version requires more memory, but works a bit faster.
+
+.. seealso:: :ocv:func:`gpu::FGDStatModel::create`
+
+
+
+gpu::FGDStatModel::create
+-------------------------
+Initializes background model.
+
+.. ocv:function:: void gpu::FGDStatModel::create(const cv::gpu::GpuMat& firstFrame, const Params& params = Params())
+
+    :param firstFrame: First frame from the video stream. Supports 3- and 4-channel input ( ``CV_8UC3`` and ``CV_8UC4`` ).
+
+    :param params: Algorithm's parameters. See [FGD2003]_ for explanation.
+
+
+
+gpu::FGDStatModel::release
+--------------------------
+Releases the memory of all inner buffers.
+
+.. ocv:function:: void gpu::FGDStatModel::release()
+
+
+
+gpu::FGDStatModel::update
+--------------------------
+Updates the background model and returns the number of foreground regions.
+
+.. ocv:function:: int gpu::FGDStatModel::update(const cv::gpu::GpuMat& curFrame)
+
+    :param curFrame: Next video frame.
+
+
+
 gpu::VideoWriter_GPU
 ---------------------
 Video writer class.
@@ -731,3 +829,4 @@ Parse next video frame. Implementation must call this method after new frame was


 .. [Brox2004] T. Brox, A. Bruhn, N. Papenberg, J. Weickert. *High accuracy optical flow estimation based on a theory for warping*. ECCV 2004.
+.. [FGD2003] Liyuan Li, Weimin Huang, Irene Y.H. Gu, and Qi Tian. *Foreground Object Detection from Videos Containing Complex Background*. ACM MM2003 9p, 2003.
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1891,7 +1891,71 @@ CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
 CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);


-////////////////////////////////// Video Encoding //////////////////////////////////////////
+//////////////////////// Background/foreground segmentation ////////////////////////
+
+// Background/foreground segmentation model operating on GpuMat frames. Based on:
+// Foreground Object Detection from Videos Containing Complex Background.
+// Liyuan Li, Weimin Huang, Irene Y.H. Gu, and Qi Tian.
+// ACM MM2003 9p
+class CV_EXPORTS FGDStatModel
+{
+public:
+    // Tunable parameters of the algorithm (see the paper referenced above).
+    struct CV_EXPORTS Params
+    {
+        int Lc;  // Quantized levels per 'color' component. Power of two, typically 32, 64 or 128.
+        int N1c; // Number of color vectors used to model normal background color variation at a given pixel.
+        int N2c; // Number of color vectors retained at a given pixel. Must be > N1c, typically ~ 5/3 of N1c.
+        // Used to allow the first N1c vectors to adapt over time to a changing background.
+
+        int Lcc;  // Quantized levels per 'color co-occurrence' component. Power of two, typically 16, 32 or 64.
+        int N1cc; // Number of color co-occurrence vectors used to model normal background color variation at a given pixel.
+        int N2cc; // Number of color co-occurrence vectors retained at a given pixel. Must be > N1cc, typically ~ 5/3 of N1cc.
+        // Used to allow the first N1cc vectors to adapt over time to a changing background.
+
+        bool is_obj_without_holes; // If true, holes within foreground blobs are ignored. Defaults to true.
+        int perform_morphing;     // Number of erode-dilate-erode foreground-blob cleanup iterations.
+        // These erase one-pixel junk blobs and merge almost-touching blobs. Default value is 1.
+
+        float alpha1; // How quickly we forget old background pixel values seen. Typically set to 0.1.
+        float alpha2; // "Controls speed of feature learning". Depends on T. Typical value circa 0.005.
+        float alpha3; // Alternative to alpha2, used (e.g.) for quicker initial convergence. Typical value 0.1.
+
+        float delta;   // Affects color and color co-occurrence quantization, typically set to 2.
+        float T;       // A percentage value which determines when new features can be recognized as new background. (Typically 0.9).
+        float minArea; // Discard foreground blobs whose bounding box is smaller than this threshold.
+
+        // Constructs the default parameter set.
+        Params();
+    };
+
+    // out_cn - number of channels in the output result (can be 3 or 4).
+    // The 4-channel variant requires more memory, but is a bit faster.
+    explicit FGDStatModel(int out_cn = 3);
+    explicit FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params = Params(), int out_cn = 3);
+
+    ~FGDStatModel();
+
+    // Initializes the background model from the first frame of the stream (8UC3 or 8UC4).
+    void create(const cv::gpu::GpuMat& firstFrame, const Params& params = Params());
+    // Releases the memory of all inner buffers.
+    void release();
+
+    // Updates the background model with the next frame and returns the number of foreground regions.
+    int update(const cv::gpu::GpuMat& curFrame);
+
+    //8UC3 or 8UC4 reference background image
+    cv::gpu::GpuMat background;
+
+    //8UC1 foreground image
+    cv::gpu::GpuMat foreground;
+
+    // Foreground regions stored as point contours (host-side container).
+    std::vector< std::vector<cv::Point> > foreground_regions;
+
+private:
+    // Copy construction and assignment are declared private, so the model cannot be copied from outside.
+    FGDStatModel(const FGDStatModel&);
+    FGDStatModel& operator=(const FGDStatModel&);
+
+    // Implementation is hidden behind a forward-declared class owned through auto_ptr.
+    class Impl;
+    std::auto_ptr<Impl> impl_;
+};
+
+////////////////////////////////// Video Encoding //////////////////////////////////

 // Works only under Windows
 // Supports olny H264 video codec and AVI files
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
@@ -271,4 +271,120 @@ GPU_PERF_TEST_1(FarnebackOpticalFlowTest, cv::gpu::DeviceInfo)

 INSTANTIATE_TEST_CASE_P(Video, FarnebackOpticalFlowTest, ALL_DEVICES);

+//////////////////////////////////////////////////////
+// FGDStatModel
+
+// Measures gpu::FGDStatModel::update on ten consecutive frames of a video file.
+GPU_PERF_TEST(FGDStatModel, cv::gpu::DeviceInfo, std::string)
+{
+    // Run on the device given by the first test parameter.
+    cv::gpu::DeviceInfo deviceInfo = GET_PARAM(0);
+    cv::gpu::setDevice(deviceInfo.deviceID());
+
+    const std::string videoPath = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
+
+    cv::VideoCapture capture(videoPath);
+    ASSERT_TRUE(capture.isOpened());
+
+    // The first frame is consumed to initialize the model and is not timed.
+    cv::Mat h_frame;
+    capture >> h_frame;
+    ASSERT_FALSE(h_frame.empty());
+
+    cv::gpu::GpuMat d_frame(h_frame);
+    cv::gpu::FGDStatModel d_model(4);
+    d_model.create(d_frame);
+
+    declare.time(10);
+
+    for (int iter = 0; iter < 10; ++iter)
+    {
+        capture >> h_frame;
+        ASSERT_FALSE(h_frame.empty());
+
+        // Decoding and upload stay outside the timed section.
+        d_frame.upload(h_frame);
+
+        startTimer();
+        next();
+        d_model.update(d_frame);
+        stopTimer();
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Video, FGDStatModel, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
+
+//////////////////////////////////////////////////////
+// VideoWriter
+
+#ifdef WIN32
+
+// Measures gpu::VideoWriter_GPU::write on ten frames read from a video file.
+GPU_PERF_TEST(VideoWriter, cv::gpu::DeviceInfo, std::string)
+{
+    const double fps = 25.0;
+
+    cv::gpu::DeviceInfo deviceInfo = GET_PARAM(0);
+    cv::gpu::setDevice(deviceInfo.deviceID());
+
+    const std::string srcPath = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
+    // Output name: input path up to its first '.', with "_test.avi" appended.
+    const std::string dstPath = srcPath.substr(0, srcPath.find('.')) + "_test.avi";
+
+    cv::VideoCapture capture(srcPath);
+    ASSERT_TRUE( capture.isOpened() );
+
+    cv::gpu::VideoWriter_GPU d_writer;
+
+    cv::Mat h_frame;
+    cv::gpu::GpuMat d_frame;
+
+    declare.time(10);
+
+    for (int iter = 0; iter < 10; ++iter)
+    {
+        capture >> h_frame;
+        ASSERT_FALSE(h_frame.empty());
+
+        d_frame.upload(h_frame);
+
+        // The writer is opened lazily, once the frame size is known.
+        if (!d_writer.isOpened())
+        {
+            d_writer.open(dstPath, h_frame.size(), fps);
+        }
+
+        startTimer();
+        next();
+        d_writer.write(d_frame);
+        stopTimer();
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Video, VideoWriter, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
+
+#endif // WIN32
+
+//////////////////////////////////////////////////////
+// VideoReader
+
+// Measures gpu::VideoReader_GPU::read over ten frames of a video file.
+GPU_PERF_TEST(VideoReader, cv::gpu::DeviceInfo, std::string)
+{
+    cv::gpu::DeviceInfo deviceInfo = GET_PARAM(0);
+    cv::gpu::setDevice(deviceInfo.deviceID());
+
+    const std::string videoPath = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
+
+    cv::gpu::VideoReader_GPU d_reader(videoPath);
+    ASSERT_TRUE( d_reader.isOpened() );
+
+    cv::gpu::GpuMat d_frame;
+
+    // One untimed read before the measured cycle.
+    d_reader.read(d_frame);
+
+    declare.time(20);
+
+    TEST_CYCLE_N(10)
+    {
+        d_reader.read(d_frame);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Video, VideoReader, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
+
 #endif
--- a/modules/gpu/perf_cpu/perf_cpu_precomp.hpp
+++ b/modules/gpu/perf_cpu/perf_cpu_precomp.hpp
@@ -16,6 +16,7 @@
 #include "opencv2/video/video.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
 #include "opencv2/nonfree/nonfree.hpp"
+#include "opencv2/legacy/legacy.hpp"

 #include "perf_utility.hpp"

--- a/modules/gpu/perf_cpu/perf_video.cpp
+++ b/modules/gpu/perf_cpu/perf_video.cpp
@@ -105,4 +105,118 @@ GPU_PERF_TEST_1(FarnebackOpticalFlowTest, cv::gpu::DeviceInfo)

 INSTANTIATE_TEST_CASE_P(Video, FarnebackOpticalFlowTest, ALL_DEVICES);

+//////////////////////////////////////////////////////
+// FGDStatModel
+
+// Specialization of cv::Ptr's deleter hook for CvBGStatModel:
+// the owned model is released through cvReleaseBGStatModel.
+namespace cv
+{
+    template<> void Ptr<CvBGStatModel>::delete_obj()
+    {
+        cvReleaseBGStatModel(&obj);
+    }
+}
+
+// CPU counterpart of the FGDStatModel perf test: measures cvUpdateBGStatModel
+// (legacy C API) on ten consecutive frames of a video file.
+GPU_PERF_TEST(FGDStatModel, cv::gpu::DeviceInfo, std::string)
+{
+    const std::string videoPath = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
+
+    cv::VideoCapture capture(videoPath);
+    ASSERT_TRUE(capture.isOpened());
+
+    // The first frame is consumed to create the model and is not timed.
+    cv::Mat image;
+    capture >> image;
+    ASSERT_FALSE(image.empty());
+
+    IplImage iplImage = image;
+    cv::Ptr<CvBGStatModel> bgModel(cvCreateFGDStatModel(&iplImage));
+
+    declare.time(60);
+
+    for (int iter = 0; iter < 10; ++iter)
+    {
+        capture >> image;
+        ASSERT_FALSE(image.empty());
+
+        // Refresh the IplImage header so that it refers to the newly read frame.
+        iplImage = image;
+
+        startTimer(); next();
+        cvUpdateBGStatModel(&iplImage, bgModel);
+        stopTimer();
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Video, FGDStatModel, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
+
+//////////////////////////////////////////////////////
+// VideoWriter
+
+#ifdef WIN32
+
+// CPU counterpart of the VideoWriter perf test: measures cv::VideoWriter::write
+// on ten frames read from a video file.
+GPU_PERF_TEST(VideoWriter, cv::gpu::DeviceInfo, std::string)
+{
+    const double fps = 25.0;
+
+    const std::string srcPath = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
+    // Output name: input path up to its first '.', with "_test.avi" appended.
+    const std::string dstPath = srcPath.substr(0, srcPath.find('.')) + "_test.avi";
+
+    cv::VideoCapture capture(srcPath);
+    ASSERT_TRUE( capture.isOpened() );
+
+    cv::VideoWriter h_writer;
+
+    cv::Mat image;
+
+    declare.time(30);
+
+    for (int iter = 0; iter < 10; ++iter)
+    {
+        capture >> image;
+        ASSERT_FALSE(image.empty());
+
+        // The writer is opened lazily, once the frame size is known.
+        if (!h_writer.isOpened())
+        {
+            h_writer.open(dstPath, CV_FOURCC('H', '2', '6', '4'), image.size(), fps);
+        }
+
+        startTimer();
+        next();
+        h_writer.write(image);
+        stopTimer();
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Video, VideoWriter, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
+
+#endif // WIN32
+
+//////////////////////////////////////////////////////
+// VideoReader
+
+// CPU counterpart of the VideoReader perf test: measures frame decoding
+// through cv::VideoCapture over ten frames.
+GPU_PERF_TEST(VideoReader, cv::gpu::DeviceInfo, std::string)
+{
+    const std::string videoPath = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
+
+    cv::VideoCapture capture(videoPath);
+    ASSERT_TRUE( capture.isOpened() );
+
+    cv::Mat image;
+
+    // One untimed read before the measured cycle.
+    capture >> image;
+
+    declare.time(20);
+
+    TEST_CYCLE_N(10)
+    {
+        capture >> image;
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Video, VideoReader, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
+
 #endif
--- a/modules/gpu/src/cuda/fgd_bgfg.cu
+++ b/modules/gpu/src/cuda/fgd_bgfg.cu
@@ -0,0 +1,802 @@
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/limits.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "fgd_bgfg_common.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace bgfg
+{
+    ////////////////////////////////////////////////////////////////////////////
+    // calcDiffHistogram
+
+    // Width of an unsigned int in bits; used to place the thread tag in the top bits of a counter.
+    const unsigned int UINT_BITS = 32U;
+    // log2 of the warp size and the warp size itself (32 threads).
+    const int LOG_WARP_SIZE = 5;
+    const int WARP_SIZE = 1 << LOG_WARP_SIZE;
+#if (__CUDA_ARCH__ < 120)
+    // Low (UINT_BITS - LOG_WARP_SIZE) bits of a counter hold the count; the top LOG_WARP_SIZE bits hold the tag.
+    const unsigned int TAG_MASK = (1U << (UINT_BITS - LOG_WARP_SIZE)) - 1U;
+#endif
+
+    // Threads per block in mergeHistogram; its reduction steps (128, 64, 32, ...) are written for this value.
+    const int MERGE_THREADBLOCK_SIZE = 256;
+
+    // Increments bin `data` of the histogram pointed to by s_WarpHist_.
+    //
+    // s_WarpHist_ - histogram to update
+    // data        - index of the bin to increment
+    // threadTag   - tag of the calling thread placed in the top bits of the counter
+    //               (only used when compiled for __CUDA_ARCH__ < 120)
+    __device__ __forceinline__ void addByte(unsigned int* s_WarpHist_, unsigned int data, unsigned int threadTag)
+    {
+        #if (__CUDA_ARCH__ < 120)
+            // Software fallback: write "tag | (count + 1)" and repeat until the value read back
+            // is the one this thread wrote, i.e. until this thread's update is the one that stuck.
+            volatile unsigned int* s_WarpHist = s_WarpHist_;
+            unsigned int count;
+            do
+            {
+                count = s_WarpHist[data] & TAG_MASK;
+                count = threadTag | (count + 1);
+                s_WarpHist[data] = count;
+            } while (s_WarpHist[data] != count);
+        #else
+            // Hardware atomic increment; the wrap limit is the maximum unsigned value.
+            atomicInc(s_WarpHist_ + data, (unsigned int)(-1));
+        #endif
+    }
+
+
+    // Builds, per thread block, one partial histogram of the absolute per-channel
+    // differences between prevFrame and curFrame.
+    //
+    // prevFrame   - previous frame; its rows/cols define the number of pixels processed
+    // curFrame    - current frame, read at the same (y, x) positions
+    // partialBuf0, partialBuf1, partialBuf2
+    //             - outputs for the x, y and z components; the block with index b writes
+    //               elements [b * HISTOGRAM_BIN_COUNT, (b + 1) * HISTOGRAM_BIN_COUNT)
+    template <typename PT, typename CT>
+    __global__ void calcPartialHistogram(const DevMem2D_<PT> prevFrame, const PtrStep_<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
+    {
+        // Warps per block depend on the target architecture. The indexing below is written
+        // for a block of exactly HISTOGRAM_WARP_COUNT * WARP_SIZE threads.
+#if (__CUDA_ARCH__ < 200)
+        const int HISTOGRAM_WARP_COUNT = 4;
+#else
+        const int HISTOGRAM_WARP_COUNT = 6;
+#endif
+        const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
+        const int HISTOGRAM_THREADBLOCK_MEMORY = HISTOGRAM_WARP_COUNT * HISTOGRAM_BIN_COUNT;
+
+        //Per-warp subhistogram storage
+        __shared__ unsigned int s_Hist0[HISTOGRAM_THREADBLOCK_MEMORY];
+        __shared__ unsigned int s_Hist1[HISTOGRAM_THREADBLOCK_MEMORY];
+        __shared__ unsigned int s_Hist2[HISTOGRAM_THREADBLOCK_MEMORY];
+
+        //Clear shared memory storage for current threadblock before processing
+        #pragma unroll
+        for (int i = 0; i < (HISTOGRAM_THREADBLOCK_MEMORY / HISTOGRAM_THREADBLOCK_SIZE); ++i)
+        {
+           s_Hist0[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
+           s_Hist1[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
+           s_Hist2[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
+        }
+        __syncthreads();
+
+        // Index of this thread's warp inside the block.
+        const unsigned int warpId = threadIdx.x >> LOG_WARP_SIZE;
+
+        // Each warp accumulates into its own slice of the shared histograms.
+        unsigned int* s_WarpHist0 = s_Hist0 + warpId * HISTOGRAM_BIN_COUNT;
+        unsigned int* s_WarpHist1 = s_Hist1 + warpId * HISTOGRAM_BIN_COUNT;
+        unsigned int* s_WarpHist2 = s_Hist2 + warpId * HISTOGRAM_BIN_COUNT;
+
+        // The shift keeps only the low LOG_WARP_SIZE bits of threadIdx.x, moved to the top of the word.
+        const unsigned int tag = threadIdx.x << (UINT_BITS - LOG_WARP_SIZE);
+        const int dataCount = prevFrame.rows * prevFrame.cols;
+        // Pixels are visited by linear index, advancing by block size times PARTIAL_HISTOGRAM_COUNT.
+        for (unsigned int pos = blockIdx.x * HISTOGRAM_THREADBLOCK_SIZE + threadIdx.x; pos < dataCount; pos += HISTOGRAM_THREADBLOCK_SIZE * PARTIAL_HISTOGRAM_COUNT)
+        {
+            const unsigned int y = pos / prevFrame.cols;
+            const unsigned int x = pos % prevFrame.cols;
+
+            PT prevVal = prevFrame(y, x);
+            CT curVal = curFrame(y, x);
+
+            // Absolute difference per channel; each value is used directly as a bin index.
+            // NOTE(review): assumes HISTOGRAM_BIN_COUNT (declared in fgd_bgfg_common.hpp) covers every possible difference - confirm.
+            int3 diff = make_int3(
+                ::abs(curVal.x - prevVal.x),
+                ::abs(curVal.y - prevVal.y),
+                ::abs(curVal.z - prevVal.z)
+            );
+
+            addByte(s_WarpHist0, diff.x, tag);
+            addByte(s_WarpHist1, diff.y, tag);
+            addByte(s_WarpHist2, diff.z, tag);
+        }
+        __syncthreads();
+
+        //Merge per-warp histograms into per-block and write to global memory
+        for (unsigned int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += HISTOGRAM_THREADBLOCK_SIZE)
+        {
+            unsigned int sum0 = 0;
+            unsigned int sum1 = 0;
+            unsigned int sum2 = 0;
+
+            #pragma unroll
+            for (int i = 0; i < HISTOGRAM_WARP_COUNT; ++i)
+            {
+                #if (__CUDA_ARCH__ < 120)
+                    // Strip the thread tag stored in the top bits by addByte.
+                    sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
+                    sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
+                    sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
+                #else
+                    sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT];
+                    sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT];
+                    sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT];
+                #endif
+            }
+
+            partialBuf0[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum0;
+            partialBuf1[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum1;
+            partialBuf2[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum2;
+        }
+    }
+
+    // Sums the partial histograms into the final ones. Each block handles the single bin
+    // with index blockIdx.x: it adds up that bin over all PARTIAL_HISTOGRAM_COUNT partial
+    // histograms and writes the total to hist0/hist1/hist2[blockIdx.x].
+    //
+    // partialBuf0..2 - partial histograms, HISTOGRAM_BIN_COUNT entries each, stored back to back
+    // hist0..2       - output histograms, one entry per bin
+    //
+    // The reduction below is written for a block of MERGE_THREADBLOCK_SIZE (256) threads.
+    __global__ void mergeHistogram(const unsigned int* partialBuf0, const unsigned int* partialBuf1, const unsigned int* partialBuf2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2)
+    {
+        unsigned int sum0 = 0;
+        unsigned int sum1 = 0;
+        unsigned int sum2 = 0;
+
+        // Each thread accumulates a strided subset of the partial histograms for this bin.
+        #pragma unroll
+        for (unsigned int i = threadIdx.x; i < PARTIAL_HISTOGRAM_COUNT; i += MERGE_THREADBLOCK_SIZE)
+        {
+            sum0 += partialBuf0[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
+            sum1 += partialBuf1[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
+            sum2 += partialBuf2[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
+        }
+
+        // Per-thread sums are combined through shared memory, halving the active range each step.
+        __shared__ unsigned int data0[MERGE_THREADBLOCK_SIZE];
+        __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
+        __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
+
+        data0[threadIdx.x] = sum0;
+        data1[threadIdx.x] = sum1;
+        data2[threadIdx.x] = sum2;
+        __syncthreads();
+
+        if (threadIdx.x < 128)
+        {
+            data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128];
+            data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128];
+            data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128];
+        }
+        __syncthreads();
+
+        if (threadIdx.x < 64)
+        {
+            data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64];
+            data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64];
+            data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64];
+        }
+        __syncthreads();
+
+        if (threadIdx.x < 32)
+        {
+            // Final steps use volatile accesses and no barriers.
+            // NOTE(review): this presumably relies on the 32 threads of a warp executing in lockstep - confirm for the targeted architectures.
+            volatile unsigned int* vdata0 = data0;
+            volatile unsigned int* vdata1 = data1;
+            volatile unsigned int* vdata2 = data2;
+
+            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32];
+            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32];
+            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32];
+
+            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16];
+            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16];
+            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16];
+
+            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8];
+            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8];
+            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8];
+
+            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4];
+            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4];
+            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4];
+
+            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2];
+            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2];
+            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2];
+
+            vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1];
+            vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1];
+            vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1];
+        }
+
+        // Thread 0 now holds the block-wide totals for this bin.
+        if(threadIdx.x == 0)
+        {
+            hist0[blockIdx.x] = sum0;
+            hist1[blockIdx.x] = sum1;
+            hist2[blockIdx.x] = sum2;
+        }
+    }
+
+    // Host launcher: computes the three per-channel histograms of |curFrame - prevFrame|.
+    //
+    // prevFrame, curFrame - input frames, reinterpreted as PT / CT pixels
+    // hist0..2            - output histograms (device memory)
+    // partialBuf0..2      - scratch buffers receiving one partial histogram per block
+    // cc                  - compute capability used to pick the block size (< 20 -> 4 warps, otherwise 6)
+    // stream              - CUDA stream to launch on; the default stream (0) is synchronized before returning
+    template <typename PT, typename CT>
+    void calcDiffHistogram_gpu(DevMem2Db prevFrame, DevMem2Db curFrame,
+                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
+                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
+                               int cc, cudaStream_t stream)
+    {
+        // Has to agree with the per-architecture warp count chosen inside calcPartialHistogram.
+        int warpsPerBlock = 6;
+        if (cc < 20)
+            warpsPerBlock = 4;
+
+        const int threadsPerBlock = warpsPerBlock * WARP_SIZE;
+
+        // Pass 1: one partial histogram per block.
+        calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, threadsPerBlock, 0, stream>>>(
+                (DevMem2D_<PT>)prevFrame, (DevMem2D_<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
+        cudaSafeCall( cudaGetLastError() );
+
+        // Pass 2: one block per bin sums the partial histograms.
+        mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+        {
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+
+    template void calcDiffHistogram_gpu<uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+    template void calcDiffHistogram_gpu<uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+
+    /////////////////////////////////////////////////////////////////////////
+    // calcDiffThreshMask
+
+    // Per-pixel change detection between two frames.
+    //
+    // Each thread handles one pixel (x, y). The absolute per-channel difference between
+    // curFrame and prevFrame is computed on the x/y/z components; if any component exceeds
+    // the matching component of bestThres, changeMask(y, x) is set to 255.
+    // Pixels that do not exceed the threshold are left untouched.
+    // NOTE(review): this presumably relies on the caller clearing changeMask beforehand - confirm.
+    //
+    // prevFrame  - previous frame; also supplies the image bounds (rows, cols)
+    // curFrame   - current frame, indexed with the same (y, x) as prevFrame
+    // bestThres  - per-channel difference thresholds
+    // changeMask - output mask
+    template <typename PT, typename CT>
+    __global__ void calcDiffThreshMask(const DevMem2D_<PT> prevFrame, const PtrStep_<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
+    {
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+        // The launch grid is rounded up to whole blocks, so threads with y == rows or
+        // x == cols can exist; valid indices are 0 .. size - 1, hence ">=" rather than ">".
+        if (y >= prevFrame.rows || x >= prevFrame.cols)
+            return;
+
+        PT prevVal = prevFrame(y, x);
+        CT curVal = curFrame(y, x);
+
+        int3 diff = make_int3(
+            ::abs(curVal.x - prevVal.x),
+            ::abs(curVal.y - prevVal.y),
+            ::abs(curVal.z - prevVal.z)
+        );
+
+        if (diff.x > bestThres.x || diff.y > bestThres.y || diff.z > bestThres.z)
+            changeMask(y, x) = 255;
+    }
+
+    // Host launcher for calcDiffThreshMask.
+    //
+    // prevFrame, curFrame - input frames, reinterpreted as PT / CT pixels
+    // bestThres           - per-channel thresholds forwarded to the kernel
+    // changeMask          - output mask
+    // stream              - CUDA stream to launch on; the default stream (0) is synchronized before returning
+    template <typename PT, typename CT>
+    void calcDiffThreshMask_gpu(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream)
+    {
+        // 32x8 thread tiles; the grid is rounded up so that the whole frame is covered.
+        const dim3 threads(32, 8);
+        const dim3 blocks(divUp(prevFrame.cols, threads.x), divUp(prevFrame.rows, threads.y));
+
+        calcDiffThreshMask<PT, CT><<<blocks, threads, 0, stream>>>((DevMem2D_<PT>)prevFrame, (DevMem2D_<CT>)curFrame, bestThres, changeMask);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+        {
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+
+    template void calcDiffThreshMask_gpu<uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream);
+    template void calcDiffThreshMask_gpu<uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream);
+    template void calcDiffThreshMask_gpu<uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream);
+    template void calcDiffThreshMask_gpu<uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, uchar3 bestThres, DevMem2Db changeMask, cudaStream_t stream);
+
+    /////////////////////////////////////////////////////////////////////////
+    // bgfgClassification
+
+    // Per-pixel background statistics descriptor, kept in constant memory and read by bgfgClassification.
+    __constant__ BGPixelStat c_stat;
+
+    // Copies the host-side descriptor `stat` into the device constant c_stat.
+    void setBGPixelStat(const BGPixelStat& stat)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(c_stat, &stat, sizeof(BGPixelStat)) );
+    }
+
+    // Builds a pixel of type T from three channel values.
+    // Only the uchar3 and uchar4 specializations below are defined.
+    template <typename T> struct Output;
+    template <> struct Output<uchar3>
+    {
+        static __device__ __forceinline__ uchar3 make(uchar v0, uchar v1, uchar v2)
+        {
+            return make_uchar3(v0, v1, v2);
+        }
+    };
+    template <> struct Output<uchar4>
+    {
+        // The fourth component is always set to 255.
+        static __device__ __forceinline__ uchar4 make(uchar v0, uchar v1, uchar v2)
+        {
+            return make_uchar4(v0, v1, v2, 255);
+        }
+    };
+
+    // Classifies changed pixels as foreground (1) or background (0).
+    //
+    // Each thread handles one pixel (i = row, j = column). Only pixels flagged in Ftd or Fbd
+    // are processed; for every other pixel `foreground` is left untouched.
+    //
+    //  - Pixel flagged in Ftd: if c_stat reports no trained dynamic model for it, it is marked
+    //    foreground. Otherwise the (previous, current) color pair is compared with the stored
+    //    co-occurrence vectors (at most N1cc of them, stopping at the first whose PV_CC does
+    //    not exceed alpha2); the PV/PVB values of the vectors matching within deltaCC on every
+    //    channel are accumulated.
+    //  - Pixel flagged only in Fbd, with a trained stationary model: the current color is
+    //    compared with the stored color vectors (at most N1c, same alpha2 cut-off) within deltaC.
+    //  - In both cases the pixel is marked foreground when 2 * Pvb * Pb <= Pv.
+    //
+    // prevFrame  - previous frame; also supplies the image bounds (rows, cols)
+    // curFrame   - current frame
+    // Ftd, Fbd   - change masks; non-zero selects the pixel for classification
+    // foreground - output mask, written with 0 or 1 for the selected pixels
+    // deltaC, deltaCC, alpha2, N1c, N1cc - thresholds and vector counts described above
+    template <typename PT, typename CT, typename OT>
+    __global__ void bgfgClassification(const DevMem2D_<PT> prevFrame, const PtrStep_<CT> curFrame,
+                                       const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
+                                       int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
+    {
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+
+        // The launch grid is rounded up to whole blocks, so threads with i == rows or
+        // j == cols can exist; valid indices are 0 .. size - 1, hence ">=" rather than ">".
+        if (i >= prevFrame.rows || j >= prevFrame.cols)
+            return;
+
+        if (Fbd(i, j) || Ftd(i, j))
+        {
+            float Pb  = 0.0f;
+            float Pv  = 0.0f;
+            float Pvb = 0.0f;
+
+            // Classification result: 1 = foreground, 0 = background.
+            int val = 0;
+
+            // Is it a motion pixel?
+            if (Ftd(i, j))
+            {
+                // Without a trained model there is nothing to compare against.
+                if (!c_stat.is_trained_dyn_model(i, j))
+                    val = 1;
+                else
+                {
+                    PT prevVal = prevFrame(i, j);
+                    CT curVal = curFrame(i, j);
+
+                    // Compare with stored CCt vectors:
+                    for (int k = 0; k < N1cc && c_stat.PV_CC(i, j, k) > alpha2; ++k)
+                    {
+                        OT v1 = c_stat.V1_CC<OT>(i, j, k);
+                        OT v2 = c_stat.V2_CC<OT>(i, j, k);
+
+                        // v1 is matched against the previous color, v2 against the current one.
+                        if (::abs(v1.x - prevVal.x) <= deltaCC &&
+                            ::abs(v1.y - prevVal.y) <= deltaCC &&
+                            ::abs(v1.z - prevVal.z) <= deltaCC &&
+                            ::abs(v2.x - curVal.x) <= deltaCC &&
+                            ::abs(v2.y - curVal.y) <= deltaCC &&
+                            ::abs(v2.z - curVal.z) <= deltaCC)
+                        {
+                            Pv += c_stat.PV_CC(i, j, k);
+                            Pvb += c_stat.PVB_CC(i, j, k);
+                        }
+                    }
+
+                    Pb = c_stat.Pbcc(i, j);
+                    if (2 * Pvb * Pb <= Pv)
+                        val = 1;
+                }
+            }
+            else if(c_stat.is_trained_st_model(i, j))
+            {
+                CT curVal = curFrame(i, j);
+
+                // Compare with stored Ct vectors:
+                for (int k = 0; k < N1c && c_stat.PV_C(i, j, k) > alpha2; ++k)
+                {
+                    OT v = c_stat.V_C<OT>(i, j, k);
+
+                    if (::abs(v.x - curVal.x) <= deltaC &&
+                        ::abs(v.y - curVal.y) <= deltaC &&
+                        ::abs(v.z - curVal.z) <= deltaC)
+                    {
+                        Pv += c_stat.PV_C(i, j, k);
+                        Pvb += c_stat.PVB_C(i, j, k);
+                    }
+                }
+                Pb = c_stat.Pbc(i, j);
+                if (2 * Pvb * Pb <= Pv)
+                    val = 1;
+            }
+
+            // Update foreground:
+            foreground(i, j) = static_cast<uchar>(val);
+        } // end if( change detection...
+    }
+
+    // Host launcher for bgfgClassification.
+    //
+    // prevFrame, curFrame - input frames, reinterpreted as PT / CT pixels
+    // Ftd, Fbd            - change masks selecting the pixels to classify
+    // foreground          - output mask
+    // deltaC, deltaCC, alpha2, N1c, N1cc - forwarded unchanged to the kernel
+    // stream              - CUDA stream to launch on; the default stream (0) is synchronized before returning
+    template <typename PT, typename CT, typename OT>
+    void bgfgClassification_gpu(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground,
+                                int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
+    {
+        // 32x8 thread tiles; the grid is rounded up so that the whole frame is covered.
+        const dim3 threads(32, 8);
+        const dim3 blocks(divUp(prevFrame.cols, threads.x), divUp(prevFrame.rows, threads.y));
+
+        // Request the L1-preferring cache configuration for this kernel before launching it.
+        cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );
+
+        bgfgClassification<PT, CT, OT><<<blocks, threads, 0, stream>>>(
+            (DevMem2D_<PT>)prevFrame, (DevMem2D_<CT>)curFrame,
+            Ftd, Fbd, foreground,
+            deltaC, deltaCC, alpha2, N1c, N1cc);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+        {
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+
+    // Explicit instantiations of bgfgClassification_gpu for every combination of uchar3 / uchar4
+    // as the previous-frame (PT), current-frame (CT) and output (OT) pixel types.
+    template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+    template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+
+    ////////////////////////////////////////////////////////////////////////////
+    // updateBackgroundModel
+
+    // Updates the per-pixel statistics held in c_stat and the reference background image.
+    // Each thread handles one pixel (i = row, j = column).
+    //
+    //   cols, rows          - frame size; threads that fall outside the frame exit immediately
+    //   prevFrame, curFrame - previous and current frame (only the .x/.y/.z channels are read)
+    //   Ftd, Fbd            - per-pixel change masks. The CC table (previous + current colour pairs)
+    //                         is updated when Ftd is non-zero or the dynamic model is untrained;
+    //                         the C table (current colour only) is updated when Ftd is zero
+    //   foreground          - classification mask, read only (non-zero = foreground)
+    //   background          - reference background image, rewritten for non-foreground pixels
+    //   deltaC, deltaCC     - per-channel match tolerance for the C and CC tables
+    //   alpha1              - weight of the current frame in the background IIR filter
+    //   alpha2, alpha3      - learning rate when the pixel model is trained / not yet trained
+    //   N1c, N1cc           - number of leading table entries summed for the "once-off" change test
+    //   N2c, N2cc           - number of table entries searched per pixel in the C / CC tables
+    //   T                   - threshold applied to the accumulated probabilities
+    template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
+    __global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
+                                          PtrStepb foreground, PtrStep_<OT> background,
+                                          int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
+    {
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+
+        // The launch grid is rounded up to whole blocks, so threads with i == rows or j == cols
+        // exist and must be rejected as well; a strict ">" would let them access one row/column
+        // past the end of the frame and of the statistics tables.
+        if (i >= rows || j >= cols)
+            return;
+
+        // Table entries whose probability decays below this value are reset to zero.
+        const float MIN_PV = 1e-10f;
+
+        const uchar is_trained_dyn_model = c_stat.is_trained_dyn_model(i, j);
+        if (Ftd(i, j) || !is_trained_dyn_model)
+        {
+            const float alpha = is_trained_dyn_model ? alpha2 : alpha3;
+
+            float Pbcc = c_stat.Pbcc(i, j);
+
+            //update Pb
+            Pbcc *= (1.0f - alpha);
+            if (!foreground(i, j))
+            {
+                Pbcc += alpha;
+            }
+
+            int min_dist = numeric_limits<int>::max();
+            int indx = -1;
+
+            PT prevVal = prevFrame(i, j);
+            CT curVal = curFrame(i, j);
+
+            // Find best Vi match:
+            for (int k = 0; k < N2cc; ++k)
+            {
+                float PV_CC = c_stat.PV_CC(i, j, k);
+                if (!PV_CC)
+                    break;
+
+                if (PV_CC < MIN_PV)
+                {
+                    c_stat.PV_CC(i, j, k) = 0;
+                    c_stat.PVB_CC(i, j, k) = 0;
+                    continue;
+                }
+
+                // Exponential decay of memory
+                c_stat.PV_CC(i, j, k) = PV_CC * (1.0f - alpha);
+                c_stat.PVB_CC(i, j, k) = c_stat.PVB_CC(i, j, k) * (1.0f - alpha);
+
+                OT v1 = c_stat.V1_CC<OT>(i, j, k);
+
+                int3 val1 = make_int3(
+                    ::abs(v1.x - prevVal.x),
+                    ::abs(v1.y - prevVal.y),
+                    ::abs(v1.z - prevVal.z)
+                );
+
+                OT v2 = c_stat.V2_CC<OT>(i, j, k);
+
+                int3 val2 = make_int3(
+                    ::abs(v2.x - curVal.x),
+                    ::abs(v2.y - curVal.y),
+                    ::abs(v2.z - curVal.z)
+                );
+
+                int dist = val1.x + val1.y + val1.z + val2.x + val2.y + val2.z;
+
+                // Keep the closest entry whose every channel lies within deltaCC.
+                if (dist < min_dist &&
+                    val1.x <= deltaCC && val1.y <= deltaCC && val1.z <= deltaCC &&
+                    val2.x <= deltaCC && val2.y <= deltaCC && val2.z <= deltaCC)
+                {
+                    min_dist = dist;
+                    indx = k;
+                }
+            }
+
+            if (indx < 0)
+            {
+                // Replace N2th elem in the table by new feature:
+                indx = N2cc - 1;
+                c_stat.PV_CC(i, j, indx) = alpha;
+                c_stat.PVB_CC(i, j, indx) = alpha;
+
+                //update Vt
+                c_stat.V1_CC<OT>(i, j, indx) = Output<OT>::make(prevVal.x, prevVal.y, prevVal.z);
+                c_stat.V2_CC<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
+            }
+            else
+            {
+                // Update:
+                c_stat.PV_CC(i, j, indx) += alpha;
+
+                if (!foreground(i, j))
+                {
+                    c_stat.PVB_CC(i, j, indx) += alpha;
+                }
+            }
+
+            //re-sort CCt table by Pv
+            // The updated entry is inserted in front of the first entry whose Pv is not larger,
+            // and the entries between that position and indx are shifted down by one.
+            const float PV_CC_indx = c_stat.PV_CC(i, j, indx);
+            const float PVB_CC_indx = c_stat.PVB_CC(i, j, indx);
+            const OT V1_CC_indx = c_stat.V1_CC<OT>(i, j, indx);
+            const OT V2_CC_indx = c_stat.V2_CC<OT>(i, j, indx);
+            for (int k = 0; k < indx; ++k)
+            {
+                if (c_stat.PV_CC(i, j, k) <= PV_CC_indx)
+                {
+                    //shift elements
+                    float Pv_tmp1;
+                    float Pv_tmp2 = PV_CC_indx;
+
+                    float Pvb_tmp1;
+                    float Pvb_tmp2 = PVB_CC_indx;
+
+                    OT v1_tmp1;
+                    OT v1_tmp2 = V1_CC_indx;
+
+                    OT v2_tmp1;
+                    OT v2_tmp2 = V2_CC_indx;
+
+                    for (int l = k; l <= indx; ++l)
+                    {
+                        Pv_tmp1 = c_stat.PV_CC(i, j, l);
+                        c_stat.PV_CC(i, j, l) = Pv_tmp2;
+                        Pv_tmp2 = Pv_tmp1;
+
+                        Pvb_tmp1 = c_stat.PVB_CC(i, j, l);
+                        c_stat.PVB_CC(i, j, l) = Pvb_tmp2;
+                        Pvb_tmp2 = Pvb_tmp1;
+
+                        v1_tmp1 = c_stat.V1_CC<OT>(i, j, l);
+                        c_stat.V1_CC<OT>(i, j, l) = v1_tmp2;
+                        v1_tmp2 = v1_tmp1;
+
+                        v2_tmp1 = c_stat.V2_CC<OT>(i, j, l);
+                        c_stat.V2_CC<OT>(i, j, l) = v2_tmp2;
+                        v2_tmp2 = v2_tmp1;
+                    }
+
+                    break;
+                }
+            }
+
+            float sum1 = 0.0f;
+            float sum2 = 0.0f;
+
+            //check "once-off" changes
+            for (int k = 0; k < N1cc; ++k)
+            {
+                const float PV_CC = c_stat.PV_CC(i, j, k);
+                if (!PV_CC)
+                    break;
+
+                sum1 += PV_CC;
+                sum2 += c_stat.PVB_CC(i, j, k);
+            }
+
+            if (sum1 > T)
+                c_stat.is_trained_dyn_model(i, j) = 1;
+
+            float diff = sum1 - Pbcc * sum2;
+
+            // Update stat table:
+            if (diff > T)
+            {
+                //new BG features are discovered
+                for (int k = 0; k < N1cc; ++k)
+                {
+                    const float PV_CC = c_stat.PV_CC(i, j, k);
+                    if (!PV_CC)
+                        break;
+
+                    c_stat.PVB_CC(i, j, k) = (PV_CC - Pbcc * c_stat.PVB_CC(i, j, k)) / (1.0f - Pbcc);
+                }
+            }
+
+            c_stat.Pbcc(i, j) = Pbcc;
+        }
+
+        // Handle "stationary" pixel:
+        if (!Ftd(i, j))
+        {
+            const float alpha = c_stat.is_trained_st_model(i, j) ? alpha2 : alpha3;
+
+            float Pbc = c_stat.Pbc(i, j);
+
+            //update Pb
+            Pbc *= (1.0f - alpha);
+            if (!foreground(i, j))
+            {
+                Pbc += alpha;
+            }
+
+            int min_dist = numeric_limits<int>::max();
+            int indx = -1;
+
+            CT curVal = curFrame(i, j);
+
+            //find best Vi match
+            for (int k = 0; k < N2c; ++k)
+            {
+                float PV_C = c_stat.PV_C(i, j, k);
+
+                if (PV_C < MIN_PV)
+                {
+                    c_stat.PV_C(i, j, k) = 0;
+                    c_stat.PVB_C(i, j, k) = 0;
+                    continue;
+                }
+
+                // Exponential decay of memory
+                c_stat.PV_C(i, j, k) = PV_C * (1.0f - alpha);
+                c_stat.PVB_C(i, j, k) = c_stat.PVB_C(i, j, k) * (1.0f - alpha);
+
+                OT v = c_stat.V_C<OT>(i, j, k);
+                int3 val = make_int3(
+                    ::abs(v.x - curVal.x),
+                    ::abs(v.y - curVal.y),
+                    ::abs(v.z - curVal.z)
+                );
+
+                int dist = val.x + val.y + val.z;
+
+                // Keep the closest entry whose every channel lies within deltaC.
+                if (dist < min_dist && val.x <= deltaC && val.y <= deltaC && val.z <= deltaC)
+                {
+                    min_dist = dist;
+                    indx = k;
+                }
+            }
+
+            if (indx < 0)
+            {
+                //N2th elem in the table is replaced by a new features
+                indx = N2c - 1;
+
+                c_stat.PV_C(i, j, indx) = alpha;
+                c_stat.PVB_C(i, j, indx) = alpha;
+
+                //update Vt
+                c_stat.V_C<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
+            }
+            else
+            {
+                //update
+                c_stat.PV_C(i, j, indx) += alpha;
+
+                if (!foreground(i, j))
+                {
+                    c_stat.PVB_C(i, j, indx) += alpha;
+                }
+            }
+
+            //re-sort Ct table by Pv
+            // Same insertion scheme as for the CC table above.
+            const float PV_C_indx = c_stat.PV_C(i, j, indx);
+            const float PVB_C_indx = c_stat.PVB_C(i, j, indx);
+            OT V_C_indx = c_stat.V_C<OT>(i, j, indx);
+            for (int k = 0; k < indx; ++k)
+            {
+                if (c_stat.PV_C(i, j, k) <= PV_C_indx)
+                {
+                    //shift elements
+                    float Pv_tmp1;
+                    float Pv_tmp2 = PV_C_indx;
+
+                    float Pvb_tmp1;
+                    float Pvb_tmp2 = PVB_C_indx;
+
+                    OT v_tmp1;
+                    OT v_tmp2 = V_C_indx;
+
+                    for (int l = k; l <= indx; ++l)
+                    {
+                        Pv_tmp1 = c_stat.PV_C(i, j, l);
+                        c_stat.PV_C(i, j, l) = Pv_tmp2;
+                        Pv_tmp2 = Pv_tmp1;
+
+                        Pvb_tmp1 = c_stat.PVB_C(i, j, l);
+                        c_stat.PVB_C(i, j, l) = Pvb_tmp2;
+                        Pvb_tmp2 = Pvb_tmp1;
+
+                        v_tmp1 = c_stat.V_C<OT>(i, j, l);
+                        c_stat.V_C<OT>(i, j, l) = v_tmp2;
+                        v_tmp2 = v_tmp1;
+                    }
+
+                    break;
+                }
+            }
+
+            // Check "once-off" changes:
+            float sum1 = 0.0f;
+            float sum2 = 0.0f;
+            for (int k = 0; k < N1c; ++k)
+            {
+                const float PV_C = c_stat.PV_C(i, j, k);
+                if (!PV_C)
+                    break;
+
+                sum1 += PV_C;
+                sum2 += c_stat.PVB_C(i, j, k);
+            }
+
+            if (sum1 > T)
+                c_stat.is_trained_st_model(i, j) = 1;
+
+            float diff = sum1 - Pbc * sum2;
+
+            // Update stat table:
+            if (diff > T)
+            {
+                //new BG features are discovered
+                for (int k = 0; k < N1c; ++k)
+                {
+                    const float PV_C = c_stat.PV_C(i, j, k);
+                    if (!PV_C)
+                        break;
+
+                    c_stat.PVB_C(i, j, k) = (PV_C - Pbc * c_stat.PVB_C(i, j, k)) / (1.0f - Pbc);
+                }
+
+                c_stat.Pbc(i, j) = 1.0f - Pbc;
+            }
+            else
+            {
+                c_stat.Pbc(i, j) = Pbc;
+            }
+        } // if !(change detection) at pixel (i,j)
+
+        // Update the reference BG image:
+        if (!foreground(i, j))
+        {
+            CT curVal = curFrame(i, j);
+
+            if (!Ftd(i, j) && !Fbd(i, j))
+            {
+                // Apply IIR filter:
+                OT oldVal = background(i, j);
+
+                int3 newVal = make_int3(
+                    __float2int_rn(oldVal.x * (1.0f - alpha1) + curVal.x * alpha1),
+                    __float2int_rn(oldVal.y * (1.0f - alpha1) + curVal.y * alpha1),
+                    __float2int_rn(oldVal.z * (1.0f - alpha1) + curVal.z * alpha1)
+                );
+
+                background(i, j) = Output<OT>::make(
+                    static_cast<uchar>(newVal.x),
+                    static_cast<uchar>(newVal.y),
+                    static_cast<uchar>(newVal.z)
+                );
+            }
+            else
+            {
+                // A change mask is set for this background pixel: take the current value as is.
+                background(i, j) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
+            }
+        }
+    }
+
+    // Launcher for the updateBackgroundModel kernel, instantiated with plain PtrStep_ accessors
+    // for the previous frame, the current frame and both change masks.
+    template <typename PT, typename CT, typename OT>
+    struct UpdateBackgroundModel
+    {
+        // Covers the frame with 32x8 thread blocks, launches the kernel on the given stream and
+        // waits for completion only when the default (0) stream is used.
+        static void call(DevMem2D_<PT> prevFrame, DevMem2D_<CT> curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2D_<OT> background,
+                         int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
+                         cudaStream_t stream)
+        {
+            const int frameCols = prevFrame.cols;
+            const int frameRows = prevFrame.rows;
+
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(frameCols, threads.x), divUp(frameRows, threads.y));
+
+            cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep_<PT>, PtrStep_<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
+
+            updateBackgroundModel<PT, CT, OT, PtrStep_<PT>, PtrStep_<CT>, PtrStepb, PtrStepb><<<blocks, threads, 0, stream>>>(
+                frameCols, frameRows,
+                prevFrame, curFrame,
+                Ftd, Fbd, foreground, background,
+                deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T);
+            cudaSafeCall( cudaGetLastError() );
+
+            // Other streams are left for the caller to synchronize.
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+    };
+
+    // Untyped entry point: views the byte images as PT / CT / OT images and forwards
+    // everything to UpdateBackgroundModel<PT, CT, OT>::call.
+    template <typename PT, typename CT, typename OT>
+    void updateBackgroundModel_gpu(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background,
+                                   int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
+                                   cudaStream_t stream)
+    {
+        DevMem2D_<PT> prevTyped(prevFrame);
+        DevMem2D_<CT> curTyped(curFrame);
+        DevMem2D_<OT> backgroundTyped(background);
+
+        UpdateBackgroundModel<PT, CT, OT>::call(
+            prevTyped, curTyped, Ftd, Fbd, foreground, backgroundTyped,
+            deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
+    }
+
+    // Explicit instantiations of updateBackgroundModel_gpu for every combination of uchar3 / uchar4
+    // as the previous-frame (PT), current-frame (CT) and output (OT) pixel types.
+    template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+    template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(DevMem2Db prevFrame, DevMem2Db curFrame, DevMem2Db Ftd, DevMem2Db Fbd, DevMem2Db foreground, DevMem2Db background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+}
--- a/modules/gpu/src/cuda/fgd_bgfg_common.hpp
+++ b/modules/gpu/src/cuda/fgd_bgfg_common.hpp
@@ -0,0 +1,147 @@
+#ifndef __FGD_BGFG_COMMON_HPP__
+#define __FGD_BGFG_COMMON_HPP__
+
+#include "opencv2/core/devmem2d.hpp"
+
+namespace bgfg
+{
+    // Plain bundle of raw pointers and row strides describing the per-pixel background statistics.
+    // Each buffer is stored as a byte pointer plus a step that the accessors below apply as a
+    // byte offset between rows. The accessor member functions are declared only when compiling
+    // with nvcc (__CUDACC__), so host-only translation units see just the data members.
+    struct BGPixelStat
+    {
+    public:
+#ifdef __CUDACC__
+        // Per-pixel scalars, indexed by (row i, column j).
+        __device__ float& Pbc(int i, int j);
+        __device__ float& Pbcc(int i, int j);
+
+        // Per-pixel "model is trained" flags, indexed by (row i, column j).
+        __device__ unsigned char& is_trained_st_model(int i, int j);
+        __device__ unsigned char& is_trained_dyn_model(int i, int j);
+
+        // C table: entry k of pixel (i, j).
+        __device__ float& PV_C(int i, int j, int k);
+        __device__ float& PVB_C(int i, int j, int k);
+        template <typename T> __device__ T& V_C(int i, int j, int k);
+
+        // CC table: entry k of pixel (i, j).
+        __device__ float& PV_CC(int i, int j, int k);
+        __device__ float& PVB_CC(int i, int j, int k);
+        template <typename T> __device__ T& V1_CC(int i, int j, int k);
+        template <typename T> __device__ T& V2_CC(int i, int j, int k);
+#endif
+
+        // Number of rows in one table plane; the table accessors address entry k of row i
+        // at row (k * rows_ + i) of the corresponding buffer.
+        int rows_;
+
+        unsigned char* Pbc_data_;
+        size_t Pbc_step_;
+
+        unsigned char* Pbcc_data_;
+        size_t Pbcc_step_;
+
+        unsigned char* is_trained_st_model_data_;
+        size_t is_trained_st_model_step_;
+
+        unsigned char* is_trained_dyn_model_data_;
+        size_t is_trained_dyn_model_step_;
+
+        // C table buffers (accessed through PV_C, PVB_C and V_C).
+        unsigned char* ctable_Pv_data_;
+        size_t ctable_Pv_step_;
+
+        unsigned char* ctable_Pvb_data_;
+        size_t ctable_Pvb_step_;
+
+        unsigned char* ctable_v_data_;
+        size_t ctable_v_step_;
+
+        // CC table buffers (accessed through PV_CC, PVB_CC, V1_CC and V2_CC).
+        unsigned char* cctable_Pv_data_;
+        size_t cctable_Pv_step_;
+
+        unsigned char* cctable_Pvb_data_;
+        size_t cctable_Pvb_step_;
+
+        unsigned char* cctable_v1_data_;
+        size_t cctable_v1_step_;
+
+        unsigned char* cctable_v2_data_;
+        size_t cctable_v2_step_;
+    };
+
+#ifdef __CUDACC__
+    // Per-pixel accessors: locate row i by its byte step, then index column j within the row.
+    __device__ __forceinline__ float& BGPixelStat::Pbc(int i, int j)
+    {
+        float* const row = (float*)(Pbc_data_ + i * Pbc_step_);
+        return row[j];
+    }
+
+    __device__ __forceinline__ float& BGPixelStat::Pbcc(int i, int j)
+    {
+        float* const row = (float*)(Pbcc_data_ + i * Pbcc_step_);
+        return row[j];
+    }
+
+    __device__ __forceinline__ unsigned char& BGPixelStat::is_trained_st_model(int i, int j)
+    {
+        unsigned char* const row = is_trained_st_model_data_ + i * is_trained_st_model_step_;
+        return row[j];
+    }
+
+    __device__ __forceinline__ unsigned char& BGPixelStat::is_trained_dyn_model(int i, int j)
+    {
+        unsigned char* const row = is_trained_dyn_model_data_ + i * is_trained_dyn_model_step_;
+        return row[j];
+    }
+
+    // C table accessors: entry k of pixel (i, j) lives at row (k * rows_ + i), column j.
+    __device__ __forceinline__ float& BGPixelStat::PV_C(int i, int j, int k)
+    {
+        const int planeRow = (k * rows_) + i;
+        float* const row = (float*)(ctable_Pv_data_ + planeRow * ctable_Pv_step_);
+        return row[j];
+    }
+
+    __device__ __forceinline__ float& BGPixelStat::PVB_C(int i, int j, int k)
+    {
+        const int planeRow = (k * rows_) + i;
+        float* const row = (float*)(ctable_Pvb_data_ + planeRow * ctable_Pvb_step_);
+        return row[j];
+    }
+
+    template <typename T> __device__ __forceinline__ T& BGPixelStat::V_C(int i, int j, int k)
+    {
+        const int planeRow = (k * rows_) + i;
+        T* const row = (T*)(ctable_v_data_ + planeRow * ctable_v_step_);
+        return row[j];
+    }
+
+    // CC table accessors: entry k of pixel (i, j) lives at row (k * rows_ + i), column j.
+    __device__ __forceinline__ float& BGPixelStat::PV_CC(int i, int j, int k)
+    {
+        const int planeRow = (k * rows_) + i;
+        float* const row = (float*)(cctable_Pv_data_ + planeRow * cctable_Pv_step_);
+        return row[j];
+    }
+
+    __device__ __forceinline__ float& BGPixelStat::PVB_CC(int i, int j, int k)
+    {
+        const int planeRow = (k * rows_) + i;
+        float* const row = (float*)(cctable_Pvb_data_ + planeRow * cctable_Pvb_step_);
+        return row[j];
+    }
+
+    template <typename T> __device__ __forceinline__ T& BGPixelStat::V1_CC(int i, int j, int k)
+    {
+        const int planeRow = (k * rows_) + i;
+        T* const row = (T*)(cctable_v1_data_ + planeRow * cctable_v1_step_);
+        return row[j];
+    }
+
+    template <typename T> __device__ __forceinline__ T& BGPixelStat::V2_CC(int i, int j, int k)
+    {
+        const int planeRow = (k * rows_) + i;
+        T* const row = (T*)(cctable_v2_data_ + planeRow * cctable_v2_step_);
+        return row[j];
+    }
+#endif
+
+    // Histogram sizing constants visible to every file that includes this header.
+    // NOTE(review): presumably the number of partial histograms accumulated before merging and
+    // the number of bins per histogram (one per 8-bit difference value) - confirm against the
+    // kernels in fgd_bgfg.cu.
+    const int PARTIAL_HISTOGRAM_COUNT = 240;
+    const int HISTOGRAM_BIN_COUNT = 256;
+
+    // Host entry points; only declared here. PT / CT / OT are the pixel types of the previous
+    // frame, the current frame and the output respectively; all images are passed as untyped
+    // byte matrices.
+
+    // Fills three histograms (hist0..hist2) using three partial buffers.
+    // NOTE(review): presumably one histogram per colour channel of |cur - prev|; "cc" is not
+    // explained by anything visible here - confirm against the implementation.
+    template <typename PT, typename CT>
+    void calcDiffHistogram_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
+                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
+                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
+                               int cc, cudaStream_t stream);
+
+    // Writes changeMask from the two frames and a per-channel threshold triple (bestThres).
+    template <typename PT, typename CT>
+    void calcDiffThreshMask_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, uchar3 bestThres, cv::gpu::DevMem2Db changeMask, cudaStream_t stream);
+
+    // Installs the statistics descriptor used by the kernels.
+    // NOTE(review): presumably copies it into the device-side c_stat object - confirm.
+    void setBGPixelStat(const BGPixelStat& stat);
+
+    // Launches the foreground classification kernel on the given stream.
+    template <typename PT, typename CT, typename OT>
+    void bgfgClassification_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
+                                cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd, cv::gpu::DevMem2Db foreground,
+                                int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+
+    // Launches the background model update kernel on the given stream.
+    template <typename PT, typename CT, typename OT>
+    void updateBackgroundModel_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
+                                   cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd, cv::gpu::DevMem2Db foreground, cv::gpu::DevMem2Db background,
+                                   int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
+                                   cudaStream_t stream);
+}
+
+#endif // __FGD_BGFG_COMMON_HPP__
--- a/modules/gpu/src/fgd_bgfg.cpp
+++ b/modules/gpu/src/fgd_bgfg.cpp
@@ -0,0 +1,755 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifndef HAVE_CUDA
+
+// Stubs compiled when HAVE_CUDA is not defined. Every entry point that would need the GPU
+// calls throw_nogpu(); the destructor and release() do nothing.
+class cv::gpu::FGDStatModel::Impl
+{
+};
+
+cv::gpu::FGDStatModel::Params::Params() { throw_nogpu(); }
+
+cv::gpu::FGDStatModel::FGDStatModel(int) { throw_nogpu(); }
+cv::gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat&, const Params&, int) { throw_nogpu(); }
+cv::gpu::FGDStatModel::~FGDStatModel() {}
+void cv::gpu::FGDStatModel::create(const cv::gpu::GpuMat&, const Params&) { throw_nogpu(); }
+void cv::gpu::FGDStatModel::release() {}
+// The trailing "return 0" only satisfies the int return type after throw_nogpu().
+int cv::gpu::FGDStatModel::update(const cv::gpu::GpuMat&) { throw_nogpu(); return 0; }
+
+#else
+
+#include "fgd_bgfg_common.hpp"
+
+namespace
+{
+    // Host-side owner of the GPU buffers that hold the per-pixel background statistics.
+    // Converts to bgfg::BGPixelStat, the raw pointer/step descriptor declared in
+    // fgd_bgfg_common.hpp.
+    class BGPixelStat
+    {
+    public:
+        // Allocates every buffer for a frame of the given size and fills it with zeros.
+        void create(cv::Size size, const cv::gpu::FGDStatModel::Params& params, int out_cn);
+        // Releases every buffer.
+        void release();
+
+        // Sets both "model is trained" flag images to 1 for every pixel.
+        void setTrained();
+
+        // Builds the raw pointer/step descriptor for the current buffers.
+        operator bgfg::BGPixelStat();
+
+    private:
+        // Frame-sized single-channel buffers.
+        cv::gpu::GpuMat Pbc_;
+        cv::gpu::GpuMat Pbcc_;
+        cv::gpu::GpuMat is_trained_st_model_;
+        cv::gpu::GpuMat is_trained_dyn_model_;
+
+        // C table: params.N2c frame-sized planes stacked along the row axis.
+        cv::gpu::GpuMat ctable_Pv_;
+        cv::gpu::GpuMat ctable_Pvb_;
+        cv::gpu::GpuMat ctable_v_;
+
+        // CC table: params.N2cc frame-sized planes stacked along the row axis.
+        cv::gpu::GpuMat cctable_Pv_;
+        cv::gpu::GpuMat cctable_Pvb_;
+        cv::gpu::GpuMat cctable_v1_;
+        cv::gpu::GpuMat cctable_v2_;
+    };
+
+    // Sizes every statistics buffer for a frame of `size` and zero-fills it.
+    // The C / CC tables hold params.N2c / params.N2cc frame-sized planes stacked along the row
+    // axis; the colour tables use out_cn 8-bit channels, the probability tables one float.
+    void BGPixelStat::create(cv::Size size, const cv::gpu::FGDStatModel::Params& params, int out_cn)
+    {
+        const cv::Scalar zeros = cv::Scalar::all(0);
+
+        const int ctableRows = params.N2c * size.height;
+        const int cctableRows = params.N2cc * size.height;
+        const int colorType = CV_8UC(out_cn);
+
+        // Per-pixel probabilities.
+        cv::gpu::ensureSizeIsEnough(size, CV_32FC1, Pbc_);
+        Pbc_.setTo(zeros);
+
+        cv::gpu::ensureSizeIsEnough(size, CV_32FC1, Pbcc_);
+        Pbcc_.setTo(zeros);
+
+        // Per-pixel "model is trained" flags.
+        cv::gpu::ensureSizeIsEnough(size, CV_8UC1, is_trained_st_model_);
+        is_trained_st_model_.setTo(zeros);
+
+        cv::gpu::ensureSizeIsEnough(size, CV_8UC1, is_trained_dyn_model_);
+        is_trained_dyn_model_.setTo(zeros);
+
+        // C table.
+        cv::gpu::ensureSizeIsEnough(ctableRows, size.width, CV_32FC1, ctable_Pv_);
+        ctable_Pv_.setTo(zeros);
+
+        cv::gpu::ensureSizeIsEnough(ctableRows, size.width, CV_32FC1, ctable_Pvb_);
+        ctable_Pvb_.setTo(zeros);
+
+        cv::gpu::ensureSizeIsEnough(ctableRows, size.width, colorType, ctable_v_);
+        ctable_v_.setTo(zeros);
+
+        // CC table.
+        cv::gpu::ensureSizeIsEnough(cctableRows, size.width, CV_32FC1, cctable_Pv_);
+        cctable_Pv_.setTo(zeros);
+
+        cv::gpu::ensureSizeIsEnough(cctableRows, size.width, CV_32FC1, cctable_Pvb_);
+        cctable_Pvb_.setTo(zeros);
+
+        cv::gpu::ensureSizeIsEnough(cctableRows, size.width, colorType, cctable_v1_);
+        cctable_v1_.setTo(zeros);
+
+        cv::gpu::ensureSizeIsEnough(cctableRows, size.width, colorType, cctable_v2_);
+        cctable_v2_.setTo(zeros);
+    }
+
+    // Releases every statistics buffer, in declaration order.
+    void BGPixelStat::release()
+    {
+        cv::gpu::GpuMat* const buffers[] =
+        {
+            &Pbc_, &Pbcc_, &is_trained_st_model_, &is_trained_dyn_model_,
+            &ctable_Pv_, &ctable_Pvb_, &ctable_v_,
+            &cctable_Pv_, &cctable_Pvb_, &cctable_v1_, &cctable_v2_
+        };
+
+        const int bufferCount = static_cast<int>(sizeof(buffers) / sizeof(buffers[0]));
+        for (int n = 0; n < bufferCount; ++n)
+            buffers[n]->release();
+    }
+
+    // Marks both the stationary and the dynamic model as trained for every pixel.
+    void BGPixelStat::setTrained()
+    {
+        const cv::Scalar trainedFlag = cv::Scalar::all(1);
+
+        is_trained_st_model_.setTo(trainedFlag);
+        is_trained_dyn_model_.setTo(trainedFlag);
+    }
+
+    // Builds the raw descriptor consumed by the CUDA side: for each buffer, its data pointer
+    // and row step; rows_ is taken from the frame-sized Pbc_ buffer.
+    BGPixelStat::operator bgfg::BGPixelStat()
+    {
+        bgfg::BGPixelStat view;
+
+        view.rows_ = Pbc_.rows;
+
+        // Per-pixel probabilities.
+        view.Pbc_data_ = Pbc_.data;
+        view.Pbc_step_ = Pbc_.step;
+        view.Pbcc_data_ = Pbcc_.data;
+        view.Pbcc_step_ = Pbcc_.step;
+
+        // Per-pixel "model is trained" flags.
+        view.is_trained_st_model_data_ = is_trained_st_model_.data;
+        view.is_trained_st_model_step_ = is_trained_st_model_.step;
+        view.is_trained_dyn_model_data_ = is_trained_dyn_model_.data;
+        view.is_trained_dyn_model_step_ = is_trained_dyn_model_.step;
+
+        // C table.
+        view.ctable_Pv_data_ = ctable_Pv_.data;
+        view.ctable_Pv_step_ = ctable_Pv_.step;
+        view.ctable_Pvb_data_ = ctable_Pvb_.data;
+        view.ctable_Pvb_step_ = ctable_Pvb_.step;
+        view.ctable_v_data_ = ctable_v_.data;
+        view.ctable_v_step_ = ctable_v_.step;
+
+        // CC table.
+        view.cctable_Pv_data_ = cctable_Pv_.data;
+        view.cctable_Pv_step_ = cctable_Pv_.step;
+        view.cctable_Pvb_data_ = cctable_Pvb_.data;
+        view.cctable_Pvb_step_ = cctable_Pvb_.step;
+        view.cctable_v1_data_ = cctable_v1_.data;
+        view.cctable_v1_step_ = cctable_v1_.step;
+        view.cctable_v2_data_ = cctable_v2_.data;
+        view.cctable_v2_step_ = cctable_v2_.step;
+
+        return view;
+    }
+}
+
+// CUDA-backed implementation object behind FGDStatModel. It keeps references to the
+// background image, foreground mask and foreground region list passed to its constructor,
+// and owns the remaining working buffers.
+class cv::gpu::FGDStatModel::Impl
+{
+public:
+    // Binds the output objects; out_cn must be 3 or 4 (asserted in the constructor).
+    Impl(cv::gpu::GpuMat& background, cv::gpu::GpuMat& foreground, std::vector< std::vector<cv::Point> >& foreground_regions, int out_cn);
+    ~Impl();
+
+    void create(const cv::gpu::GpuMat& firstFrame, const cv::gpu::FGDStatModel::Params& params);
+    void release();
+
+    int update(const cv::gpu::GpuMat& curFrame);
+
+private:
+    // Copy operations are declared private; no definition is visible here, which makes the
+    // class non-copyable for outside code.
+    Impl(const Impl&);
+    Impl& operator=(const Impl&);
+
+    // Channel count requested for the output (3 or 4).
+    int out_cn_;
+
+    cv::gpu::FGDStatModel::Params params_;
+
+    // References to objects supplied by the caller of the constructor.
+    cv::gpu::GpuMat& background_;
+    cv::gpu::GpuMat& foreground_;
+    std::vector< std::vector<cv::Point> >& foreground_regions_;
+
+    // Host-side (cv::Mat) image. NOTE(review): presumably a download of foreground_ - confirm.
+    cv::Mat h_foreground_;
+
+    cv::gpu::GpuMat prevFrame_;
+    cv::gpu::GpuMat Ftd_;
+    cv::gpu::GpuMat Fbd_;
+    BGPixelStat stat_;
+
+    cv::gpu::GpuMat hist_;
+    cv::gpu::GpuMat histBuf_;
+
+    cv::gpu::GpuMat countBuf_;
+
+    cv::gpu::GpuMat buf_;
+    cv::gpu::GpuMat filterBuf_;
+    cv::gpu::GpuMat filterBrd_;
+
+    cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
+    cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
+
+    // Memory storage created in the constructor and released in the destructor.
+    CvMemStorage* storage_;
+};
+
+// Stores the output references and the requested background channel count,
+// then allocates the memory storage later used for contour extraction.
+cv::gpu::FGDStatModel::Impl::Impl(cv::gpu::GpuMat& background, cv::gpu::GpuMat& foreground, std::vector< std::vector<cv::Point> >& foreground_regions, int out_cn) :
+    out_cn_(out_cn), background_(background), foreground_(foreground), foreground_regions_(foreground_regions)
+{
+    // Validate before allocating so a bad out_cn never leaves a storage behind.
+    CV_Assert( out_cn_ == 3 || out_cn_ == 4 );
+
+    storage_ = cvCreateMemStorage();
+    CV_Assert( storage_ != 0 );
+}
+
+// Releases the contour memory storage created in the constructor.
+cv::gpu::FGDStatModel::Impl::~Impl()
+{
+    cvReleaseMemStorage(&storage_);
+}
+
+namespace
+{
+    // Copies src into dst, converting the channel count on the way.
+    //
+    // src    - source image.
+    // dst    - destination; sized via ensureSizeIsEnough to src.size() with
+    //          src's depth and dst_cn channels.
+    // dst_cn - requested channel count; a negative value keeps the source
+    //          channel count.
+    //
+    // When the channel counts match this is a plain copy. Otherwise only the
+    // conversions listed in the table below are supported (1<->3, 1<->4,
+    // 3<->4); any other combination fails a CV_Assert in every build type.
+    void copyChannels(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, int dst_cn = -1)
+    {
+        const int src_cn = src.channels();
+
+        if (dst_cn < 0)
+            dst_cn = src_cn;
+
+        cv::gpu::ensureSizeIsEnough(src.size(), CV_MAKE_TYPE(src.depth(), dst_cn), dst);
+
+        if (src_cn == dst_cn)
+            src.copyTo(dst);
+        else
+        {
+            static const int cvt_codes[4][4] =
+            {
+                {-1, -1, cv::COLOR_GRAY2BGR, cv::COLOR_GRAY2BGRA},
+                {-1, -1, -1, -1},
+                {cv::COLOR_BGR2GRAY, -1, -1, cv::COLOR_BGR2BGRA},
+                {cv::COLOR_BGRA2GRAY, -1, cv::COLOR_BGRA2BGR, -1}
+            };
+
+            // Both counts index the 4x4 table, so they must be range-checked
+            // before the lookup, not only in debug builds.
+            CV_Assert( src_cn >= 1 && src_cn <= 4 );
+            CV_Assert( dst_cn >= 1 && dst_cn <= 4 );
+
+            const int cvt_code = cvt_codes[src_cn - 1][dst_cn - 1];
+            CV_Assert( cvt_code >= 0 );
+
+            cv::gpu::cvtColor(src, dst, cvt_code, dst_cn);
+        }
+    }
+}
+
+// Initialises the model from its first frame.
+//
+// firstFrame - CV_8UC3 or CV_8UC4 image; it becomes both the initial
+//              background (converted to out_cn_ channels) and the stored
+//              previous frame.
+// params     - algorithm parameters, copied into params_.
+void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, const cv::gpu::FGDStatModel::Params& params)
+{
+    CV_Assert(firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
+
+    params_ = params;
+
+    const cv::Size frameSize = firstFrame.size();
+
+    // Foreground mask: one byte per pixel.
+    cv::gpu::ensureSizeIsEnough(frameSize, CV_8UC1, foreground_);
+
+    // The first frame seeds the background and the "previous frame" slot.
+    copyChannels(firstFrame, background_, out_cn_);
+    copyChannels(firstFrame, prevFrame_);
+
+    // Change masks, also one byte per pixel.
+    cv::gpu::ensureSizeIsEnough(frameSize, CV_8UC1, Ftd_);
+    cv::gpu::ensureSizeIsEnough(frameSize, CV_8UC1, Fbd_);
+
+    stat_.create(frameSize, params_, out_cn_);
+    bgfg::setBGPixelStat(stat_);
+
+    // Morphology engines are only needed when smoothing is enabled.
+    const int radius = params_.perform_morphing;
+    if (radius > 0)
+    {
+        const int ksize = 1 + radius * 2;
+
+        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(ksize, ksize));
+        cv::Point anchor(radius, radius);
+
+        dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
+        erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
+    }
+}
+
+// Releases every buffer held by the model: the output images, the per-frame
+// working buffers, the host staging copy of the foreground mask and the
+// morphology filter engines. The contour memory storage is kept; it is owned
+// by the constructor/destructor pair. create() must be called again before
+// the next update().
+void cv::gpu::FGDStatModel::Impl::release()
+{
+    background_.release();
+    foreground_.release();
+
+    // Host-side copy of the foreground mask used during contour extraction.
+    h_foreground_.release();
+
+    prevFrame_.release();
+    Ftd_.release();
+    Fbd_.release();
+    stat_.release();
+
+    hist_.release();
+    histBuf_.release();
+
+    countBuf_.release();
+
+    buf_.release();
+    filterBuf_.release();
+    filterBrd_.release();
+
+    // Drop the filter engines as well; create() rebuilds them whenever
+    // perform_morphing > 0, and update() only uses them in that case.
+    dilateFilter_.release();
+    erodeFilter_.release();
+}
+
+/////////////////////////////////////////////////////////////////////////
+// changeDetection
+
+namespace
+{
+    // Runs the bgfg::calcDiffHistogram_gpu instantiation that matches the
+    // channel counts of the two frames (3 or 4 channels each; other counts
+    // have no table entry).
+    //
+    // hist    - (re)created as 3 x 256, CV_32SC1: one row per histogram.
+    // histBuf - (re)created as 3 rows of partial-histogram scratch space.
+    void calcDiffHistogram(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
+    {
+        typedef void (*func_t)(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, int cc, cudaStream_t stream);
+
+        // Indexed by [prevFrame channels - 1][curFrame channels - 1].
+        static const func_t funcs[4][4] =
+        {
+            {0,0,0,0},
+            {0,0,0,0},
+            {0,0,bgfg::calcDiffHistogram_gpu<uchar3, uchar3>,bgfg::calcDiffHistogram_gpu<uchar3, uchar4>},
+            {0,0,bgfg::calcDiffHistogram_gpu<uchar4, uchar3>,bgfg::calcDiffHistogram_gpu<uchar4, uchar4>}
+        };
+
+        hist.create(3, 256, CV_32SC1);
+        histBuf.create(3, bgfg::PARTIAL_HISTOGRAM_COUNT * bgfg::HISTOGRAM_BIN_COUNT, CV_32SC1);
+
+        // Compute capability of the current device, encoded as major * 10 + minor.
+        cv::gpu::DeviceInfo devInfo;
+        int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
+
+        const func_t func = funcs[prevFrame.channels() - 1][curFrame.channels() - 1];
+
+        func(prevFrame, curFrame,
+             hist.ptr<unsigned int>(0), hist.ptr<unsigned int>(1), hist.ptr<unsigned int>(2),
+             histBuf.ptr<unsigned int>(0), histBuf.ptr<unsigned int>(1), histBuf.ptr<unsigned int>(2),
+             cc, 0);
+    }
+
+    // For each of the three histograms and each threshold t in
+    // [0, HISTOGRAM_BIN_COUNT - 2], computes the standard deviation of the bin
+    // indices j >= t weighted by hist, i.e. sqrt(E[j^2] - E[j]^2) over the
+    // histogram tail starting at t.
+    //
+    // hist             - three consecutive 256-bin histograms.
+    // relativeVariance - output; entry [c][t] receives the value for
+    //                    histogram c and threshold t. The last column is left
+    //                    at 0.
+    //
+    // The tail sums are accumulated once while walking the bins from the top
+    // down, instead of being recomputed from scratch for every threshold.
+    // Every term is an integer-valued double, so as long as the partial sums
+    // stay below 2^53 (assumed: the bins hold per-image pixel counts) the
+    // summation order cannot change the result.
+    void calcRelativeVariance(unsigned int hist[3 * 256], double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT])
+    {
+        std::memset(relativeVariance, 0, 3 * bgfg::HISTOGRAM_BIN_COUNT * sizeof(double));
+
+        // Running sums over bins [j, HISTOGRAM_BIN_COUNT).
+        cv::Vec3d sum(0.0, 0.0, 0.0);
+        cv::Vec3d sqsum(0.0, 0.0, 0.0);
+        cv::Vec3i count(0, 0, 0);
+
+        for (int j = bgfg::HISTOGRAM_BIN_COUNT - 1; j >= 0; --j)
+        {
+            for (int c = 0; c < 3; ++c)
+            {
+                const unsigned int binCount = hist[j + c * 256];
+
+                sum[c]   += static_cast<double>(j) * binCount;
+                sqsum[c] += static_cast<double>(j * j) * binCount;
+                count[c] += binCount;
+            }
+
+            // The top bin only seeds the running sums; its output stays 0.
+            if (j == bgfg::HISTOGRAM_BIN_COUNT - 1)
+                continue;
+
+            for (int c = 0; c < 3; ++c)
+            {
+                // Clamp a local copy so an empty tail divides by 1 without
+                // disturbing the running count.
+                const int cnt = std::max(count[c], 1);
+                const double my = sum[c] / cnt;
+
+                relativeVariance[c][j] = std::sqrt(sqsum[c] / cnt - my * my);
+            }
+        }
+    }
+
+    // Clears changeMask and then runs the bgfg::calcDiffThreshMask_gpu
+    // instantiation matching the channel counts of the two frames (3 or 4
+    // channels each). bestThres is passed to the kernel as a uchar3.
+    void calcDiffThreshMask(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::Vec3d bestThres, cv::gpu::GpuMat& changeMask)
+    {
+        typedef void (*func_t)(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, uchar3 bestThres, cv::gpu::DevMem2Db changeMask, cudaStream_t stream);
+
+        // Indexed by [prevFrame channels - 1][curFrame channels - 1].
+        static const func_t funcs[4][4] =
+        {
+            {0,0,0,0},
+            {0,0,0,0},
+            {0,0,bgfg::calcDiffThreshMask_gpu<uchar3, uchar3>,bgfg::calcDiffThreshMask_gpu<uchar3, uchar4>},
+            {0,0,bgfg::calcDiffThreshMask_gpu<uchar4, uchar3>,bgfg::calcDiffThreshMask_gpu<uchar4, uchar4>}
+        };
+
+        changeMask.setTo(cv::Scalar::all(0));
+
+        const uchar3 thres = make_uchar3(bestThres[0], bestThres[1], bestThres[2]);
+        const func_t func = funcs[prevFrame.channels() - 1][curFrame.channels() - 1];
+
+        func(prevFrame, curFrame, thres, changeMask, 0);
+    }
+
+    // Change detection step of the foreground detection algorithm:
+    //   1. build the difference histograms of the two frames on the device,
+    //   2. download them and compute the per-threshold deviations on the host,
+    //   3. take the largest deviation per channel (never below 10) as the
+    //      threshold,
+    //   4. produce changeMask on the device with those thresholds.
+    void changeDetection(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& changeMask, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
+    {
+        calcDiffHistogram(prevFrame, curFrame, hist, histBuf);
+
+        // h_hist wraps the stack array, so the download is expected to land in histData.
+        unsigned int histData[3 * 256];
+        cv::Mat h_hist(3, 256, CV_32SC1, histData);
+        hist.download(h_hist);
+
+        double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT];
+        calcRelativeVariance(histData, relativeVariance);
+
+        // Per-channel maximum, with 10 as the floor.
+        cv::Vec3d bestThres(10.0, 10.0, 10.0);
+        for (int c = 0; c < 3; ++c)
+        {
+            for (int bin = 0; bin < bgfg::HISTOGRAM_BIN_COUNT; ++bin)
+                bestThres[c] = std::max(bestThres[c], relativeVariance[c][bin]);
+        }
+
+        calcDiffThreshMask(prevFrame, curFrame, bestThres, changeMask);
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+// bgfgClassification
+
+namespace
+{
+    // Runs the bgfg::bgfgClassification_gpu instantiation selected by the
+    // channel counts of prevFrame, curFrame and out_cn (3 or 4 each), then
+    // counts the non-zero foreground pixels and scales the mask by 255.
+    //
+    // Returns the number of non-zero foreground pixels, counted before the
+    // mask is scaled.
+    int bgfgClassification(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame,
+                           const cv::gpu::GpuMat& Ftd, const cv::gpu::GpuMat& Fbd,
+                           cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& countBuf,
+                           const cv::gpu::FGDStatModel::Params& params, int out_cn)
+    {
+        typedef void (*func_t)(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd, cv::gpu::DevMem2Db foreground,
+                               int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
+
+        // Indexed by [prevFrame channels - 1][curFrame channels - 1][out_cn - 1].
+        static const func_t funcs[4][4][4] =
+        {
+            {
+                {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
+            },
+            {
+                {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
+            },
+            {
+                {0,0,0,0}, {0,0,0,0},
+                {0,0,bgfg::bgfgClassification_gpu<uchar3, uchar3, uchar3>,bgfg::bgfgClassification_gpu<uchar3, uchar3, uchar4>},
+                {0,0,bgfg::bgfgClassification_gpu<uchar3, uchar4, uchar3>,bgfg::bgfgClassification_gpu<uchar3, uchar4, uchar4>}
+            },
+            {
+                {0,0,0,0}, {0,0,0,0},
+                {0,0,bgfg::bgfgClassification_gpu<uchar4, uchar3, uchar3>,bgfg::bgfgClassification_gpu<uchar4, uchar3, uchar4>},
+                {0,0,bgfg::bgfgClassification_gpu<uchar4, uchar4, uchar3>,bgfg::bgfgClassification_gpu<uchar4, uchar4, uchar4>}
+            }
+        };
+
+        // delta scaled by 256 / L for each of the two parameter sets.
+        const int deltaC  = cvRound(params.delta * 256 / params.Lc);
+        const int deltaCC = cvRound(params.delta * 256 / params.Lcc);
+
+        const func_t classify = funcs[prevFrame.channels() - 1][curFrame.channels() - 1][out_cn - 1];
+        classify(prevFrame, curFrame, Ftd, Fbd, foreground, deltaC, deltaCC, params.alpha2, params.N1c, params.N1cc, 0);
+
+        const int nonZero = cv::gpu::countNonZero(foreground, countBuf);
+
+        // Scale the mask in place after counting.
+        cv::gpu::multiply(foreground, cv::Scalar::all(255), foreground);
+
+        return nonZero;
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+// smoothForeground
+
+namespace
+{
+    // Applies one morphology filter to src with an explicit constant border:
+    // src is first copied into filterBrd with brd extra pixels of brdVal on
+    // every side, then the filter runs on the interior of that padded image.
+    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
+    {
+        cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
+
+        // Interior of the padded buffer, i.e. where the original pixels live.
+        const cv::Rect interior(brd, brd, src.cols, src.rows);
+        // Region of that interior to process: all of it.
+        const cv::Rect roi(0, 0, src.cols, src.rows);
+
+        filter->apply(filterBrd(interior), dst, roi);
+    }
+
+    // Smooths the foreground mask in place with a morphological opening
+    // (erode, dilate) followed by a closing (dilate, erode). buf holds the
+    // intermediate result of each pair; params.perform_morphing is used as
+    // the border width.
+    void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
+                          cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
+                          const cv::gpu::FGDStatModel::Params& params)
+    {
+        const int border = params.perform_morphing;
+
+        // Border values: UCHAR_MAX for erosion, 0 for dilation.
+        const cv::Scalar erodeBorder = cv::Scalar::all(UCHAR_MAX);
+        const cv::Scalar dilateBorder = cv::Scalar::all(0);
+
+        // Opening: erode, then dilate.
+        morphology(foreground, buf, filterBrd, border, erodeFilter, erodeBorder);
+        morphology(buf, foreground, filterBrd, border, dilateFilter, dilateBorder);
+
+        // Closing: dilate, then erode.
+        morphology(foreground, buf, filterBrd, border, dilateFilter, dilateBorder);
+        morphology(buf, foreground, filterBrd, border, erodeFilter, erodeBorder);
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+// findForegroundRegions
+
+namespace
+{
+    // Converts a C-API contour list into the C++ output container.
+    //
+    // _ccontours - first contour of the list to convert.
+    // storage    - memory storage used for the temporary node sequence.
+    // _contours  - output array of arrays; receives one CV_32SC2 point array
+    //              per contour.
+    void seqToContours(CvSeq* _ccontours, CvMemStorage* storage, cv::OutputArrayOfArrays _contours)
+    {
+        // Gather every contour reachable from _ccontours into one flat sequence.
+        cv::Seq<CvSeq*> all_contours(cvTreeToNodeSeq(_ccontours, sizeof(CvSeq), storage));
+
+        size_t total = all_contours.size();
+
+        // Size the outer container to hold `total` contours.
+        _contours.create(total, 1, 0, -1, true);
+
+        cv::SeqIterator<CvSeq*> it = all_contours.begin();
+        for (size_t i = 0; i < total; ++i, ++it)
+        {
+            CvSeq* c = *it;
+            // Record the contour's output index in its color field.
+            ((CvContour*)c)->color = (int)i;
+            // Allocate the i-th output entry, then copy the points into it.
+            _contours.create((int)c->total, 1, CV_32SC2, i, true);
+            cv::Mat ci = _contours.getMat(i);
+            // cvCvtSeqToArray writes one contiguous block.
+            CV_Assert( ci.isContinuous() );
+            cvCvtSeqToArray(c, ci.data);
+        }
+    }
+
+    // Discards unwanted foreground regions and rebuilds the mask from the
+    // contours that remain.
+    //
+    // The mask is downloaded to h_foreground and its contours are extracted
+    // into storage. A contour is dropped when the area of its bounding
+    // rectangle is below params.minArea, or when params.is_obj_without_holes
+    // is set and the contour is a hole. The survivors are stored in
+    // foreground_regions, drawn filled (255) into a cleared h_foreground, and
+    // the result is uploaded back to d_foreground.
+    //
+    // Returns the number of contours that were kept.
+    int findForegroundRegions(cv::gpu::GpuMat& d_foreground, cv::Mat& h_foreground, std::vector< std::vector<cv::Point> >& foreground_regions,
+                              CvMemStorage* storage, const cv::gpu::FGDStatModel::Params& params)
+    {
+        // Contours are extracted on the host.
+        d_foreground.download(h_foreground);
+        IplImage ipl_foreground = h_foreground;
+
+        CvSeq* first_seq = 0;
+        cvFindContours(&ipl_foreground, storage, &first_seq, sizeof(CvContour), CV_RETR_LIST);
+
+        int kept = 0;
+
+        for (CvSeq* seq = first_seq; seq; seq = seq->h_next)
+        {
+            const CvContour* cnt = reinterpret_cast<CvContour*>(seq);
+
+            const bool tooSmall = cnt->rect.width * cnt->rect.height < params.minArea;
+            const bool unwantedHole = params.is_obj_without_holes && CV_IS_SEQ_HOLE(seq);
+
+            if (!tooSmall && !unwantedHole)
+            {
+                ++kept;
+                continue;
+            }
+
+            // Unlink the rejected contour from the list. Its own h_next is
+            // left untouched, so the loop increment still reaches the
+            // successor.
+            CvSeq* const prev_seq = seq->h_prev;
+            CvSeq* const next_seq = seq->h_next;
+
+            if (prev_seq)
+                prev_seq->h_next = next_seq;
+            else
+                first_seq = next_seq;
+
+            if (next_seq)
+                next_seq->h_prev = prev_seq;
+        }
+
+        seqToContours(first_seq, storage, foreground_regions);
+
+        // Redraw the mask from the surviving contours only.
+        h_foreground.setTo(0);
+        cv::drawContours(h_foreground, foreground_regions, -1, cv::Scalar::all(255), -1);
+
+        d_foreground.upload(h_foreground);
+
+        return kept;
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+// updateBackgroundModel
+
+namespace
+{
+    // Runs the bgfg::updateBackgroundModel_gpu instantiation selected by the
+    // channel counts of prevFrame, curFrame and background (3 or 4 each),
+    // forwarding the masks and the algorithm parameters to it.
+    void updateBackgroundModel(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, const cv::gpu::GpuMat& Ftd, const cv::gpu::GpuMat& Fbd,
+                               const cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& background,
+                               const cv::gpu::FGDStatModel::Params& params)
+    {
+        typedef void (*func_t)(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd,
+                               cv::gpu::DevMem2Db foreground, cv::gpu::DevMem2Db background,
+                               int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
+
+        // Indexed by [prevFrame channels - 1][curFrame channels - 1][background channels - 1].
+        static const func_t funcs[4][4][4] =
+        {
+            {
+                {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
+            },
+            {
+                {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
+            },
+            {
+                {0,0,0,0}, {0,0,0,0},
+                {0,0,bgfg::updateBackgroundModel_gpu<uchar3, uchar3, uchar3>,bgfg::updateBackgroundModel_gpu<uchar3, uchar3, uchar4>},
+                {0,0,bgfg::updateBackgroundModel_gpu<uchar3, uchar4, uchar3>,bgfg::updateBackgroundModel_gpu<uchar3, uchar4, uchar4>}
+            },
+            {
+                {0,0,0,0}, {0,0,0,0},
+                {0,0,bgfg::updateBackgroundModel_gpu<uchar4, uchar3, uchar3>,bgfg::updateBackgroundModel_gpu<uchar4, uchar3, uchar4>},
+                {0,0,bgfg::updateBackgroundModel_gpu<uchar4, uchar4, uchar3>,bgfg::updateBackgroundModel_gpu<uchar4, uchar4, uchar4>}
+            }
+        };
+
+        // delta scaled by 256 / L for each of the two parameter sets.
+        const int deltaC  = cvRound(params.delta * 256 / params.Lc);
+        const int deltaCC = cvRound(params.delta * 256 / params.Lcc);
+
+        const func_t updateModel = funcs[prevFrame.channels() - 1][curFrame.channels() - 1][background.channels() - 1];
+
+        updateModel(prevFrame, curFrame, Ftd, Fbd, foreground, background,
+                    deltaC, deltaCC, params.alpha1, params.alpha2, params.alpha3, params.N1c, params.N1cc, params.N2c, params.N2cc, params.T,
+                    0);
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////
+// Impl::update
+
+// Processes one frame and updates the model.
+//
+// curFrame - CV_8UC3 or CV_8UC4 image of the same size as the stored
+//            previous frame.
+//
+// Returns the number of foreground regions found, or 0 when region
+// filtering is disabled (minArea <= 0 and is_obj_without_holes not set).
+int cv::gpu::FGDStatModel::Impl::update(const cv::gpu::GpuMat& curFrame)
+{
+    CV_Assert(curFrame.type() == CV_8UC3 || curFrame.type() == CV_8UC4);
+    CV_Assert(curFrame.size() == prevFrame_.size());
+
+    // Reset the per-frame state.
+    cvClearMemStorage(storage_);
+    foreground_regions_.clear();
+    foreground_.setTo(cv::Scalar::all(0));
+
+    // Change masks: against the previous frame, then against the background.
+    changeDetection(prevFrame_, curFrame, Ftd_, hist_, histBuf_);
+    changeDetection(background_, curFrame, Fbd_, hist_, histBuf_);
+
+    const int fgPixelCount = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, countBuf_, params_, out_cn_);
+
+    if (params_.perform_morphing > 0)
+        smoothForeground(foreground_, filterBrd_, buf_, erodeFilter_, dilateFilter_, params_);
+
+    int regionCount = 0;
+    if (params_.minArea > 0 || params_.is_obj_without_holes)
+        regionCount = findForegroundRegions(foreground_, h_foreground_, foreground_regions_, storage_, params_);
+
+    // If more than half of the pixels were classified as foreground, mark
+    // the whole statistical model as trained.
+    const double fgFractionLimit = 0.5;
+    const double fgFraction = static_cast<double>(fgPixelCount) / Ftd_.size().area();
+    if (fgFraction > fgFractionLimit)
+        stat_.setTrained();
+
+    updateBackgroundModel(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, background_, params_);
+
+    // The current frame becomes the previous frame for the next call.
+    copyChannels(curFrame, prevFrame_);
+
+    return regionCount;
+}
+
+// Default values assigned to FGDStatModel::Params by its default constructor.
+namespace
+{
+    // Default parameters of foreground detection algorithm:
+    // Defaults for Params::Lc, Params::N1c and Params::N2c.
+    const int BGFG_FGD_LC  = 128;
+    const int BGFG_FGD_N1C = 15;
+    const int BGFG_FGD_N2C = 25;
+
+    // Defaults for Params::Lcc, Params::N1cc and Params::N2cc.
+    const int BGFG_FGD_LCC   = 64;
+    const int BGFG_FGD_N1CC = 25;
+    const int BGFG_FGD_N2CC = 40;
+
+    // Background reference image update parameter:
+    const float BGFG_FGD_ALPHA_1 = 0.1f;
+
+    // stat model update parameter
+    // 0.002f ~ 1K frame(~45sec), 0.005 ~ 18sec (if 25fps and absolutely static BG)
+    const float BGFG_FGD_ALPHA_2 = 0.005f;
+
+    // start value for alpha parameter (to fast initiate statistic model)
+    const float BGFG_FGD_ALPHA_3 = 0.1f;
+
+    // Default for Params::delta.
+    const float BGFG_FGD_DELTA = 2.0f;
+
+    // Default for Params::T.
+    const float BGFG_FGD_T = 0.9f;
+
+    // Default for Params::minArea, which is compared against the area of a
+    // contour's bounding rectangle when regions are filtered.
+    const float BGFG_FGD_MINAREA= 15.0f;
+}
+
+// Fills every parameter with its BGFG_FGD_* default; region filtering
+// without holes and one-pixel morphological smoothing are enabled.
+cv::gpu::FGDStatModel::Params::Params()
+{
+    // Lc / N1c / N2c parameter set
+    Lc = BGFG_FGD_LC;
+    N1c = BGFG_FGD_N1C;
+    N2c = BGFG_FGD_N2C;
+
+    // Lcc / N1cc / N2cc parameter set
+    Lcc = BGFG_FGD_LCC;
+    N1cc = BGFG_FGD_N1CC;
+    N2cc = BGFG_FGD_N2CC;
+
+    delta = BGFG_FGD_DELTA;
+
+    // update rates
+    alpha1 = BGFG_FGD_ALPHA_1;
+    alpha2 = BGFG_FGD_ALPHA_2;
+    alpha3 = BGFG_FGD_ALPHA_3;
+
+    T = BGFG_FGD_T;
+    minArea = BGFG_FGD_MINAREA;
+
+    // post-processing of the foreground mask
+    is_obj_without_holes = true;
+    perform_morphing = 1;
+}
+
+// Creates the implementation object and binds it to this object's
+// background, foreground and foreground_regions members. The model is not
+// initialised until create() is called. out_cn must be 3 or 4.
+cv::gpu::FGDStatModel::FGDStatModel(int out_cn)
+{
+    impl_.reset(new Impl(background, foreground, foreground_regions, out_cn));
+}
+
+// Creates the implementation object (see the out_cn-only constructor) and
+// immediately initialises the model from firstFrame with the given params.
+cv::gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params, int out_cn)
+{
+    impl_.reset(new Impl(background, foreground, foreground_regions, out_cn));
+    create(firstFrame, params);
+}
+
+// Nothing to do explicitly here.
+// NOTE(review): cleanup of the Impl presumably happens through impl_'s own
+// destructor; its declared type is not visible in this file - confirm.
+cv::gpu::FGDStatModel::~FGDStatModel()
+{
+}
+
+// Initialises the model from firstFrame (CV_8UC3 or CV_8UC4); forwards to Impl::create.
+void cv::gpu::FGDStatModel::create(const cv::gpu::GpuMat& firstFrame, const Params& params)
+{
+    impl_->create(firstFrame, params);
+}
+
+// Releases the buffers held by the model; forwards to Impl::release.
+void cv::gpu::FGDStatModel::release()
+{
+    impl_->release();
+}
+
+// Processes one frame and returns the number of foreground regions found;
+// forwards to Impl::update.
+int cv::gpu::FGDStatModel::update(const cv::gpu::GpuMat& curFrame)
+{
+    return impl_->update(curFrame);
+}
+
+#endif // HAVE_CUDA
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -68,6 +68,7 @@

 #include "opencv2/gpu/gpu.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/calib3d/calib3d.hpp"
 #include "opencv2/core/internal.hpp"
 #include "opencv2/video/video.hpp"
--- a/modules/gpu/test/precomp.hpp
+++ b/modules/gpu/test/precomp.hpp
@@ -62,6 +62,7 @@
 #include "opencv2/ts/ts_perf.hpp"
 #include "opencv2/gpu/gpu.hpp"
 #include "opencv2/nonfree/nonfree.hpp"
+#include "opencv2/legacy/legacy.hpp"

 #include "utility.hpp"
 #include "interpolation.hpp"
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -3365,7 +3365,7 @@ TEST_P(Reduce, Rows)
    cv::Mat dst_gold;
    cv::reduce(src, dst_gold, 0, reduceOp, dst_depth);

-    EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 1e-2);
+    EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 0.02);
 }

 TEST_P(Reduce, Cols)
@@ -3381,7 +3381,7 @@ TEST_P(Reduce, Cols)
    dst_gold.rows = 1;
    dst_gold.step = dst_gold.cols * dst_gold.elemSize();

-    EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 1e-2);
+    EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 0.02);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
--- a/modules/gpu/test/test_video.cpp
+++ b/modules/gpu/test/test_video.cpp
@@ -41,11 +41,9 @@

 #include "precomp.hpp"

-namespace {
-
 //#define DUMP

-/////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////
 // BroxOpticalFlow

 #define BROX_OPTICAL_FLOW_DUMP_FILE            "opticalflow/brox_optical_flow.bin"
@@ -130,7 +128,7 @@ TEST_P(BroxOpticalFlow, Regression)

 INSTANTIATE_TEST_CASE_P(GPU_Video, BroxOpticalFlow, ALL_DEVICES);

-/////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////
 // GoodFeaturesToTrack

 IMPLEMENT_PARAM_CLASS(MinDistance, double)
@@ -207,7 +205,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, GoodFeaturesToTrack, testing::Combine(
    ALL_DEVICES,
    testing::Values(MinDistance(0.0), MinDistance(3.0))));

-/////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////
 // PyrLKOpticalFlow

 IMPLEMENT_PARAM_CLASS(UseGray, bool)
@@ -306,7 +304,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, PyrLKOpticalFlow, testing::Combine(
    ALL_DEVICES,
    testing::Values(UseGray(true), UseGray(false))));

-/////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////
 // FarnebackOpticalFlow

 IMPLEMENT_PARAM_CLASS(PyrScale, double)
@@ -413,7 +411,87 @@ TEST_P(OpticalFlowNan, Regression)

 INSTANTIATE_TEST_CASE_P(GPU_Video, OpticalFlowNan, ALL_DEVICES);

-/////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////
+// FGDStatModel
+
+namespace cv
+{
+    template<> void Ptr<CvBGStatModel>::delete_obj() // lets cv::Ptr free the C-API model via cvReleaseBGStatModel
+    {
+        cvReleaseBGStatModel(&obj);
+    }
+}
+
+PARAM_TEST_CASE(FGDStatModel, cv::gpu::DeviceInfo, std::string, Channels) // params: device, video file name, background channel count
+{
+};
+
+TEST_P(FGDStatModel, Accuracy)
+{
+    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
+    cv::gpu::setDevice(devInfo.deviceID());
+
+    std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
+    int out_cn = GET_PARAM(2);
+
+    cv::VideoCapture cap(inputFile);
+    ASSERT_TRUE(cap.isOpened());
+
+    cv::Mat frame;
+    cap >> frame;
+    ASSERT_FALSE(frame.empty());
+
+    // Reference model: the C-API FGD model, seeded with the first frame.
+    IplImage ipl_frame = frame;
+    cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
+
+    // GPU model under test, seeded with the same first frame.
+    cv::gpu::GpuMat d_frame(frame);
+    cv::gpu::FGDStatModel d_model(out_cn);
+    d_model.create(d_frame);
+
+    // Host copies of the GPU results; h_background3 is always 3-channel.
+    cv::Mat h_background;
+    cv::Mat h_foreground;
+    cv::Mat h_background3;
+
+    for (int i = 0; i < 5; ++i)
+    {
+        cap >> frame;
+        ASSERT_FALSE(frame.empty());
+
+        ipl_frame = frame;
+        int gold_count = cvUpdateBGStatModel(&ipl_frame, model);
+
+        d_frame.upload(frame);
+
+        int count = d_model.update(d_frame);
+
+        ASSERT_EQ(gold_count, count);
+
+        cv::Mat gold_background(model->background);
+        cv::Mat gold_foreground(model->foreground);
+
+        if (out_cn == 3)
+            d_model.background.download(h_background3);
+        else
+        {
+            d_model.background.download(h_background);
+            cv::cvtColor(h_background, h_background3, cv::COLOR_BGRA2BGR);
+        }
+        d_model.foreground.download(h_foreground);
+
+        EXPECT_MAT_NEAR(gold_background, h_background3, 1.0);
+        EXPECT_MAT_NEAR(gold_foreground, h_foreground, 0.0);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Video, FGDStatModel, testing::Combine( // ALL_DEVICES x one clip x 3- and 4-channel background
+    ALL_DEVICES,
+    testing::Values(std::string("768x576.avi")),
+    testing::Values(Channels(3), Channels(4))));
+
+//////////////////////////////////////////////////////
 // VideoWriter

 #ifdef WIN32
@@ -447,17 +525,13 @@ TEST_P(VideoWriter, Regression)
    cv::gpu::VideoWriter_GPU d_writer;

    cv::Mat frame;
-    std::vector<cv::Mat> frames;
    cv::gpu::GpuMat d_frame;

-    for (int i = 1; i < 10; ++i)
+    for (int i = 0; i < 10; ++i)
    {
        reader >> frame;
+        ASSERT_FALSE(frame.empty());

-        if (frame.empty())
-            break;
-
-        frames.push_back(frame.clone());
        d_frame.upload(frame);

        if (!d_writer.isOpened())
@@ -481,11 +555,11 @@ TEST_P(VideoWriter, Regression)

 INSTANTIATE_TEST_CASE_P(GPU_Video, VideoWriter, testing::Combine(
    ALL_DEVICES,
-    testing::Values(std::string("VID00003-20100701-2204.mpg"), std::string("big_buck_bunny.mpg"))));
+    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));

 #endif // WIN32

-/////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////
 // VideoReader

 PARAM_TEST_CASE(VideoReader, cv::gpu::DeviceInfo, std::string)
@@ -511,7 +585,7 @@ TEST_P(VideoReader, Regression)

    cv::gpu::GpuMat frame;

-    for (int i = 0; i < 5; ++i)
+    for (int i = 0; i < 10; ++i)
    {
        ASSERT_TRUE( reader.read(frame) );
        ASSERT_FALSE( frame.empty() );
@@ -523,6 +597,4 @@ TEST_P(VideoReader, Regression)

 INSTANTIATE_TEST_CASE_P(GPU_Video, VideoReader, testing::Combine(
    ALL_DEVICES,
-    testing::Values(std::string("VID00003-20100701-2204.mpg"))));
-
-} // namespace
+    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));