Merge remote-tracking branch 'remotes/origin/master' into tvl1_chambolle

2014-08-01 14:54:55 +02:00
parent 7b8bb4248b 345b69d5f7
commit 6594d52185
25 changed files with 1289 additions and 304 deletions
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1491,6 +1491,9 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
    if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
        return false;

+    if( (oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE) && (depth1 >= CV_32F || depth2 >= CV_32F || ddepth >= CV_32F) )
+        return false;
+
    int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
    int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;

--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1541,7 +1541,7 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
        kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
    bool doubleSupport = d.doubleFPConfig() > 0;

-    if (!doubleSupport && depth == CV_64F)
+    if (depth == CV_32F || depth == CV_64F)
        return false;

    char cvt[2][50];
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -432,7 +432,7 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)

        IppStatus status = (IppStatus)-1;
        IppiSize roisize = { cols, rows };
-        int mstep = (int)mask.step, dstep = (int)step;
+        int mstep = (int)mask.step[0], dstep = (int)step[0];

        if (isContinuous() && mask.isContinuous())
        {
@@ -616,7 +616,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
 {
    CV_Assert(flipCode >= -1 && flipCode <= 1);
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);;
+            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);

    if (cn > 4)
        return false;
@@ -631,7 +631,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )

    ocl::Device dev = ocl::Device::getDefault();
    int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
-    kercn = std::max(kercn, cn);
+    kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;

    ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
        format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
@@ -762,7 +762,7 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
        flipHoriz( dst.data, dst.step, dst.data, dst.step, dst.size(), esz );
 }

-#ifdef HAVE_OPENCL
+/*#ifdef HAVE_OPENCL

 static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
@@ -790,7 +790,7 @@ static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
    return k.run(2, globalsize, NULL, false);
 }

-#endif
+#endif*/

 void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
@@ -800,8 +800,8 @@ void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
    Size ssize = _src.size();
    _dst.create(ssize.height*ny, ssize.width*nx, _src.type());

-    CV_OCL_RUN(_dst.isUMat(),
-               ocl_repeat(_src, ny, nx, _dst))
+    /*CV_OCL_RUN(_dst.isUMat(),
+               ocl_repeat(_src, ny, nx, _dst))*/

    Mat src = _src.getMat(), dst = _dst.getMat();
    Size dsize = dst.size();
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -3336,7 +3336,7 @@ static inline void reduceSumC_8u16u16s32f_64f(const cv::Mat& srcmat, cv::Mat& ds
            stype == CV_32FC3 ? (ippiSumHint)ippiSum_32f_C3R :
            stype == CV_32FC4 ? (ippiSumHint)ippiSum_32f_C4R : 0;
        func =
-        sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double,   cv::OpAdd<double> > :
+            sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double,   cv::OpAdd<double> > :
            sdepth == CV_16U ? (cv::ReduceFunc)cv::reduceC_<ushort, double,   cv::OpAdd<double> > :
            sdepth == CV_16S ? (cv::ReduceFunc)cv::reduceC_<short, double,   cv::OpAdd<double> > :
            sdepth == CV_32F ? (cv::ReduceFunc)cv::reduceC_<float, double,   cv::OpAdd<double> > : 0;
@@ -3459,6 +3459,9 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
        return false;

+    if ((op == CV_REDUCE_SUM && sdepth == CV_32F) || op == CV_REDUCE_MIN || op == CV_REDUCE_MAX)
+        return false;
+
    if (op == CV_REDUCE_AVG)
    {
        if (sdepth < CV_32S && ddepth < CV_32S)
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -209,7 +209,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off

 #if kercn == 1
 #ifdef NEED_MINVAL
-#if NEED_MINLOC
+#ifdef NEED_MINLOC
            if (minval > temp)
            {
                minval = temp;
@@ -326,7 +326,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
            int lid2 = lsize + lid;

 #ifdef NEED_MINVAL
-#ifdef NEED_MAXLOC
+#ifdef NEED_MINLOC
            if (localmem_min[lid] >= localmem_min[lid2])
            {
                if (localmem_min[lid] == localmem_min[lid2])
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -2283,7 +2283,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )

                setIppErrorStatus();
            }
-            typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
+            /*typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
            ippiMaskNormFuncC3 ippFuncC3 =
                normType == NORM_INF ?
                (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
@@ -2318,7 +2318,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
                    return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                }
                setIppErrorStatus();
-            }
+            }*/
        }
        else
        {
@@ -2724,7 +2724,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                0) :
                normType == NORM_L1 ?
                (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
-                type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
+                //type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
                type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
                0) :
@@ -2741,7 +2741,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                    return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                setIppErrorStatus();
            }
-            typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
+            /*typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
            ippiMaskNormDiffFuncC3 ippFuncC3 =
                normType == NORM_INF ?
                (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
@@ -2776,7 +2776,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                    return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                }
                setIppErrorStatus();
-            }
+            }*/
        }
        else
        {
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -829,7 +829,7 @@ OCL_TEST_P(Pow, Mat)
 {
    static const double pows[] = { -4, -1, -2.5, 0, 1, 2, 3.7, 4 };

-    for (int j = 0; j < test_loop_times; j++)
+    for (int j = 0; j < 1/*test_loop_times*/; j++)
        for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
        {
            SCOPED_TRACE(pows[k]);
@@ -1203,7 +1203,7 @@ OCL_TEST_P(MinMaxIdx_Mask, Mat)

 static bool relativeError(double actual, double expected, double eps)
 {
-    return std::abs(actual - expected) / actual < eps;
+    return std::abs(actual - expected) < eps*(1 + std::abs(actual));
 }

 typedef ArithmTestBase Norm;
@@ -1230,7 +1230,7 @@ OCL_TEST_P(Norm, NORM_INF_1arg_mask)
        OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_INF, mask_roi));
        OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_INF, umask_roi));

-        EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        EXPECT_NEAR(cpuRes, gpuRes, 0.2);
    }
 }

@@ -1302,7 +1302,7 @@ OCL_TEST_P(Norm, NORM_INF_2args)
            OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type));
            OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type));

-            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+            EXPECT_NEAR(cpuRes, gpuRes, 0.2);
        }
 }

--- a/modules/cudabgsegm/CMakeLists.txt
+++ b/modules/cudabgsegm/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "CUDA-accelerated Background Segmentation")

 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)

-ocv_define_module(cudabgsegm opencv_video OPTIONAL opencv_legacy opencv_imgproc opencv_cudaarithm opencv_cudafilters opencv_cudaimgproc)
+ocv_define_module(cudabgsegm opencv_video OPTIONAL opencv_imgproc opencv_cudaarithm opencv_cudafilters opencv_cudaimgproc)
--- a/modules/cudabgsegm/perf/perf_bgsegm.cpp
+++ b/modules/cudabgsegm/perf/perf_bgsegm.cpp
@@ -42,10 +42,6 @@

 #include "perf_precomp.hpp"

-#ifdef HAVE_OPENCV_CUDALEGACY
-#  include "opencv2/cudalegacy.hpp"
-#endif
-
 #ifdef HAVE_OPENCV_CUDAIMGPROC
 #  include "opencv2/cudaimgproc.hpp"
 #endif
@@ -72,18 +68,6 @@ using namespace perf;

 #if BUILD_WITH_VIDEO_INPUT_SUPPORT

-#ifdef HAVE_OPENCV_CUDALEGACY
-
-namespace cv
-{
-    template<> void DefaultDeleter<CvBGStatModel>::operator ()(CvBGStatModel* obj) const
-    {
-        cvReleaseBGStatModel(&obj);
-    }
-}
-
-#endif
-
 DEF_PARAM_TEST_1(Video, string);

 PERF_TEST_P(Video, FGDStatModel,
@@ -150,48 +134,7 @@ PERF_TEST_P(Video, FGDStatModel,
    }
    else
    {
-#ifdef HAVE_OPENCV_CUDALEGACY
-        IplImage ipl_frame = frame;
-        cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
-
-        int i = 0;
-
-        // collect performance data
-        for (; i < numIters; ++i)
-        {
-            cap >> frame;
-            ASSERT_FALSE(frame.empty());
-
-            ipl_frame = frame;
-
-            startTimer();
-            if(!next())
-                break;
-
-            cvUpdateBGStatModel(&ipl_frame, model);
-
-            stopTimer();
-        }
-
-        // process last frame in sequence to get data for sanity test
-        for (; i < numIters; ++i)
-        {
-            cap >> frame;
-            ASSERT_FALSE(frame.empty());
-
-            ipl_frame = frame;
-
-            cvUpdateBGStatModel(&ipl_frame, model);
-        }
-
-        const cv::Mat background = cv::cvarrToMat(model->background);
-        const cv::Mat foreground = cv::cvarrToMat(model->foreground);
-
-        CPU_SANITY_CHECK(background);
-        CPU_SANITY_CHECK(foreground);
-#else
        FAIL_NO_CPU();
-#endif
    }
 }

--- a/modules/cudabgsegm/test/test_bgsegm.cpp
+++ b/modules/cudabgsegm/test/test_bgsegm.cpp
@@ -42,10 +42,6 @@

 #include "test_precomp.hpp"

-#ifdef HAVE_OPENCV_CUDALEGACY
-#  include "opencv2/cudalegacy.hpp"
-#endif
-
 #ifdef HAVE_CUDA

 using namespace cvtest;
@@ -63,80 +59,6 @@ using namespace cvtest;
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
 #endif

-//////////////////////////////////////////////////////
-// FGDStatModel
-
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT && defined(HAVE_OPENCV_CUDALEGACY)
-
-namespace cv
-{
-    template<> void DefaultDeleter<CvBGStatModel>::operator ()(CvBGStatModel* obj) const
-    {
-        cvReleaseBGStatModel(&obj);
-    }
-}
-
-PARAM_TEST_CASE(FGDStatModel, cv::cuda::DeviceInfo, std::string)
-{
-    cv::cuda::DeviceInfo devInfo;
-    std::string inputFile;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        cv::cuda::setDevice(devInfo.deviceID());
-
-        inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
-    }
-};
-
-CUDA_TEST_P(FGDStatModel, Update)
-{
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    IplImage ipl_frame = frame;
-    cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
-
-    cv::cuda::GpuMat d_frame(frame);
-    cv::Ptr<cv::cuda::BackgroundSubtractorFGD> d_fgd = cv::cuda::createBackgroundSubtractorFGD();
-    cv::cuda::GpuMat d_foreground, d_background;
-    std::vector< std::vector<cv::Point> > foreground_regions;
-    d_fgd->apply(d_frame, d_foreground);
-
-    for (int i = 0; i < 5; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        ipl_frame = frame;
-        int gold_count = cvUpdateBGStatModel(&ipl_frame, model);
-
-        d_frame.upload(frame);
-        d_fgd->apply(d_frame, d_foreground);
-        d_fgd->getBackgroundImage(d_background);
-        d_fgd->getForegroundRegions(foreground_regions);
-        int count = (int) foreground_regions.size();
-
-        cv::Mat gold_background = cv::cvarrToMat(model->background);
-        cv::Mat gold_foreground = cv::cvarrToMat(model->foreground);
-
-        ASSERT_MAT_NEAR(gold_background, d_background, 1.0);
-        ASSERT_MAT_NEAR(gold_foreground, d_foreground, 0.0);
-        ASSERT_EQ(gold_count, count);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(CUDA_BgSegm, FGDStatModel, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"))));
-
-#endif
-
 //////////////////////////////////////////////////////
 // MOG

--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@@ -66,6 +66,11 @@ public:
        return 0;
    }

+    int bayer2RGBA(const T*, int, T*, int, int) const
+    {
+        return 0;
+    }
+
    int bayer2RGB_EA(const T*, int, T*, int, int) const
    {
        return 0;
@@ -218,6 +223,11 @@ public:
        return (int)(bayer - (bayer_end - width));
    }

+    int bayer2RGBA(const uchar*, int, uchar*, int, int) const
+    {
+        return 0;
+    }
+
    int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
    {
        if (!use_simd)
@@ -323,6 +333,165 @@ public:

    bool use_simd;
 };
+#elif CV_NEON
+class SIMDBayerInterpolator_8u
+{
+public:
+    SIMDBayerInterpolator_8u()
+    {
+    }
+
+    int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
+                   int width, int bcoeff, int gcoeff, int rcoeff) const
+    {
+        /*
+         B G B G | B G B G | B G B G | B G B G
+         G R G R | G R G R | G R G R | G R G R
+         B G B G | B G B G | B G B G | B G B G
+         */
+
+        uint16x8_t masklo = vdupq_n_u16(255);
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
+        {
+            uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
+            uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
+            uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
+
+            uint16x8_t b1_ = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
+            uint16x8_t b1 = vextq_u16(b1_, b1_, 1);
+            uint16x8_t b0 = vaddq_u16(b1_, b1);
+            // b0 = b0 b2 b4 ...
+            // b1 = b1 b3 b5 ...
+
+            uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
+            uint16x8_t g1 = vandq_u16(r1, masklo);
+            g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
+            g1 = vshlq_n_u16(vextq_u16(g1, g1, 1), 2);
+            // g0 = g0 g2 g4 ...
+            // g1 = g1 g3 g5 ...
+
+            r0 = vshrq_n_u16(r1, 8);
+            r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
+            r0 = vshlq_n_u16(r0, 2);
+            // r0 = r0 r2 r4 ...
+            // r1 = r1 r3 r5 ...
+
+            b0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b0), (short)(rcoeff*2)));
+            b1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b1), (short)(rcoeff*4)));
+
+            g0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g0), (short)(gcoeff*2)));
+            g1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g1), (short)(gcoeff*2)));
+
+            r0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r0), (short)(bcoeff*2)));
+            r1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r1), (short)(bcoeff*4)));
+
+            g0 = vaddq_u16(vaddq_u16(g0, b0), r0);
+            g1 = vaddq_u16(vaddq_u16(g1, b1), r1);
+
+            uint8x8x2_t p = vzip_u8(vrshrn_n_u16(g0, 2), vrshrn_n_u16(g1, 2));
+            vst1_u8(dst, p.val[0]);
+            vst1_u8(dst + 8, p.val[1]);
+        }
+
+        return (int)(bayer - (bayer_end - width));
+    }
+
+    int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
+    {
+        /*
+         B G B G | B G B G | B G B G | B G B G
+         G R G R | G R G R | G R G R | G R G R
+         B G B G | B G B G | B G B G | B G B G
+         */
+        uint16x8_t masklo = vdupq_n_u16(255);
+        uint8x16x3_t pix;
+        const uchar* bayer_end = bayer + width;
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
+        {
+            uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
+            uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
+            uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
+
+            uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
+            uint16x8_t nextb1 = vextq_u16(b1, b1, 1);
+            uint16x8_t b0 = vaddq_u16(b1, nextb1);
+            // b0 b1 b2 ...
+            uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1));
+            pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]);
+
+            uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
+            uint16x8_t g1 = vandq_u16(r1, masklo);
+            g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
+            g1 = vextq_u16(g1, g1, 1);
+            // g0 g1 g2 ...
+            uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1));
+            pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]);
+
+            r0 = vshrq_n_u16(r1, 8);
+            r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
+            // r0 r1 r2 ...
+            uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1));
+            pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]);
+
+            vst3q_u8(dst-1, pix);
+        }
+
+        return (int)(bayer - (bayer_end - width));
+    }
+
+    int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
+    {
+        /*
+         B G B G | B G B G | B G B G | B G B G
+         G R G R | G R G R | G R G R | G R G R
+         B G B G | B G B G | B G B G | B G B G
+         */
+        uint16x8_t masklo = vdupq_n_u16(255);
+        uint8x16x4_t pix;
+        const uchar* bayer_end = bayer + width;
+        pix.val[3] = vdupq_n_u8(255);
+
+        for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
+        {
+            uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
+            uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
+            uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
+
+            uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
+            uint16x8_t nextb1 = vextq_u16(b1, b1, 1);
+            uint16x8_t b0 = vaddq_u16(b1, nextb1);
+            // b0 b1 b2 ...
+            uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1));
+            pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]);
+
+            uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
+            uint16x8_t g1 = vandq_u16(r1, masklo);
+            g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
+            g1 = vextq_u16(g1, g1, 1);
+            // g0 g1 g2 ...
+            uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1));
+            pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]);
+
+            r0 = vshrq_n_u16(r1, 8);
+            r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
+            // r0 r1 r2 ...
+            uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1));
+            pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]);
+
+            vst4q_u8(dst-1, pix);
+        }
+
+        return (int)(bayer - (bayer_end - width));
+    }
+
+    int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const
+    {
+        return 0;
+    }
+};
 #else
 typedef SIMDBayerStubInterpolator_<uchar> SIMDBayerInterpolator_8u;
 #endif
@@ -559,7 +728,9 @@ public:
            }

            // simd optimization only for dcn == 3
-            int delta = dcn == 4 ? 0 : vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
+            int delta = dcn == 4 ?
+                vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) :
+                vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
            bayer += delta;
            dst += delta*dcn;

--- a/modules/imgproc/src/opencl/pyr_down.cl
+++ b/modules/imgproc/src/opencl/pyr_down.cl
@@ -89,19 +89,56 @@
 #define MAD(x,y,z) mad((x),(y),(z))
 #endif

+#define LOAD_LOCAL(col_gl, col_lcl) \
+    sum0 =     co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));         \
+    sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0);  \
+    temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows));                      \
+    sum0 = MAD(co1, temp, sum0);                                            \
+    sum1 = co3 * temp;                                                      \
+    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows));                  \
+    sum0 = MAD(co2, temp, sum0);                                            \
+    sum1 = MAD(co2, temp, sum1);                                            \
+    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows));                  \
+    sum0 = MAD(co3, temp, sum0);                                            \
+    sum1 = MAD(co1, temp, sum1);                                            \
+    smem[0][col_lcl] = sum0;                                                \
+    sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1);  \
+    sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1);  \
+    smem[1][col_lcl] = sum1;
+
+
+#if kercn == 4
+#define LOAD_LOCAL4(col_gl, col_lcl) \
+    sum40 =     co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));           \
+    sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40);   \
+    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y, src_rows));                       \
+    sum40 = MAD(co1, temp4, sum40);                                             \
+    sum41 = co3 * temp4;                                                        \
+    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 1, src_rows));                   \
+    sum40 = MAD(co2, temp4, sum40);                                             \
+    sum41 = MAD(co2, temp4, sum41);                                             \
+    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 2, src_rows));                   \
+    sum40 = MAD(co3, temp4, sum40);                                             \
+    sum41 = MAD(co1, temp4, sum41);                                             \
+    vstore4(sum40, col_lcl, (__local float*) &smem[0][2]);                      \
+    sum41 = MAD(co2, SRC4(col_gl,  EXTRAPOLATE_(src_y + 3, src_rows)), sum41);  \
+    sum41 = MAD(co3, SRC4(col_gl,  EXTRAPOLATE_(src_y + 4, src_rows)), sum41);  \
+    vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);
+#endif
+
 #define noconvert

 __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
 {
    const int x = get_global_id(0)*kercn;
-    const int y = get_group_id(1);
+    const int y = 2*get_global_id(1);

-    __local FT smem[LOCAL_SIZE + 4];
+    __local FT smem[2][LOCAL_SIZE + 4];
    __global uchar * dstData = dst + dst_offset;
    __global const uchar * srcData = src + src_offset;

-    FT sum;
+    FT sum0, sum1, temp;
    FT co1 = 0.375f;
    FT co2 = 0.25f;
    FT co3 = 0.0625f;
@@ -109,134 +146,68 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
    const int src_y = 2*y;
    int col;

-    if (src_y >= 2 && src_y < src_rows - 2)
+    if (src_y >= 2 && src_y < src_rows - 4)
    {
+#define EXTRAPOLATE_(val, maxVal)   val
 #if kercn == 1
        col = EXTRAPOLATE(x, src_cols);
-
-        sum =     co3* SRC(col, src_y - 2);
-        sum = MAD(co2, SRC(col, src_y - 1), sum);
-        sum = MAD(co1, SRC(col, src_y    ), sum);
-        sum = MAD(co2, SRC(col, src_y + 1), sum);
-        sum = MAD(co3, SRC(col, src_y + 2), sum);
-
-        smem[2 + get_local_id(0)] = sum;
+        LOAD_LOCAL(col, 2 + get_local_id(0))
 #else
        if (x < src_cols-4)
        {
-            float4 sum4;
-            sum4 =     co3* SRC4(x, src_y - 2);
-            sum4 = MAD(co2, SRC4(x, src_y - 1), sum4);
-            sum4 = MAD(co1, SRC4(x, src_y    ), sum4);
-            sum4 = MAD(co2, SRC4(x, src_y + 1), sum4);
-            sum4 = MAD(co3, SRC4(x, src_y + 2), sum4);
-
-            vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
+            float4 sum40, sum41, temp4;
+            LOAD_LOCAL4(x, get_local_id(0))
        }
        else
        {
            for (int i=0; i<4; i++)
            {
                col = EXTRAPOLATE(x+i, src_cols);
-                sum =     co3* SRC(col, src_y - 2);
-                sum = MAD(co2, SRC(col, src_y - 1), sum);
-                sum = MAD(co1, SRC(col, src_y    ), sum);
-                sum = MAD(co2, SRC(col, src_y + 1), sum);
-                sum = MAD(co3, SRC(col, src_y + 2), sum);
-
-                smem[2 + 4*get_local_id(0)+i] = sum;
+                LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)
            }
        }
 #endif
        if (get_local_id(0) < 2)
        {
            col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
-
-            sum =     co3* SRC(col, src_y - 2);
-            sum = MAD(co2, SRC(col, src_y - 1), sum);
-            sum = MAD(co1, SRC(col, src_y    ), sum);
-            sum = MAD(co2, SRC(col, src_y + 1), sum);
-            sum = MAD(co3, SRC(col, src_y + 2), sum);
-
-            smem[get_local_id(0)] = sum;
+            LOAD_LOCAL(col, get_local_id(0))
        }
-
-        if (get_local_id(0) > 1 && get_local_id(0) < 4)
+        else if (get_local_id(0) < 4)
        {
            col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
-
-            sum =     co3* SRC(col, src_y - 2);
-            sum = MAD(co2, SRC(col, src_y - 1), sum);
-            sum = MAD(co1, SRC(col, src_y    ), sum);
-            sum = MAD(co2, SRC(col, src_y + 1), sum);
-            sum = MAD(co3, SRC(col, src_y + 2), sum);
-
-            smem[LOCAL_SIZE + get_local_id(0)] = sum;
+            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
        }
    }
    else // need extrapolate y
    {
+#define EXTRAPOLATE_(val, maxVal)   EXTRAPOLATE(val, maxVal)
 #if kercn == 1
        col = EXTRAPOLATE(x, src_cols);
-
-        sum =     co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
-        sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
-        sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y    , src_rows)), sum);
-        sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
-        sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
-
-        smem[2 + get_local_id(0)] = sum;
+        LOAD_LOCAL(col, 2 + get_local_id(0))
 #else
        if (x < src_cols-4)
        {
-            float4 sum4;
-            sum4 =     co3* SRC4(x, EXTRAPOLATE(src_y - 2, src_rows));
-            sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4);
-            sum4 = MAD(co1, SRC4(x, EXTRAPOLATE(src_y    , src_rows)), sum4);
-            sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4);
-            sum4 = MAD(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4);
-
-            vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
+            float4 sum40, sum41, temp4;
+            LOAD_LOCAL4(x, get_local_id(0))
        }
        else
        {
            for (int i=0; i<4; i++)
            {
                col = EXTRAPOLATE(x+i, src_cols);
-                sum =     co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
-                sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
-                sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y    , src_rows)), sum);
-                sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
-                sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
-
-                smem[2 + 4*get_local_id(0)+i] = sum;
+                LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)
            }
        }
 #endif
        if (get_local_id(0) < 2)
        {
            col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
-
-            sum =     co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
-            sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
-            sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y    , src_rows)), sum);
-            sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
-            sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
-
-            smem[get_local_id(0)] = sum;
+            LOAD_LOCAL(col, get_local_id(0))
        }
-
-        if (get_local_id(0) > 1 && get_local_id(0) < 4)
+        else if (get_local_id(0) < 4)
        {
            col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
-
-            sum =     co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
-            sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
-            sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y    , src_rows)), sum);
-            sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
-            sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
-
-            smem[LOCAL_SIZE + get_local_id(0)] = sum;
+            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
        }
    }

@@ -247,50 +218,68 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
    {
        const int tid2 = get_local_id(0) * 2;

-        sum = 0.f;
-#if cn == 1
-#if fdepth <= 5
-        sum = sum + dot(vload4(0, (__local float*) (&smem)+tid2), (float4)(co3, co2, co1, co2));
-#else
-        sum = sum + dot(vload4(0, (__local double*) (&smem)+tid2), (double4)(co3, co2, co1, co2));
-#endif
-#else
-        sum = MAD(co3, smem[2 + tid2 - 2], sum);
-        sum = MAD(co2, smem[2 + tid2 - 1], sum);
-        sum = MAD(co1, smem[2 + tid2    ], sum);
-        sum = MAD(co2, smem[2 + tid2 + 1], sum);
-#endif
-        sum = MAD(co3, smem[2 + tid2 + 2], sum);
-
        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;

        if (dst_x < dst_cols)
-            storepix(convertToT(sum), dstData + y * dst_step + dst_x * PIXSIZE);
+        {
+            for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
+            {
+#if cn == 1
+#if fdepth <= 5
+                FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
+#else
+                FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
+#endif
+#else
+                FT sum = co3 * smem[yin - y][2 + tid2 - 2];
+                sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);
+                sum = MAD(co1, smem[yin - y][2 + tid2    ], sum);
+                sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
+#endif
+                sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
+                storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
+            }
+        }
    }
 #else
    int tid4 = get_local_id(0) * 4;
-
-    sum =     co3* smem[2 + tid4 + 2];
-    sum = MAD(co3, smem[2 + tid4 - 2], sum);
-    sum = MAD(co2, smem[2 + tid4 - 1], sum);
-    sum = MAD(co1, smem[2 + tid4    ], sum);
-    sum = MAD(co2, smem[2 + tid4 + 1], sum);
-
    int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
+    if (dst_x < dst_cols - 1)
+    {
+        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
+        {

-    if (dst_x < dst_cols)
-        storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));
+            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
+            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
+            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
+            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));

-    tid4 += 2;
-    dst_x += 1;
+            dst_x ++;
+            sum =     co3* smem[yin - y][2 + tid4 + 4];
+            sum = MAD(co3, smem[yin - y][2 + tid4    ], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
+            sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
+            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
+            dst_x --;
+        }

-    sum =     co3* smem[2 + tid4 + 2];
-    sum = MAD(co3, smem[2 + tid4 - 2], sum);
-    sum = MAD(co2, smem[2 + tid4 - 1], sum);
-    sum = MAD(co1, smem[2 + tid4    ], sum);
-    sum = MAD(co2, smem[2 + tid4 + 1], sum);
+    }
+    else if (dst_x < dst_cols)
+    {
+        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
+        {
+            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
+            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
+            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
+            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);

-    if (dst_x < dst_cols)
-        storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));
+            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
+        }
+    }
 #endif
+
 }
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -445,7 +445,7 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));

    size_t localThreads[2]  = { local_size/kercn, 1 };
-    size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, dst.rows };
+    size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, (dst.rows + 1) / 2 };
    return k.run(2, globalThreads, localThreads, false);
 }

--- a/modules/photo/doc/cloning.rst
+++ b/modules/photo/doc/cloning.rst
@@ -7,7 +7,7 @@ seamlessClone
 -------------
 Image editing tasks concern either global changes (color/intensity corrections, filters, deformations) or local changes concerned to a selection.
 Here we are interested in achieving local changes, ones that are restricted to a region manually selected (ROI), in a seamless and effortless manner.
-The extent of the changes ranges from slight distortions to complete replacement by novel content.
+The extent of the changes ranges from slight distortions to complete replacement by novel content [PM03]_.

 .. ocv:function:: void seamlessClone( InputArray src, InputArray dst, InputArray mask, Point p, OutputArray blend, int flags)

@@ -25,13 +25,9 @@ The extent of the changes ranges from slight distortions to complete replacement

            * **NORMAL_CLONE**     The power of the method is fully expressed when inserting objects with complex outlines into a new background

-            * **MIXED_CLONE**    The classic method, color-based selection and alpha
-                                 masking might be time consuming and often leaves an undesirable halo. Seamless
-                                 cloning, even averaged with the original image, is not effective. Mixed seamless
-                                 cloning based on a loose selection proves effective.
+            * **MIXED_CLONE**    The classic method, color-based selection and alpha masking might be time consuming and often leaves an undesirable halo. Seamless cloning, even averaged with the original image, is not effective. Mixed seamless cloning based on a loose selection proves effective.

-            * **FEATURE_EXCHANGE**     Feature exchange allows the user to replace easily certain
-                                       features of one object by alternative features.
+            * **FEATURE_EXCHANGE**     Feature exchange allows the user to easily replace certain features of one object by alternative features.



@@ -97,3 +93,5 @@ region, giving its contents a flat aspect. Here Canny Edge Detector is used.
 **NOTE:**

 The algorithm assumes that the color of the source image is close to that of the destination. This assumption means that when the colors don't match, the source image color gets tinted toward the color of the destination image.
+
+.. [PM03] Patrick Perez, Michel Gangnet, Andrew Blake, "Poisson image editing", ACM Transactions on Graphics (SIGGRAPH), 2003.
--- a/modules/photo/doc/decolor.rst
+++ b/modules/photo/doc/decolor.rst
@@ -6,7 +6,7 @@ Decolorization
 decolor
 -------

-Transforms a color image to a grayscale image. It is a basic tool in digital printing, stylized black-and-white photograph rendering, and in many single channel image processing applications.
+Transforms a color image to a grayscale image. It is a basic tool in digital printing, stylized black-and-white photograph rendering, and in many single channel image processing applications [CL12]_.

 .. ocv:function:: void decolor( InputArray src, OutputArray grayscale, OutputArray color_boost )

@@ -17,3 +17,5 @@ Transforms a color image to a grayscale image. It is a basic tool in digital pri
    :param color_boost: Output 8-bit 3-channel image.

 This function is to be applied on color images.
+
+.. [CL12] Cewu Lu, Li Xu, Jiaya Jia, "Contrast Preserving Decolorization", IEEE International Conference on Computational Photography (ICCP), 2012.
--- a/modules/photo/doc/hdr_imaging.rst
+++ b/modules/photo/doc/hdr_imaging.rst
@@ -356,7 +356,7 @@ Creates MergeRobertson object
 .. ocv:function:: Ptr<MergeRobertson> createMergeRobertson()

 References
-==========
+---------------------------

 .. [DM03] F. Drago, K. Myszkowski, T. Annen, N. Chiba, "Adaptive Logarithmic Mapping For Displaying High Contrast Scenes", Computer Graphics Forum, 2003, 22, 419 - 426.

--- a/modules/photo/doc/npr.rst
+++ b/modules/photo/doc/npr.rst
@@ -6,7 +6,7 @@ Non-Photorealistic Rendering
 edgePreservingFilter
 --------------------

-Filtering is the fundamental operation in image and video processing. Edge-preserving smoothing filters are used in many different applications.
+Filtering is the fundamental operation in image and video processing. Edge-preserving smoothing filters are used in many different applications [EM11]_.

 .. ocv:function:: void edgePreservingFilter(InputArray src, OutputArray dst, int flags = 1, float sigma_s = 60, float sigma_r = 0.4f)

@@ -16,9 +16,9 @@ Filtering is the fundamental operation in image and video processing. Edge-prese

    :param flags: Edge preserving filters:

-            * **RECURS_FILTER**
+            * **RECURS_FILTER** = 1

-            * **NORMCONV_FILTER**
+            * **NORMCONV_FILTER** = 2

    :param sigma_s: Range between 0 to 200.

@@ -72,3 +72,5 @@ Stylization aims to produce digital imagery with a wide variety of effects not f
    :param sigma_s: Range between 0 to 200.

    :param sigma_r: Range between 0 to 1.
+
+.. [EM11] Eduardo S. L. Gastal, Manuel M. Oliveira, "Domain transform for edge-aware image and video processing", ACM Trans. Graph. 30(4): 69, 2011.
--- a/modules/photo/src/npr.hpp
+++ b/modules/photo/src/npr.hpp
@@ -173,6 +173,7 @@ void Domain_Filter::compute_Rfilter(Mat &output, Mat &hz, float sigma_h)
 {
    int h = output.rows;
    int w = output.cols;
+    int channel = output.channels();

    float a = (float) exp((-1.0 * sqrt(2.0)) / sigma_h);

@@ -185,11 +186,15 @@ void Domain_Filter::compute_Rfilter(Mat &output, Mat &hz, float sigma_h)
        for(int j=0;j<w;j++)
            V.at<float>(i,j) = pow(a,hz.at<float>(i,j));

-    for(int i=0; i<h; i++)
+   for(int i=0; i<h; i++)
    {
        for(int j =1; j < w; j++)
        {
-           temp.at<float>(i,j) = temp.at<float>(i,j) + (temp.at<float>(i,j-1) - temp.at<float>(i,j)) * V.at<float>(i,j);
+            for(int c = 0; c<channel; c++)
+            {
+                temp.at<float>(i,j*channel+c) = temp.at<float>(i,j*channel+c) +
+                    (temp.at<float>(i,(j-1)*channel+c) - temp.at<float>(i,j*channel+c)) * V.at<float>(i,j);
+            }
        }
    }

@@ -197,7 +202,11 @@ void Domain_Filter::compute_Rfilter(Mat &output, Mat &hz, float sigma_h)
    {
        for(int j =w-2; j >= 0; j--)
        {
-           temp.at<float>(i,j) = temp.at<float>(i,j) + (temp.at<float>(i,j+1) - temp.at<float>(i,j)) * V.at<float>(i,j+1);
+            for(int c = 0; c<channel; c++)
+            {
+                temp.at<float>(i,j*channel+c) = temp.at<float>(i,j*channel+c) +
+                    (temp.at<float>(i,(j+1)*channel+c) - temp.at<float>(i,j*channel+c))*V.at<float>(i,j+1);
+            }
        }
    }

--- a/modules/photo/src/seamless_cloning.cpp
+++ b/modules/photo/src/seamless_cloning.cpp
@@ -108,6 +108,7 @@ void cv::seamlessClone(InputArray _src, InputArray _dst, InputArray _mask, Point

    Cloning obj;
    obj.normal_clone(dest,cd_mask,dst_mask,blend,flags);
+
 }

 void cv::colorChange(InputArray _src, InputArray _mask, OutputArray _dst, float r, float g, float b)
@@ -136,7 +137,6 @@ void cv::colorChange(InputArray _src, InputArray _mask, OutputArray _dst, float
    obj.local_color_change(src,cs_mask,gray,blend,red,green,blue);
 }

-
 void cv::illuminationChange(InputArray _src, InputArray _mask, OutputArray _dst, float a, float b)
 {

--- a/modules/photo/src/seamless_cloning.hpp
+++ b/modules/photo/src/seamless_cloning.hpp
@@ -455,6 +455,8 @@ void Cloning::normal_clone(Mat &I, Mat &mask, Mat &wmask, Mat &cloned, int num)
 {
    int w = I.size().width;
    int h = I.size().height;
+    int channel = I.channels();
+

    initialization(I,mask,wmask);

@@ -466,20 +468,33 @@ void Cloning::normal_clone(Mat &I, Mat &mask, Mat &wmask, Mat &cloned, int num)
    }
    else if(num == 2)
    {
+
        for(int i=0;i < h; i++)
-            for(int j=0; j < w; j++)
+        {
+           for(int j=0; j < w; j++)
            {
-                if(abs(sgx.at<float>(i,j) - sgy.at<float>(i,j)) > abs(grx.at<float>(i,j) - gry.at<float>(i,j)))
+                for(int c=0;c<channel;++c)
                {
-                    srx32.at<float>(i,j) = sgx.at<float>(i,j) * smask.at<float>(i,j);
-                    sry32.at<float>(i,j) = sgy.at<float>(i,j) * smask.at<float>(i,j);
-                }
-                else
-                {
-                    srx32.at<float>(i,j) = grx.at<float>(i,j) * smask.at<float>(i,j);
-                    sry32.at<float>(i,j) = gry.at<float>(i,j) * smask.at<float>(i,j);
+                    if(abs(sgx.at<float>(i,j*channel+c) - sgy.at<float>(i,j*channel+c)) >
+                            abs(grx.at<float>(i,j*channel+c) - gry.at<float>(i,j*channel+c)))
+                    {
+
+                        srx32.at<float>(i,j*channel+c) = sgx.at<float>(i,j*channel+c)
+                            * smask.at<float>(i,j);
+                        sry32.at<float>(i,j*channel+c) = sgy.at<float>(i,j*channel+c)
+                            * smask.at<float>(i,j);
+                    }
+                    else
+                    {
+                        srx32.at<float>(i,j*channel+c) = grx.at<float>(i,j*channel+c)
+                            * smask.at<float>(i,j);
+                        sry32.at<float>(i,j*channel+c) = gry.at<float>(i,j*channel+c)
+                            * smask.at<float>(i,j);
+                    }
                }
            }
+        }
+
    }
    else if(num == 3)
    {
--- a/modules/videoio/CMakeLists.txt
+++ b/modules/videoio/CMakeLists.txt
@@ -148,7 +148,7 @@ endif(HAVE_INTELPERC)

 if(IOS)
  add_definitions(-DHAVE_IOS=1)
-  list(APPEND videoio_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm)
+  list(APPEND videoio_srcs src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm)
  list(APPEND VIDEOIO_LIBRARIES "-framework Accelerate" "-framework AVFoundation" "-framework CoreGraphics" "-framework CoreImage" "-framework CoreMedia" "-framework CoreVideo" "-framework QuartzCore" "-framework AssetsLibrary")
 endif()