Merge branch 'master' of code.opencv.org:opencv

Conflicts: samples/python2/common.py samples/python2/feature_homography.py samples/python2/plane_ar.py samples/python2/plane_tracker.py
2012-08-07 23:21:56 +03:00
parent 647b1dd96c e2eb81bcb2
commit 7db1f711f6
132 changed files with 9929 additions and 1613 deletions
--- a/modules/contrib/src/stereovar.cpp
+++ b/modules/contrib/src/stereovar.cpp
@@ -67,11 +67,12 @@ StereoVar::~StereoVar()

 static Mat diffX(Mat &src)
 {
-    register int x, y, cols = src.cols - 1;
+    int cols = src.cols - 1;
    Mat dst(src.size(), src.type());
-    for(y = 0; y < src.rows; y++){
+    for(int y = 0; y < src.rows; y++){
        const float* pSrc = src.ptr<float>(y);
        float* pDst = dst.ptr<float>(y);
+        int x = 0;
 #if CV_SSE2
        for (x = 0; x <= cols - 8; x += 8) {
            __m128 a0 = _mm_loadu_ps(pSrc + x);
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@@ -2446,6 +2446,6 @@ The above methods are usually enough for users. If you want to make your own alg
 * Make a class and specify ``Algorithm`` as its base class.
 * The algorithm parameters should be the class members. See ``Algorithm::get()`` for the list of possible types of the parameters.
 * Add public virtual method ``AlgorithmInfo* info() const;`` to your class.
- * Add constructor function, ``AlgorithmInfo`` instance and implement the ``info()`` method. The simplest way is to take  http://code.opencv.org/svn/opencv/trunk/opencv/modules/ml/src/ml_init.cpp as the reference and modify it according to the list of your parameters.
+ * Add constructor function, ``AlgorithmInfo`` instance and implement the ``info()`` method. The simplest way is to take  http://code.opencv.org/projects/opencv/repository/revisions/master/entry/modules/ml/src/ml_init.cpp as the reference and modify it according to the list of your parameters.
 * Add some public function (e.g. ``initModule_<mymodule>()``) that calls info() of your algorithm and put it into the same source file as ``info()`` implementation. This is to force C++ linker to include this object file into the target application. See ``Algorithm::create()`` for details.

--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -1024,7 +1024,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
 							__m128 s0 = _mm_or_ps(t0, t1);
 							__m128 det =_mm_set1_ps((float)d);
 							s0 =  _mm_mul_ps(s0, det);
-							const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
+							static const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
 							__m128 pattern = _mm_load_ps((const float*)inv); 
 							s0 = _mm_xor_ps(s0, pattern);//==-1*s0
 							s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
@@ -1064,7 +1064,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
 							__m128d det = _mm_load1_pd((const double*)&d);
 							sm =  _mm_mul_pd(sm, det);
 				
-							uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
+							static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
 							__m128d pattern = _mm_load1_pd((double*)inv); 
 							ss = _mm_mul_pd(ss, det);
 							ss = _mm_xor_pd(ss, pattern);//==-1*ss
@@ -1097,24 +1097,66 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
                double d = det3(Sf);
                if( d != 0. )
                {
+                    float CV_DECL_ALIGNED(16) t[12];
+                    
                    result = true;
                    d = 1./d;
-                    float t[9];
-                    t[0] = (float)(((double)Sf(1,1) * Sf(2,2) - (double)Sf(1,2) * Sf(2,1)) * d);
-                    t[1] = (float)(((double)Sf(0,2) * Sf(2,1) - (double)Sf(0,1) * Sf(2,2)) * d);
-                    t[2] = (float)(((double)Sf(0,1) * Sf(1,2) - (double)Sf(0,2) * Sf(1,1)) * d);
-                   
-                    t[3] = (float)(((double)Sf(1,2) * Sf(2,0) - (double)Sf(1,0) * Sf(2,2)) * d);
-                    t[4] = (float)(((double)Sf(0,0) * Sf(2,2) - (double)Sf(0,2) * Sf(2,0)) * d);
-                    t[5] = (float)(((double)Sf(0,2) * Sf(1,0) - (double)Sf(0,0) * Sf(1,2)) * d);
-                   
-                    t[6] = (float)(((double)Sf(1,0) * Sf(2,1) - (double)Sf(1,1) * Sf(2,0)) * d);
-                    t[7] = (float)(((double)Sf(0,1) * Sf(2,0) - (double)Sf(0,0) * Sf(2,1)) * d);
-                    t[8] = (float)(((double)Sf(0,0) * Sf(1,1) - (double)Sf(0,1) * Sf(1,0)) * d);
+                #if CV_SSE2
+                    if(USE_SSE2)
+                    {
+                        __m128 det =_mm_set1_ps((float)d);
+                        __m128 s0 = _mm_loadu_ps((const float*)srcdata);//s0 = Sf(0,0) Sf(0,1) Sf(0,2) ***
+                        __m128 s1 = _mm_loadu_ps((const float*)(srcdata+srcstep));//s1 = Sf(1,0) Sf(1,1) Sf(1,2) ***
+                        __m128 s2 = _mm_set_ps(0.f, Sf(2,2), Sf(2,1), Sf(2,0)); //s2 = Sf(2,0) Sf(2,1) Sf(2,2) ***

-                    Df(0,0) = t[0]; Df(0,1) = t[1]; Df(0,2) = t[2];
-                    Df(1,0) = t[3]; Df(1,1) = t[4]; Df(1,2) = t[5];
-                    Df(2,0) = t[6]; Df(2,1) = t[7]; Df(2,2) = t[8];
+                        __m128 r0 =  _mm_shuffle_ps(s1,s1,_MM_SHUFFLE(3,0,2,1)); //r0 = Sf(1,1) Sf(1,2) Sf(1,0) ***
+                        __m128 r1 =  _mm_shuffle_ps(s2,s2,_MM_SHUFFLE(3,1,0,2)); //r1 = Sf(2,2) Sf(2,0) Sf(2,1) ***
+                        __m128 r2 =  _mm_shuffle_ps(s2,s2,_MM_SHUFFLE(3,0,2,1)); //r2 = Sf(2,1) Sf(2,2) Sf(2,0) ***
+                        
+                        __m128 t0 = _mm_mul_ps(s0, r0);//t0 = Sf(0,0)*Sf(1,1) Sf(0,1)*Sf(1,2) Sf(0,2)*Sf(1,0) ***
+                        __m128 t1 = _mm_mul_ps(s0, r1);//t1 = Sf(0,0)*Sf(2,2) Sf(0,1)*Sf(2,0) Sf(0,2)*Sf(2,1) ***
+                        __m128 t2 = _mm_mul_ps(s1, r2);//t2 = Sf(1,0)*Sf(2,1) Sf(1,1)*Sf(2,2) Sf(1,2)*Sf(2,0) ***
+                        
+                        __m128 r3 = _mm_shuffle_ps(s0,s0,_MM_SHUFFLE(3,0,2,1));//r3 = Sf(0,1) Sf(0,2) Sf(0,0) ***
+                        __m128 r4 = _mm_shuffle_ps(s0,s0,_MM_SHUFFLE(3,1,0,2));//r4 = Sf(0,2) Sf(0,0) Sf(0,1) ***
+                        
+                        __m128 t00 = _mm_mul_ps(s1, r3);//t00 = Sf(1,0)*Sf(0,1) Sf(1,1)*Sf(0,2) Sf(1,2)*Sf(0,0) ***
+                        __m128 t11 = _mm_mul_ps(s2, r4);//t11 = Sf(2,0)*Sf(0,2) Sf(2,1)*Sf(0,0) Sf(2,2)*Sf(0,1) ***
+                        __m128 t22 = _mm_mul_ps(s2, r0);//t22 = Sf(2,0)*Sf(1,1) Sf(2,1)*Sf(1,2) Sf(2,2)*Sf(1,0) ***
+                        
+                        t0 = _mm_mul_ps(_mm_sub_ps(t0,t00), det);//Sf(0,0)*Sf(1,1)   Sf(0,1)*Sf(1,2)   Sf(0,2)*Sf(1,0) ***
+                                                                //-Sf(1,0)*Sf(0,1)  -Sf(1,1)*Sf(0,2)  -Sf(1,2)*Sf(0,0)
+                        t1 = _mm_mul_ps(_mm_sub_ps(t1,t11), det);//Sf(0,0)*Sf(2,2)   Sf(0,1)*Sf(2,0)   Sf(0,2)*Sf(2,1) ***
+                                                                //-Sf(2,0)*Sf(0,2)  -Sf(2,1)*Sf(0,0)  -Sf(2,2)*Sf(0,1) 
+                        t2 = _mm_mul_ps(_mm_sub_ps(t2,t22), det);//Sf(1,0)*Sf(2,1)   Sf(1,1)*Sf(2,2)   Sf(1,2)*Sf(2,0) ***
+                                                                //-Sf(2,0)*Sf(1,1)  -Sf(2,1)*Sf(1,2)  -Sf(2,2)*Sf(1,0)
+                        _mm_store_ps(t, t0);
+                        _mm_store_ps(t+4, t1);
+                        _mm_store_ps(t+8, t2);
+                        
+                        Df(0,0) = t[9]; Df(0,1) = t[6]; Df(0,2) = t[1];
+                        Df(1,0) = t[10]; Df(1,1) = t[4]; Df(1,2) = t[2];
+                        Df(2,0) = t[8]; Df(2,1) = t[5]; Df(2,2) = t[0];
+                    }
+                    else
+                #endif
+                    {
+                        t[0] = (float)(((double)Sf(1,1) * Sf(2,2) - (double)Sf(1,2) * Sf(2,1)) * d);
+                        t[1] = (float)(((double)Sf(0,2) * Sf(2,1) - (double)Sf(0,1) * Sf(2,2)) * d);
+                        t[2] = (float)(((double)Sf(0,1) * Sf(1,2) - (double)Sf(0,2) * Sf(1,1)) * d);
+                       
+                        t[3] = (float)(((double)Sf(1,2) * Sf(2,0) - (double)Sf(1,0) * Sf(2,2)) * d);
+                        t[4] = (float)(((double)Sf(0,0) * Sf(2,2) - (double)Sf(0,2) * Sf(2,0)) * d);
+                        t[5] = (float)(((double)Sf(0,2) * Sf(1,0) - (double)Sf(0,0) * Sf(1,2)) * d);
+                       
+                        t[6] = (float)(((double)Sf(1,0) * Sf(2,1) - (double)Sf(1,1) * Sf(2,0)) * d);
+                        t[7] = (float)(((double)Sf(0,1) * Sf(2,0) - (double)Sf(0,0) * Sf(2,1)) * d);
+                        t[8] = (float)(((double)Sf(0,0) * Sf(1,1) - (double)Sf(0,1) * Sf(1,0)) * d);
+
+                        Df(0,0) = t[0]; Df(0,1) = t[1]; Df(0,2) = t[2];
+                        Df(1,0) = t[3]; Df(1,1) = t[4]; Df(1,2) = t[5];
+                        Df(2,0) = t[6]; Df(2,1) = t[7]; Df(2,2) = t[8];
+                    }
                }
            }
            else
--- a/modules/features2d/doc/common_interfaces_of_feature_detectors.rst
+++ b/modules/features2d/doc/common_interfaces_of_feature_detectors.rst
@@ -148,7 +148,7 @@ Wrapping class for feature detection using the
    class FastFeatureDetector : public FeatureDetector
    {
    public:
-        FastFeatureDetector( int threshold=1, bool nonmaxSuppression=true );
+        FastFeatureDetector( int threshold=1, bool nonmaxSuppression=true, int type=FastFeatureDetector::TYPE_9_16 );
        virtual void read( const FileNode& fn );
        virtual void write( FileStorage& fs ) const;
    protected:
--- a/modules/features2d/doc/feature_detection_and_description.rst
+++ b/modules/features2d/doc/feature_detection_and_description.rst
@@ -7,7 +7,7 @@ FAST
 --------
 Detects corners using the FAST algorithm

-.. ocv:function:: void FAST( InputArray image, vector<KeyPoint>& keypoints, int threshold, bool nonmaxSupression=true )
+.. ocv:function:: void FAST( InputArray image, vector<KeyPoint>& keypoints, int threshold, bool nonmaxSupression=true, int type=FastFeatureDetector::TYPE_9_16 )

    :param image: Image where keypoints (corners) are detected.

@@ -17,6 +17,8 @@ Detects corners using the FAST algorithm

    :param nonmaxSupression: If it is true, non-maximum suppression is applied to detected corners (keypoints).

+    :param type: one of the three neighborhoods as defined in the paper: ``FastFeatureDetector::TYPE_9_16``, ``FastFeatureDetector::TYPE_7_12``, ``FastFeatureDetector::TYPE_5_8``
+
 Detects corners using the FAST algorithm by [Rosten06]_.

 .. [Rosten06] E. Rosten. Machine Learning for High-speed Corner Detection, 2006.
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -9,16 +9,16 @@ Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

-	*Redistributions of source code must retain the above copyright
-	 notice, this list of conditions and the following disclaimer.
+    *Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.

-	*Redistributions in binary form must reproduce the above copyright
-	 notice, this list of conditions and the following disclaimer in the
-	 documentation and/or other materials provided with the distribution.
+    *Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.

-	*Neither the name of the University of Cambridge nor the names of
-	 its contributors may be used to endorse or promote products derived
-	 from this software without specific prior written permission.
+    *Neither the name of the University of Cambridge nor the names of
+     its contributors may be used to endorse or promote products derived
+     from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -350,7 +350,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
    }

    int b0 = -a0;
-    for( k = 0; k < 12; k += 2 )
+    for( k = 0; k < 8; k += 2 )
    {
        int b = std::max((int)d[k+1], (int)d[k+2]);
        b = std::max(b, (int)d[k+3]);
@@ -375,7 +375,10 @@ template<int patternSize>
 void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
 {
    Mat img = _img.getMat();
-    const int K = patternSize/2, N = patternSize + K + 1, quarterPatternSize = patternSize/4;
+    const int K = patternSize/2, N = patternSize + K + 1;
+#if CV_SSE2
+    const int quarterPatternSize = patternSize/4;
+#endif
    int i, j, k, pixel[25];
    makeOffsets(pixel, (int)img.step, patternSize);
    for(k = patternSize; k < 25; k++)
@@ -585,7 +588,7 @@ FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppressio
 FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppression, int _type )
 : threshold(_threshold), nonmaxSuppression(_nonmaxSuppression), type(_type)
 {}
-    
+
 void FastFeatureDetector::detectImpl( const Mat& image, vector<KeyPoint>& keypoints, const Mat& mask ) const
 {
    Mat grayImage = image;
--- a/modules/gpu/doc/introduction.rst
+++ b/modules/gpu/doc/introduction.rst
@@ -42,7 +42,7 @@ You can always determine at runtime whether the OpenCV GPU-built binaries (or PT
 Utilizing Multiple GPUs
 -----------------------

-In the current version, each of the OpenCV GPU algorithms can use only a single GPU. So, to utilize multiple GPUs, you have to manually distribute the work between GPUs. 
+In the current version, each of the OpenCV GPU algorithms can use only a single GPU. So, to utilize multiple GPUs, you have to manually distribute the work between GPUs.
 Switching the active device can be done using the :ocv:func:`gpu::setDevice()` function. For more details, please read the CUDA C Programming Guide.

 While developing algorithms for multiple GPUs, note a data passing overhead. For primitive functions and small images, it can be significant, which may eliminate all the advantages of having multiple GPUs. But for high-level algorithms, consider using multi-GPU acceleration. For example, the Stereo Block Matching algorithm has been successfully parallelized using the following algorithm:
@@ -59,5 +59,5 @@ While developing algorithms for multiple GPUs, note a data passing overhead. For
 With this algorithm, a dual GPU gave a 180
 %
 performance increase comparing to the single Fermi GPU. For a source code example, see
-http://code.opencv.org/svn/opencv/trunk/opencv/samples/gpu/.
+http://code.opencv.org/projects/opencv/repository/revisions/master/entry/samples/gpu/.

--- a/modules/gpu/doc/video.rst
+++ b/modules/gpu/doc/video.rst
@@ -324,9 +324,9 @@ Class used for background/foreground segmentation. ::
        std::vector< std::vector<cv::Point> > foreground_regions;
    };

-The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [FGD2003]_.
+  The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [FGD2003]_.

-The results are available through the class fields:
+  The results are available through the class fields:

    .. ocv:member:: cv::gpu::GpuMat background

@@ -489,9 +489,9 @@ Gaussian Mixture-based Background/Foreground Segmentation Algorithm. ::
        ...
    };

-The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [MOG2004]_.
+  The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [MOG2004]_.

-Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
+  Here are important members of the class that control the algorithm, which you can set after constructing the class instance:

    .. ocv:member:: float backgroundRatio

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -917,6 +917,12 @@ CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTra
                         GpuMat& labels,
                         GpuMat& buf, Stream& stream = Stream::Null());

+//! compute mask for Generalized Flood fill components labeling.
+CV_EXPORTS void connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& stream = Stream::Null());
+
+//! performs connected components labeling.
+CV_EXPORTS void labelComponents(const GpuMat& mask, GpuMat& components, Stream& stream = Stream::Null());
+
 ////////////////////////////////// Histograms //////////////////////////////////

 //! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -1148,6 +1148,9 @@ GPU_PERF_TEST(CvtColor, cv::gpu::DeviceInfo, cv::Size, MatDepth, CvtColorInfo)
    cv::gpu::GpuMat src(src_host);
    cv::gpu::GpuMat dst;

+    if (info.code >= cv::COLOR_BayerBG2BGR && info.code <= cv::COLOR_BayerGR2BGR)
+        info.dcn = 4;
+
    cv::gpu::cvtColor(src, dst, info.code, info.dcn);

    TEST_CYCLE()
@@ -1172,7 +1175,20 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
                    CvtColorInfo(3, 3, cv::COLOR_BGR2HSV),
                    CvtColorInfo(3, 3, cv::COLOR_HSV2BGR),
                    CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
-                    CvtColorInfo(3, 3, cv::COLOR_HLS2BGR))));
+                    CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
+                    CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
+                    CvtColorInfo(3, 3, cv::COLOR_RGB2Lab),
+                    CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
+                    CvtColorInfo(3, 3, cv::COLOR_RGB2Luv),
+                    CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
+                    CvtColorInfo(3, 3, cv::COLOR_Lab2RGB),
+                    CvtColorInfo(3, 3, cv::COLOR_Luv2BGR),
+                    CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
+                    CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
+                    CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
+                    CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
+                    CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),
+                    CvtColorInfo(4, 4, cv::COLOR_RGBA2mRGBA))));

 //////////////////////////////////////////////////////////////////////
 // SwapChannels
--- a/modules/gpu/perf/perf_utility.cpp
+++ b/modules/gpu/perf/perf_utility.cpp
@@ -65,19 +65,19 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
        "BGR2HSV",
        "RGB2HSV",

-        0,
-        0,
+        "",
+        "",

-        0,
-        0,
+        "BGR2Lab",
+        "RGB2Lab",

-        0,
-        0,
-        0,
-        0,
+        "BayerBG2BGR",
+        "BayerGB2BGR",
+        "BayerRG2BGR",
+        "BayerGR2BGR",

-        0,
-        0,
+        "BGR2Luv",
+        "RGB2Luv",

        "BGR2HLS",
        "RGB2HLS",
@@ -85,18 +85,18 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
        "HSV2BGR",
        "HSV2RGB",

-        0,
-        0,
-        0,
-        0,
+        "Lab2BGR",
+        "Lab2RGB",
+        "Luv2BGR",
+        "Luv2RGB",

        "HLS2BGR",
        "HLS2RGB",

-        0,
-        0,
-        0,
-        0,
+        "BayerBG2BGR_VNG",
+        "BayerGB2BGR_VNG",
+        "BayerRG2BGR_VNG",
+        "BayerGR2BGR_VNG",

        "BGR2HSV_FULL",
        "RGB2HSV_FULL",
@@ -108,30 +108,78 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
        "HLS2BGR_FULL",
        "HLS2RGB_FULL",

-        0,
-        0,
-        0,
-        0,
+        "LBGR2Lab",
+        "LRGB2Lab",
+        "LBGR2Luv",
+        "LRGB2Luv",

-        0,
-        0,
-        0,
-        0,
+        "Lab2LBGR",
+        "Lab2LRGB",
+        "Luv2LBGR",
+        "Luv2LRGB",

        "BGR2YUV",
        "RGB2YUV",
        "YUV2BGR",
        "YUV2RGB",

-        0,
-        0,
-        0,
-        0,
+        "BayerBG2GRAY",
+        "BayerGB2GRAY",
+        "BayerRG2GRAY",
+        "BayerGR2GRAY",

-        0,
-        0,
-        0,
-        0
+        //YUV 4:2:0 formats family
+        "YUV2RGB_NV12",
+        "YUV2BGR_NV12",
+        "YUV2RGB_NV21",
+        "YUV2BGR_NV21",
+
+        "YUV2RGBA_NV12",
+        "YUV2BGRA_NV12",
+        "YUV2RGBA_NV21",
+        "YUV2BGRA_NV21",
+
+        "YUV2RGB_YV12",
+        "YUV2BGR_YV12",
+        "YUV2RGB_IYUV",
+        "YUV2BGR_IYUV",
+
+        "YUV2RGBA_YV12",
+        "YUV2BGRA_YV12",
+        "YUV2RGBA_IYUV",
+        "YUV2BGRA_IYUV",
+
+        "YUV2GRAY_420",
+
+        //YUV 4:2:2 formats family
+        "YUV2RGB_UYVY",
+        "YUV2BGR_UYVY",
+        "YUV2RGB_VYUY",
+        "YUV2BGR_VYUY",
+
+        "YUV2RGBA_UYVY",
+        "YUV2BGRA_UYVY",
+        "YUV2RGBA_VYUY",
+        "YUV2BGRA_VYUY",
+
+        "YUV2RGB_YUY2",
+        "YUV2BGR_YUY2",
+        "YUV2RGB_YVYU",
+        "YUV2BGR_YVYU",
+
+        "YUV2RGBA_YUY2",
+        "YUV2BGRA_YUY2",
+        "YUV2RGBA_YVYU",
+        "YUV2BGRA_YVYU",
+
+        "YUV2GRAY_UYVY",
+        "YUV2GRAY_YUY2",
+
+        // alpha premultiplication
+        "RGBA2mRGBA",
+        "mRGBA2RGBA",
+
+        "COLORCVT_MAX"
    };

    *os << str[info.code];
--- a/modules/gpu/perf_cpu/perf_imgproc.cpp
+++ b/modules/gpu/perf_cpu/perf_imgproc.cpp
@@ -712,6 +712,19 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
                    CvtColorInfo(3, 3, cv::COLOR_BGR2HSV),
                    CvtColorInfo(3, 3, cv::COLOR_HSV2BGR),
                    CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
-                    CvtColorInfo(3, 3, cv::COLOR_HLS2BGR))));
+                    CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
+                    CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
+                    CvtColorInfo(3, 3, cv::COLOR_RGB2Lab),
+                    CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
+                    CvtColorInfo(3, 3, cv::COLOR_RGB2Luv),
+                    CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
+                    CvtColorInfo(3, 3, cv::COLOR_Lab2RGB),
+                    CvtColorInfo(3, 3, cv::COLOR_Luv2BGR),
+                    CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
+                    CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
+                    CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
+                    CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
+                    CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),
+                    CvtColorInfo(4, 4, cv::COLOR_RGBA2mRGBA))));

 #endif
--- a/modules/gpu/perf_cpu/perf_utility.cpp
+++ b/modules/gpu/perf_cpu/perf_utility.cpp
@@ -65,19 +65,19 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
        "BGR2HSV",
        "RGB2HSV",

-        0,
-        0,
+        "",
+        "",

-        0,
-        0,
+        "BGR2Lab",
+        "RGB2Lab",

-        0,
-        0,
-        0,
-        0,
+        "BayerBG2BGR",
+        "BayerGB2BGR",
+        "BayerRG2BGR",
+        "BayerGR2BGR",

-        0,
-        0,
+        "BGR2Luv",
+        "RGB2Luv",

        "BGR2HLS",
        "RGB2HLS",
@@ -85,18 +85,18 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
        "HSV2BGR",
        "HSV2RGB",

-        0,
-        0,
-        0,
-        0,
+        "Lab2BGR",
+        "Lab2RGB",
+        "Luv2BGR",
+        "Luv2RGB",

        "HLS2BGR",
        "HLS2RGB",

-        0,
-        0,
-        0,
-        0,
+        "BayerBG2BGR_VNG",
+        "BayerGB2BGR_VNG",
+        "BayerRG2BGR_VNG",
+        "BayerGR2BGR_VNG",

        "BGR2HSV_FULL",
        "RGB2HSV_FULL",
@@ -108,30 +108,78 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
        "HLS2BGR_FULL",
        "HLS2RGB_FULL",

-        0,
-        0,
-        0,
-        0,
+        "LBGR2Lab",
+        "LRGB2Lab",
+        "LBGR2Luv",
+        "LRGB2Luv",

-        0,
-        0,
-        0,
-        0,
+        "Lab2LBGR",
+        "Lab2LRGB",
+        "Luv2LBGR",
+        "Luv2LRGB",

        "BGR2YUV",
        "RGB2YUV",
        "YUV2BGR",
        "YUV2RGB",

-        0,
-        0,
-        0,
-        0,
+        "BayerBG2GRAY",
+        "BayerGB2GRAY",
+        "BayerRG2GRAY",
+        "BayerGR2GRAY",

-        0,
-        0,
-        0,
-        0
+        //YUV 4:2:0 formats family
+        "YUV2RGB_NV12",
+        "YUV2BGR_NV12",
+        "YUV2RGB_NV21",
+        "YUV2BGR_NV21",
+
+        "YUV2RGBA_NV12",
+        "YUV2BGRA_NV12",
+        "YUV2RGBA_NV21",
+        "YUV2BGRA_NV21",
+
+        "YUV2RGB_YV12",
+        "YUV2BGR_YV12",
+        "YUV2RGB_IYUV",
+        "YUV2BGR_IYUV",
+
+        "YUV2RGBA_YV12",
+        "YUV2BGRA_YV12",
+        "YUV2RGBA_IYUV",
+        "YUV2BGRA_IYUV",
+
+        "YUV2GRAY_420",
+
+        //YUV 4:2:2 formats family
+        "YUV2RGB_UYVY",
+        "YUV2BGR_UYVY",
+        "YUV2RGB_VYUY",
+        "YUV2BGR_VYUY",
+
+        "YUV2RGBA_UYVY",
+        "YUV2BGRA_UYVY",
+        "YUV2RGBA_VYUY",
+        "YUV2BGRA_VYUY",
+
+        "YUV2RGB_YUY2",
+        "YUV2BGR_YUY2",
+        "YUV2RGB_YVYU",
+        "YUV2BGR_YVYU",
+
+        "YUV2RGBA_YUY2",
+        "YUV2BGRA_YUY2",
+        "YUV2RGBA_YVYU",
+        "YUV2BGRA_YVYU",
+
+        "YUV2GRAY_UYVY",
+        "YUV2GRAY_YUY2",
+
+        // alpha premultiplication
+        "RGBA2mRGBA",
+        "mRGBA2RGBA",
+
+        "COLORCVT_MAX"
    };

    *os << str[info.code];
--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
@@ -54,6 +54,17 @@ void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nog
 #else /* !defined (HAVE_CUDA) */

 #include <cvt_colot_internal.h>
+
+namespace cv { namespace gpu {
+    namespace device
+    {
+        template <int cn>
+        void Bayer2BGR_8u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template <int cn>
+        void Bayer2BGR_16u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+    }
+}}
+
 using namespace ::cv::gpu::device;

 namespace
@@ -1144,13 +1155,13 @@ namespace
        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
    }

-    void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    {
        #if (CUDA_VERSION < 5000)
            (void)src;
            (void)dst;
            (void)dcn;
-            (void)stream;
+            (void)st;
            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
        #else
            CV_Assert(src.depth() == CV_8U);
@@ -1160,13 +1171,17 @@ namespace

            dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));

-            NppStreamHandler h(StreamAccessor::getStream(stream));
+            cudaStream_t stream = StreamAccessor::getStream(st);
+            NppStreamHandler h(stream);

            NppiSize oSizeROI;
            oSizeROI.width = src.cols;
            oSizeROI.height = src.rows;

            nppSafeCall( nppiBGRToLab_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
        #endif
    }

@@ -1176,13 +1191,13 @@ namespace
        bgr_to_lab(dst, dst, -1, stream);
    }

-    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    {
        #if (CUDA_VERSION < 5000)
            (void)src;
            (void)dst;
            (void)dcn;
-            (void)stream;
+            (void)st;
            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
        #else
            CV_Assert(src.depth() == CV_8U);
@@ -1192,13 +1207,17 @@ namespace

            dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));

-            NppStreamHandler h(StreamAccessor::getStream(stream));
+            cudaStream_t stream = StreamAccessor::getStream(st);
+            NppStreamHandler h(stream);

            NppiSize oSizeROI;
            oSizeROI.width = src.cols;
            oSizeROI.height = src.rows;

            nppSafeCall( nppiLabToBGR_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
        #endif
    }

@@ -1208,13 +1227,13 @@ namespace
        bgr_to_rgb(dst, dst, -1, stream);
    }

-    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    {
        #if (CUDA_VERSION < 5000)
            (void)src;
            (void)dst;
            (void)dcn;
-            (void)stream;
+            (void)st;
            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
        #else
            CV_Assert(src.depth() == CV_8U);
@@ -1224,7 +1243,8 @@ namespace

            dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));

-            NppStreamHandler h(StreamAccessor::getStream(stream));
+            cudaStream_t stream = StreamAccessor::getStream(st);
+            NppStreamHandler h(stream);

            NppiSize oSizeROI;
            oSizeROI.width = src.cols;
@@ -1234,6 +1254,9 @@ namespace
                nppSafeCall( nppiRGBToLUV_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
            else
                nppSafeCall( nppiRGBToLUV_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
        #endif
    }

@@ -1243,13 +1266,13 @@ namespace
        rgb_to_luv(dst, dst, -1, stream);
    }

-    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& st)
    {
        #if (CUDA_VERSION < 5000)
            (void)src;
            (void)dst;
            (void)dcn;
-            (void)stream;
+            (void)st;
            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
        #else
            CV_Assert(src.depth() == CV_8U);
@@ -1259,7 +1282,8 @@ namespace

            dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));

-            NppStreamHandler h(StreamAccessor::getStream(stream));
+            cudaStream_t stream = StreamAccessor::getStream(st);
+            NppStreamHandler h(stream);

            NppiSize oSizeROI;
            oSizeROI.width = src.cols;
@@ -1269,6 +1293,9 @@ namespace
                nppSafeCall( nppiLUVToRGB_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
            else
                nppSafeCall( nppiLUVToRGB_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
        #endif
    }

@@ -1278,19 +1305,20 @@ namespace
        bgr_to_rgb(dst, dst, -1, stream);
    }

-    void rgba_to_mbgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    void rgba_to_mbgra(const GpuMat& src, GpuMat& dst, int, Stream& st)
    {
    #if (CUDA_VERSION < 5000)
        (void)src;
        (void)dst;
-        (void)stream;
+        (void)st;
        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
    #else
        CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4);

        dst.create(src.size(), src.type());

-        NppStreamHandler h(StreamAccessor::getStream(stream));
+        cudaStream_t stream = StreamAccessor::getStream(st);
+        NppStreamHandler h(stream);

        NppiSize oSizeROI;
        oSizeROI.width = src.cols;
@@ -1300,8 +1328,52 @@ namespace
            nppSafeCall( nppiAlphaPremul_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
        else
            nppSafeCall( nppiAlphaPremul_16u_AC4R(src.ptr<Npp16u>(), static_cast<int>(src.step), dst.ptr<Npp16u>(), static_cast<int>(dst.step), oSizeROI) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
    #endif
    }
+
+    // Converts a single-channel 8-bit or 16-bit Bayer image into a 3- or 4-channel image.
+    //   src              - source image; must be CV_8UC1 or CV_16UC1 and larger than 2x2.
+    //   dst              - destination; (re)allocated with the source size and depth and dcn channels.
+    //   dcn              - destination channel count (3 or 4); a non-positive value selects 3.
+    //   blue_last,
+    //   start_with_green - forwarded unchanged to the device routine to describe the Bayer layout.
+    //   stream           - stream whose CUDA handle is passed to the device routine.
+    void bayer_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, bool blue_last, bool start_with_green, Stream& stream)
+    {
+        typedef void (*kernel_t)(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+
+        // Dispatch table indexed by [source depth][destination channels - 1].
+        // Only the 8-bit and 16-bit rows with 3 or 4 channels are populated;
+        // the assertions below keep the lookup away from the empty slots.
+        static const kernel_t kernels[3][4] =
+        {
+            {0, 0, Bayer2BGR_8u_gpu<3>, Bayer2BGR_8u_gpu<4>},
+            {0, 0, 0, 0},
+            {0, 0, Bayer2BGR_16u_gpu<3>, Bayer2BGR_16u_gpu<4>}
+        };
+
+        if (dcn <= 0)
+            dcn = 3;
+
+        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1);
+        CV_Assert(src.rows > 2 && src.cols > 2);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+
+        const kernel_t kernel = kernels[src.depth()][dcn - 1];
+        kernel(src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
+    }
+
+    // Handler for CV_BayerBG2BGR: calls bayer_to_bgr with blue_last = false, start_with_green = false.
+    void bayerBG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        bayer_to_bgr(src, dst, dcn, false, false, stream);
+    }
+
+    // Handler for CV_BayerGB2BGR: calls bayer_to_bgr with blue_last = false, start_with_green = true.
+    void bayerGB_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        bayer_to_bgr(src, dst, dcn, false, true, stream);
+    }
+
+    // Handler for CV_BayerRG2BGR: calls bayer_to_bgr with blue_last = true, start_with_green = false.
+    void bayerRG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        bayer_to_bgr(src, dst, dcn, true, false, stream);
+    }
+
+    // Handler for CV_BayerGR2BGR: calls bayer_to_bgr with blue_last = true, start_with_green = true.
+    void bayerGR_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        bayer_to_bgr(src, dst, dcn, true, true, stream);
+    }
 }

 void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
@@ -1366,10 +1438,10 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
        bgr_to_lab,             // CV_BGR2Lab     =44
        rgb_to_lab,             // CV_RGB2Lab     =45

-        0,                      // CV_BayerBG2BGR =46
-        0,                      // CV_BayerGB2BGR =47
-        0,                      // CV_BayerRG2BGR =48
-        0,                      // CV_BayerGR2BGR =49
+        bayerBG_to_bgr,         // CV_BayerBG2BGR =46
+        bayerGB_to_bgr,         // CV_BayerGB2BGR =47
+        bayerRG_to_bgr,         // CV_BayerRG2BGR =48
+        bayerGR_to_bgr,         // CV_BayerGR2BGR =49

        bgr_to_luv,             // CV_BGR2Luv     =50
        rgb_to_luv,             // CV_RGB2Luv     =51
@@ -1424,57 +1496,57 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
        0,                      // CV_BayerGR2GRAY = 89

        //YUV 4:2:0 formats family
-        0,                      // COLOR_YUV2RGB_NV12 = 90,
-        0,                      // COLOR_YUV2BGR_NV12 = 91,
-        0,                      // COLOR_YUV2RGB_NV21 = 92,
-        0,                      // COLOR_YUV2BGR_NV21 = 93,
+        0,                      // CV_YUV2RGB_NV12 = 90,
+        0,                      // CV_YUV2BGR_NV12 = 91,
+        0,                      // CV_YUV2RGB_NV21 = 92,
+        0,                      // CV_YUV2BGR_NV21 = 93,

-        0,                      // COLOR_YUV2RGBA_NV12 = 94,
-        0,                      // COLOR_YUV2BGRA_NV12 = 95,
-        0,                      // COLOR_YUV2RGBA_NV21 = 96,
-        0,                      // COLOR_YUV2BGRA_NV21 = 97,
+        0,                      // CV_YUV2RGBA_NV12 = 94,
+        0,                      // CV_YUV2BGRA_NV12 = 95,
+        0,                      // CV_YUV2RGBA_NV21 = 96,
+        0,                      // CV_YUV2BGRA_NV21 = 97,

-        0,                      // COLOR_YUV2RGB_YV12 = 98,
-        0,                      // COLOR_YUV2BGR_YV12 = 99,
-        0,                      // COLOR_YUV2RGB_IYUV = 100,
-        0,                      // COLOR_YUV2BGR_IYUV = 101,
+        0,                      // CV_YUV2RGB_YV12 = 98,
+        0,                      // CV_YUV2BGR_YV12 = 99,
+        0,                      // CV_YUV2RGB_IYUV = 100,
+        0,                      // CV_YUV2BGR_IYUV = 101,

-        0,                      // COLOR_YUV2RGBA_YV12 = 102,
-        0,                      // COLOR_YUV2BGRA_YV12 = 103,
-        0,                      // COLOR_YUV2RGBA_IYUV = 104,
-        0,                      // COLOR_YUV2BGRA_IYUV = 105,
+        0,                      // CV_YUV2RGBA_YV12 = 102,
+        0,                      // CV_YUV2BGRA_YV12 = 103,
+        0,                      // CV_YUV2RGBA_IYUV = 104,
+        0,                      // CV_YUV2BGRA_IYUV = 105,

-        0,                      // COLOR_YUV2GRAY_420 = 106,
+        0,                      // CV_YUV2GRAY_420 = 106,

        //YUV 4:2:2 formats family
-        0,                      // COLOR_YUV2RGB_UYVY = 107,
-        0,                      // COLOR_YUV2BGR_UYVY = 108,
-        0,                      // //COLOR_YUV2RGB_VYUY = 109,
-        0,                      // //COLOR_YUV2BGR_VYUY = 110,
+        0,                      // CV_YUV2RGB_UYVY = 107,
+        0,                      // CV_YUV2BGR_UYVY = 108,
+        0,                      // //CV_YUV2RGB_VYUY = 109,
+        0,                      // //CV_YUV2BGR_VYUY = 110,

-        0,                      // COLOR_YUV2RGBA_UYVY = 111,
-        0,                      // COLOR_YUV2BGRA_UYVY = 112,
-        0,                      // //COLOR_YUV2RGBA_VYUY = 113,
-        0,                      // //COLOR_YUV2BGRA_VYUY = 114,
+        0,                      // CV_YUV2RGBA_UYVY = 111,
+        0,                      // CV_YUV2BGRA_UYVY = 112,
+        0,                      // //CV_YUV2RGBA_VYUY = 113,
+        0,                      // //CV_YUV2BGRA_VYUY = 114,

-        0,                      // COLOR_YUV2RGB_YUY2 = 115,
-        0,                      // COLOR_YUV2BGR_YUY2 = 116,
-        0,                      // COLOR_YUV2RGB_YVYU = 117,
-        0,                      // COLOR_YUV2BGR_YVYU = 118,
+        0,                      // CV_YUV2RGB_YUY2 = 115,
+        0,                      // CV_YUV2BGR_YUY2 = 116,
+        0,                      // CV_YUV2RGB_YVYU = 117,
+        0,                      // CV_YUV2BGR_YVYU = 118,

-        0,                      // COLOR_YUV2RGBA_YUY2 = 119,
-        0,                      // COLOR_YUV2BGRA_YUY2 = 120,
-        0,                      // COLOR_YUV2RGBA_YVYU = 121,
-        0,                      // COLOR_YUV2BGRA_YVYU = 122,
+        0,                      // CV_YUV2RGBA_YUY2 = 119,
+        0,                      // CV_YUV2BGRA_YUY2 = 120,
+        0,                      // CV_YUV2RGBA_YVYU = 121,
+        0,                      // CV_YUV2BGRA_YVYU = 122,

-        0,                      // COLOR_YUV2GRAY_UYVY = 123,
-        0,                      // COLOR_YUV2GRAY_YUY2 = 124,
+        0,                      // CV_YUV2GRAY_UYVY = 123,
+        0,                      // CV_YUV2GRAY_YUY2 = 124,

        // alpha premultiplication
-        rgba_to_mbgra,          // COLOR_RGBA2mRGBA = 125,
-        0,                      // COLOR_mRGBA2RGBA = 126,
+        rgba_to_mbgra,          // CV_RGBA2mRGBA = 125,
+        0,                      // CV_mRGBA2RGBA = 126,

-        0,                      // COLOR_COLORCVT_MAX  = 127
+        0,                      // CV_COLORCVT_MAX  = 127
    };

    CV_Assert(code < 128);
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@@ -0,0 +1,522 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+//                          License Agreement
+//               For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+//  * The name of the copyright holders may not be used to endorse or promote products
+//    derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//M*/
+
+#include <opencv2/gpu/device/common.hpp>
+#include <opencv2/gpu/device/vec_traits.hpp>
+#include <opencv2/gpu/device/vec_math.hpp>
+#include <iostream>
+#include <stdio.h>
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace ccl
+    {
+        // Compile-time launch and tiling constants shared by the kernels in this namespace.
+        enum
+        {
+            // Warp width and its base-2 logarithm (2^5 == 32).
+            WARP_SIZE  = 32,
+            WARP_LOG   = 5,
+
+            // Thread-block dimensions used to launch computeConnectivity, lableTiles and flatten.
+            CTA_SIZE_X = 32,
+            CTA_SIZE_Y = 8,
+
+            // Thread-block dimensions used to launch crossMerge.
+            STA_SIZE_MARGE_Y = 4,
+            STA_SIZE_MARGE_X = 32,
+
+            // Number of pixels each thread of lableTiles handles along x and y.
+            TPB_X = 1,
+            TPB_Y = 4,
+
+            // Size in pixels of the tile labelled by one thread block of lableTiles.
+            TILE_COLS = CTA_SIZE_X * TPB_X,
+            TILE_ROWS = CTA_SIZE_Y * TPB_Y
+        };
+
+        // Maps a pixel type to the type used for pixel differences (dist_type) and to its
+        // channel count (ch). computeEdges reads both members to select an InInterval predicate.
+        // NOTE(review): the primary template only defines elem_type, so a pixel type without a
+        // specialization below provides neither dist_type nor ch.
+        template<typename T> struct IntervalsTraits
+        {
+            typedef T elem_type;
+        };
+
+        // 8-bit, one channel: differences are held in int.
+        template<> struct IntervalsTraits<unsigned char>
+        {
+            typedef int dist_type;
+            enum {ch = 1};
+        };
+
+        // 8-bit, three channels: differences are held in int3.
+        template<> struct IntervalsTraits<uchar3>
+        {
+            typedef int3 dist_type;
+            enum {ch = 3};
+        };
+
+        // 8-bit, four channels: differences are held in int4.
+        template<> struct IntervalsTraits<uchar4>
+        {
+            typedef int4 dist_type;
+            enum {ch = 4};
+        };
+
+        // 16-bit, one channel: differences are held in int.
+        template<> struct IntervalsTraits<unsigned short>
+        {
+            typedef int dist_type;
+            enum {ch = 1};
+        };
+
+        // 16-bit, three channels: differences are held in int3.
+        template<> struct IntervalsTraits<ushort3>
+        {
+            typedef int3 dist_type;
+            enum {ch = 3};
+        };
+
+        // 16-bit, four channels: differences are held in int4.
+        template<> struct IntervalsTraits<ushort4>
+        {
+            typedef int4 dist_type;
+            enum {ch = 4};
+        };
+
+        // float, one channel: differences stay in float.
+        template<> struct IntervalsTraits<float>
+        {
+            typedef float dist_type;
+            enum {ch = 1};
+        };
+
+        // int, one channel: differences stay in int.
+        template<> struct IntervalsTraits<int>
+        {
+            typedef int dist_type;
+            enum {ch = 1};
+        };
+
+        // Per-pixel connectivity mask: a bitwise OR of the Edges flags below.
+        typedef unsigned char component;
+        // One bit per neighbour direction a pixel is connected to.
+        // NOTE(review): EMPTY is not referenced by the code in this file section; its intended use should be confirmed.
+        enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
+
+        // Connectivity predicate selected by channel count CH. The primary template is empty;
+        // the behaviour comes from the per-channel-count specializations.
+        template<typename T, int CH> struct InInterval {};
+
+        // Single-channel predicate. Only the x components of the float4 bounds are used:
+        // the lower bound is stored negated (lo = -_lo.x) and the upper bound as given (hi = _hi.x).
+        template<typename T> struct InInterval<T, 1>
+        {
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo(-_lo.x), hi(_hi.x) {};
+            T lo, hi;
+
+            // Returns true when the difference a - b lies in the closed interval [lo, hi].
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
+            {
+                I d = a - b;
+                return lo <= d && d <= hi;
+            }
+        };
+
+        // Three-channel predicate. The x, y and z components of the float4 bounds are used:
+        // lower bounds are stored negated, upper bounds as given.
+        template<typename T> struct InInterval<T, 3>
+        {
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
+            : lo (VecTraits<T>::make(-_lo.x, -_lo.y, -_lo.z)), hi (VecTraits<T>::make(_hi.x, _hi.y, _hi.z)){};
+            T lo, hi;
+
+            // Returns true when every component of a - b lies in the closed interval
+            // formed by the matching components of lo and hi.
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
+            {
+                I d = a - b;
+                return lo.x <= d.x && d.x <= hi.x &&
+                       lo.y <= d.y && d.y <= hi.y &&
+                       lo.z <= d.z && d.z <= hi.z;
+            }
+        };
+
+        // Four-channel connectivity predicate. All four components of the float4 bounds are used:
+        // lower bounds are stored negated, upper bounds as given, so each channel accepts
+        // differences in [-_lo.c, _hi.c].
+        template<typename T> struct InInterval<T, 4>
+        {
+            // The w upper bound must be taken as-is (not negated), like x, y and z; negating it
+            // would collapse the accepted range of the fourth channel.
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
+            : lo (VecTraits<T>::make(-_lo.x, -_lo.y, -_lo.z, -_lo.w)), hi (VecTraits<T>::make(_hi.x, _hi.y, _hi.z, _hi.w)){};
+            T lo, hi;
+
+            // Returns true when every component of a - b lies in the closed interval
+            // formed by the matching components of lo and hi.
+            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
+            {
+                I d = a - b;
+                return lo.x <= d.x && d.x <= hi.x &&
+                       lo.y <= d.y && d.y <= hi.y &&
+                       lo.z <= d.z && d.z <= hi.z &&
+                       lo.w <= d.w && d.w <= hi.w;
+            }
+        };
+
+
+        // Builds the per-pixel connectivity mask. One thread handles one pixel (x, y): it compares
+        // the pixel with each of its four neighbours using the `connected` predicate and stores
+        // the OR of the matching LEFT/UP/RIGHT/DOWN flags in components(y, x).
+        //   image      - source image with pixel type T.
+        //   components - output mask image, written at the same coordinates as the pixel.
+        //   connected  - binary predicate called as connected(pixel, neighbour).
+        template<typename T, typename F>
+        __global__ void computeConnectivity(const DevMem2D_<T> image, DevMem2D components, F connected)
+        {
+            int x = threadIdx.x + blockIdx.x * blockDim.x;
+            int y = threadIdx.y + blockIdx.y * blockDim.y;
+
+            // Threads of partially filled blocks that fall outside the image do nothing.
+            if (x >= image.cols || y >= image.rows) return;
+
+            T intensity = image(y, x);
+            component c = 0;
+
+            if ( x > 0 && connected(intensity, image(y, x - 1)))
+                c |= LEFT;
+
+            if ( y > 0 && connected(intensity, image(y - 1, x)))
+                c |= UP;
+
+            // The right/bottom neighbour exists only when x + 1 / y + 1 is still inside the image;
+            // the guard prevents reading past the last column / row.
+            if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
+                c |= RIGHT;
+
+            if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
+                c |= DOWN;
+
+            components(y, x) = c;
+        }
+
+        // Host-side launcher for computeConnectivity.
+        //   image  - source image, reinterpreted as an image of pixel type T.
+        //   edges  - output connectivity mask.
+        //   lo, hi - tolerances handed to the InInterval predicate chosen for T.
+        //   stream - CUDA stream for the launch; the call blocks until completion when it is 0.
+        template< typename T>
+        void computeEdges(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream)
+        {
+            typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Predicate;
+
+            // One thread per pixel, with enough blocks to cover the whole image.
+            dim3 threads(CTA_SIZE_X, CTA_SIZE_Y);
+            dim3 blocks(divUp(image.cols, threads.x), divUp(image.rows, threads.y));
+
+            Predicate inRange(lo, hi);
+            computeConnectivity<T, Predicate><<<blocks, threads, 0, stream>>>(static_cast<const DevMem2D_<T> >(image), edges, inRange);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Explicit instantiations of computeEdges, one per pixel type that has an IntervalsTraits specialization.
+        template void computeEdges<uchar>  (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar3> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<uchar4> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort3>(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<ushort4>(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<int>    (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+        template void computeEdges<float>  (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+
+        // Labels connected components inside each TILE_ROWS x TILE_COLS tile independently.
+        // One thread block handles one tile; each thread handles TPB_Y x TPB_X pixels of it.
+        // Labels are propagated through shared memory until no label changes, then converted
+        // to image-wide linear indices and written to comps.
+        //   edges - per-pixel connectivity masks (Edges flags).
+        //   comps - output label image.
+        __global__ void lableTiles(const DevMem2D edges, DevMem2Di comps)
+        {
+            int x = threadIdx.x + blockIdx.x * TILE_COLS;
+            int y = threadIdx.y + blockIdx.y * TILE_ROWS;
+
+            // NOTE(review): threads that leave here never reach the __syncthreads() /
+            // __syncthreads_or() barriers below, while the remaining threads of the same block do.
+            // Confirm this is safe for tiles that are only partially inside the image.
+            if (x >= edges.cols || y >= edges.rows) return;
+
+            //currently x is 1
+            // (presumably refers to TPB_X == 1, i.e. only the y direction is guarded - confirm)
+            // bounds is 0 when y + TPB_Y reaches past the last row; it then forces the row index
+            // of the edges read below to 0 instead of y + CTA_SIZE_Y * i.
+            // NOTE(review): the rows actually read are y + CTA_SIZE_Y * i, which can exceed
+            // y + TPB_Y; verify that this test covers every row accessed.
+            int bounds = ((y + TPB_Y) < edges.rows);
+
+            // Per-tile label and connectivity storage shared by the whole block.
+            __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
+            __shared__ int  edgesTile[TILE_ROWS][TILE_COLS];
+
+            // Per-thread current and previous labels of the pixels this thread owns.
+            int new_labels[TPB_Y][TPB_X];
+            int old_labels[TPB_Y][TPB_X];
+
+            // Load the connectivity masks, cut the links that cross the tile border, and give
+            // every pixel its own linear index inside the tile as the initial label.
+            #pragma unroll
+            for (int i = 0; i < TPB_Y; ++i)
+                #pragma unroll
+                for (int j = 0; j < TPB_X; ++j)
+                {
+                    int yloc = threadIdx.y + CTA_SIZE_Y * i;
+                    int xloc = threadIdx.x + CTA_SIZE_X * j;
+                    component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
+
+                    if (!xloc) c &= ~LEFT;
+                    if (!yloc) c &= ~UP;
+
+                    if (xloc == TILE_COLS -1) c &= ~RIGHT;
+                    if (yloc == TILE_ROWS -1) c &= ~DOWN;
+
+                    new_labels[i][j] = yloc * TILE_COLS + xloc;
+                    edgesTile[yloc][xloc] = c;
+                }
+
+
+            // Iterate until no thread of the block lowers a label.
+            // The counter of this outer loop is unused and is shadowed by the inner loops.
+            for (int i = 0; ; ++i)
+            {
+                //1. backup
+                // Remember the current labels and publish them to shared memory.
+                #pragma unroll
+                for (int i = 0; i < TPB_Y; ++i)
+                    #pragma unroll
+                    for (int j = 0; j < TPB_X; ++j)
+                    {
+                        int yloc = threadIdx.y + CTA_SIZE_Y * i;
+                        int xloc = threadIdx.x + CTA_SIZE_X * j;
+
+                        old_labels[i][j]       = new_labels[i][j];
+                        labelsTile[yloc][xloc] = new_labels[i][j];
+                    }
+
+                __syncthreads();
+
+                //2. compare local arrays
+                // Take the minimum of the own label and the labels of all connected neighbours.
+                #pragma unroll
+                for (int i = 0; i < TPB_Y; ++i)
+                    #pragma unroll
+                    for (int j = 0; j < TPB_X; ++j)
+                    {
+                        int yloc = threadIdx.y + CTA_SIZE_Y * i;
+                        int xloc = threadIdx.x + CTA_SIZE_X * j;
+
+                        component c = edgesTile[yloc][xloc];
+                        int label = new_labels[i][j];
+
+                        if (c & UP)
+                           label = ::min(label, labelsTile[yloc - 1][xloc]);
+
+                        if (c &  DOWN)
+                           label = ::min(label, labelsTile[yloc + 1][xloc]);
+
+                        if (c & LEFT)
+                           label = ::min(label, labelsTile[yloc][xloc - 1]);
+
+                        if (c & RIGHT)
+                           label = ::min(label, labelsTile[yloc][xloc + 1]);
+
+                       new_labels[i][j] = label;
+                    }
+
+                __syncthreads();
+
+                //3. determine: Is any value changed?
+                // When a label dropped, lower the shared entry addressed by the OLD label
+                // (labelsTile viewed as a flat array) so it points at the smaller label.
+                int changed = 0;
+                #pragma unroll
+                for (int i = 0; i < TPB_Y; ++i)
+                    #pragma unroll
+                    for (int j = 0; j < TPB_X; ++j)
+                    {
+                        if (new_labels[i][j] < old_labels[i][j])
+                        {
+                            changed = 1;
+                            atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
+                        }
+                    }
+
+                // Block-wide OR: stop only when no thread changed anything.
+                changed = __syncthreads_or(changed);
+                if (!changed)
+                    break;
+
+                //4. Compact paths
+                // Follow the chain of labels in shared memory until an entry that does not
+                // point to a smaller label is reached.
+                const int *labels = &labelsTile[0][0];
+                #pragma unroll
+                for (int i = 0; i < TPB_Y; ++i)
+                    #pragma unroll
+                    for (int j = 0; j < TPB_X; ++j)
+                    {
+                        int label = new_labels[i][j];
+
+                        while( labels[label] < label ) label = labels[label];
+
+                        new_labels[i][j] = label;
+                    }
+                __syncthreads();
+            }
+
+            // Convert tile-local labels to linear indices of the whole image
+            // (row * edges.cols + column) and store them.
+            #pragma unroll
+            for (int i = 0; i < TPB_Y; ++i)
+            #pragma unroll
+                for (int j = 0; j < TPB_X; ++j)
+                {
+                    int label = new_labels[i][j];
+                    int yloc = label / TILE_COLS;
+                    int xloc = label - yloc * TILE_COLS;
+
+                    xloc += blockIdx.x * TILE_COLS;
+                    yloc += blockIdx.y * TILE_ROWS;
+
+                    label = yloc * edges.cols + xloc;
+                    // do it for x too.
+                    // (only the row is range-checked before the store; the column is not)
+                    if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
+                }
+        }
+
+        // Follows parent links in comps, starting at `label`, until it reaches an entry that
+        // stores its own linear index, and returns that index.
+        // A label is a linear index: row = label / comps.cols, column = label % comps.cols.
+        __device__ __forceinline__ int root(const DevMem2Di& comps, int label)
+        {
+            for (;;)
+            {
+                const int row = label / comps.cols;
+                const int col = label - row * comps.cols;
+
+                const int parent = comps(row, col);
+
+                // An entry that points at itself is the representative of its component.
+                if (parent == label)
+                    return label;
+
+                label = parent;
+            }
+        }
+
+        // Joins the components containing labels l1 and l2. When their roots differ, the entry
+        // of the larger root is atomically lowered to the smaller root and `changed` is set to
+        // true; when the roots are equal nothing is written and `changed` is left untouched.
+        __device__ __forceinline__ void isConnected(DevMem2Di& comps, int l1, int l2, bool& changed)
+        {
+            const int rootA = root(comps, l1);
+            const int rootB = root(comps, l2);
+
+            if (rootA != rootB)
+            {
+                const int smaller = ::min(rootA, rootB);
+                const int larger  = ::max(rootA, rootB);
+
+                // Locate the entry of the larger root from its linear index.
+                const int row = larger / comps.cols;
+                const int col = larger - row * comps.cols;
+
+                atomicMin(&comps.ptr(row)[col], smaller);
+                changed = true;
+            }
+        }
+
+        // Merges the labels of neighbouring tiles along their shared borders.
+        // Each thread block handles a group of tilesNumY x tilesNumX tiles of
+        // tileSizeY x tileSizeX pixels. The work is a flat list of tasks:
+        //   - horizontal seams: pixels on the first row of a lower tile, joined with the pixel
+        //     above when their mask has the UP flag;
+        //   - vertical seams: pixels on the first column of a right-hand tile, joined with the
+        //     pixel to the left when their mask has the LEFT flag.
+        // Threads stride over the task list, and the whole pass is repeated until no thread of
+        // the block reports a change.
+        //   edges                    - per-pixel connectivity masks.
+        //   comps                    - label image, updated in place.
+        //   yIncomplete, xIncomplete - non-zero when the last block along that axis holds fewer
+        //                              than tilesNumY / tilesNumX tiles (as computed by the caller).
+        __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
+            const DevMem2D edges, DevMem2Di comps, const int yIncomplete, int xIncomplete)
+        {
+            int tid = threadIdx.y * blockDim.x + threadIdx.x;
+            int stride = blockDim.y * blockDim.x;
+
+            int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
+            int yend   = ybegin + tilesNumY * tileSizeY;
+
+            // Last block row: shrink the covered range to the rows that actually exist.
+            if (blockIdx.y == gridDim.y - 1)
+            {
+                yend -= yIncomplete * tileSizeY;
+                yend -= tileSizeY;
+                tileSizeY = (edges.rows % tileSizeY);
+
+                yend += tileSizeY;
+            }
+
+            int xbegin = blockIdx.x * tilesNumX * tileSizeX;
+            int xend   = xbegin + tilesNumX * tileSizeX;
+
+            // Last block column: same adjustment along x. With an incomplete group there is no
+            // right-hand tile, so the vertical-seam range is emptied (yend = ybegin).
+            if (blockIdx.x == gridDim.x - 1)
+            {
+                if (xIncomplete) yend = ybegin;
+                xend -= xIncomplete * tileSizeX;
+                xend -= tileSizeX;
+                tileSizeX = (edges.cols % tileSizeX);
+
+                xend += tileSizeX;
+            }
+
+            // With an incomplete group along y there is no lower tile, so the horizontal-seam
+            // range is emptied.
+            if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
+            {
+                xend = xbegin;
+            }
+
+            int tasksV = (tilesNumX - 1) * (yend - ybegin);
+            int tasksH = (tilesNumY - 1) * (xend - xbegin);
+
+            int total = tasksH + tasksV;
+
+            bool changed;
+            do
+            {
+                changed = false;
+                for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
+                {
+                    if (taskIdx < tasksH)
+                    {
+                        // Horizontal seam task.
+                        int indexH = taskIdx;
+
+                        int row = indexH / (xend - xbegin);
+                        int col = indexH - row * (xend - xbegin);
+
+                        int y = ybegin + (row + 1) * tileSizeY;
+                        int x = xbegin + col;
+
+                        // Images are indexed (row, column), i.e. (y, x).
+                        component e = edges(y, x);
+                        if (e & UP)
+                        {
+                            int lc = comps(y,x);
+                            int lu = comps(y - 1, x);
+
+                            isConnected(comps, lc, lu, changed);
+                        }
+                    }
+                    else
+                    {
+                        // Vertical seam task.
+                        int indexV = taskIdx - tasksH;
+
+                        int col = indexV / (yend - ybegin);
+                        int row = indexV - col * (yend - ybegin);
+
+                        int x = xbegin + (col + 1) * tileSizeX;
+                        int y = ybegin + row;
+
+                        // Images are indexed (row, column), i.e. (y, x).
+                        component e = edges(y, x);
+                        if (e & LEFT)
+                        {
+                            int lc = comps(y, x);
+                            int ll = comps(y, x - 1);
+
+                            isConnected(comps, lc, ll, changed);
+                        }
+                    }
+                }
+            // Block-wide OR: run another pass while any thread merged something.
+            } while (__syncthreads_or(changed));
+        }
+
+        // Replaces every label in comps with the root of its component, one thread per pixel.
+        // The edges argument is not used by this kernel.
+        __global__ void flatten(const DevMem2D edges, DevMem2Di comps)
+        {
+            const int col = threadIdx.x + blockIdx.x * blockDim.x;
+            const int row = threadIdx.y + blockIdx.y * blockDim.y;
+
+            // Threads outside the label image have nothing to do.
+            if (col >= comps.cols || row >= comps.rows)
+                return;
+
+            comps(row, col) = root(comps, comps(row, col));
+        }
+
+        // Host-side driver of the labelling pipeline:
+        //   1. lableTiles labels every TILE_ROWS x TILE_COLS tile independently;
+        //   2. crossMerge is launched repeatedly, each pass joining 2 x 2 groups of tiles and
+        //      doubling the tile size, until a single tile covers the image;
+        //   3. flatten replaces every label with the root of its component.
+        //   edges  - per-pixel connectivity masks.
+        //   comps  - output label image.
+        //   stream - CUDA stream for all launches; the call blocks until completion when it is 0.
+        void labelComponents(const DevMem2D& edges, DevMem2Di comps, cudaStream_t stream)
+        {
+            dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
+            dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
+
+            lableTiles<<<grid, block, 0, stream>>>(edges, comps);
+            cudaSafeCall( cudaGetLastError() );
+
+            int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
+
+            while (grid.x > 1 || grid.y > 1)
+            {
+                // Halve the tile grid, rounding up, using integer arithmetic.
+                dim3 mergeGrid((grid.x + 1) / 2, (grid.y + 1) / 2);
+                dim3 mergeBlock(STA_SIZE_MARGE_X, STA_SIZE_MARGE_Y);
+
+                // An odd tile count leaves the last merge block with a single tile along that axis.
+                const int yIncomplete = grid.y % 2;
+                const int xIncomplete = grid.x % 2;
+
+                crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, yIncomplete, xIncomplete);
+                cudaSafeCall( cudaGetLastError() );
+
+                tileSizeX <<= 1;
+                tileSizeY <<= 1;
+                grid = mergeGrid;
+            }
+
+            // flatten runs one thread per pixel, so the grid is sized by the block, not the tile.
+            grid.x = divUp(edges.cols, block.x);
+            grid.y = divUp(edges.rows, block.y);
+            flatten<<<grid, block, 0, stream>>>(edges, comps);
+            cudaSafeCall( cudaGetLastError() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+} } }
--- a/modules/gpu/src/cuda/debayer.cu
+++ b/modules/gpu/src/cuda/debayer.cu
@@ -0,0 +1,327 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <opencv2/gpu/device/common.hpp>
+#include <opencv2/gpu/device/vec_traits.hpp>
+#include <opencv2/gpu/device/vec_math.hpp>
+#include <opencv2/gpu/device/limits.hpp>
+
+namespace cv { namespace gpu {
+    namespace device
+    {
+        template <typename D>
+        __global__ void Bayer2BGR_8u(const PtrStepb src, DevMem2D_<D> dst, const bool blue_last, const bool start_with_green)
+        {
+            const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
+            int s_y = blockIdx.y * blockDim.y + threadIdx.y;
+            // Each thread handles one uchar4 group (four 8-bit samples) of a source row and writes up to four dst pixels.
+            if (s_y >= dst.rows || (s_x << 2) >= dst.cols)
+                return;
+            // Clamp the row to [1, dst.rows - 2] so that s_y - 1 and s_y + 1 are valid rows (assuming src has dst.rows rows); the first/last output rows are therefore computed from the adjacent interior row.
+            s_y = ::min(::max(s_y, 1), dst.rows - 2);
+            // 3 x 3 neighbourhood of uchar4 groups; left/right group indices are clamped to the first/last group of the row.
+            uchar4 patch[3][3];
+            patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
+            patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
+            patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
+
+            patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
+            patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
+            patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
+
+            patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
+            patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
+            patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
+            // Every component starts at the uchar maximum; only .x, .y and .z are assigned below, so any further component of D keeps that value.
+            D res0 = VecTraits<D>::all(numeric_limits<uchar>::max());
+            D res1 = VecTraits<D>::all(numeric_limits<uchar>::max());
+            D res2 = VecTraits<D>::all(numeric_limits<uchar>::max());
+            D res3 = VecTraits<D>::all(numeric_limits<uchar>::max());
+            // Row parity XOR start_with_green picks one of two interpolation layouts (presumably whether this row starts on a green sample - confirm with the caller).
+            if ((s_y & 1) ^ start_with_green)
+            {
+                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
+                const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1;
+                // (t0, t1) feed res0, (t2, t3) feed res1, (t4, t5) feed res2, (t6, t7) feed res3; all are rounded 2- or 4-sample averages.
+                const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2;
+                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2;
+
+                const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
+                const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;
+
+                const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2;
+                const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;
+                // Row parity XOR blank_last-style flag: the two branches differ only in swapping what goes to .x and .z; .y is the same.
+                if ((s_y & 1) ^ blue_last)
+                {
+                    res0.x = t1;
+                    res0.y = patch[1][1].x;
+                    res0.z = t0;
+
+                    res1.x = patch[1][1].y;
+                    res1.y = t3;
+                    res1.z = t2;
+
+                    res2.x = t5;
+                    res2.y = patch[1][1].z;
+                    res2.z = t4;
+
+                    res3.x = patch[1][1].w;
+                    res3.y = t7;
+                    res3.z = t6;
+                }
+                else
+                {
+                    res0.x = t0;
+                    res0.y = patch[1][1].x;
+                    res0.z = t1;
+
+                    res1.x = t2;
+                    res1.y = t3;
+                    res1.z = patch[1][1].y;
+
+                    res2.x = t4;
+                    res2.y = patch[1][1].z;
+                    res2.z = t5;
+
+                    res3.x = t6;
+                    res3.y = t7;
+                    res3.z = patch[1][1].w;
+                }
+            }
+            else
+            {
+                const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2;
+                const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;
+
+                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
+                const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;
+
+                const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
+                const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;
+
+                const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
+                const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1;
+                // As above: blue_last (combined with row parity) swaps the values written to .x and .z.
+                if ((s_y & 1) ^ blue_last)
+                {
+                    res0.x = patch[1][1].x;
+                    res0.y = t1;
+                    res0.z = t0;
+
+                    res1.x = t3;
+                    res1.y = patch[1][1].y;
+                    res1.z = t2;
+
+                    res2.x = patch[1][1].z;
+                    res2.y = t5;
+                    res2.z = t4;
+
+                    res3.x = t7;
+                    res3.y = patch[1][1].w;
+                    res3.z = t6;
+                }
+                else
+                {
+                    res0.x = t0;
+                    res0.y = t1;
+                    res0.z = patch[1][1].x;
+
+                    res1.x = t2;
+                    res1.y = patch[1][1].y;
+                    res1.z = t3;
+
+                    res2.x = t4;
+                    res2.y = t5;
+                    res2.z = patch[1][1].z;
+
+                    res3.x = t6;
+                    res3.y = patch[1][1].w;
+                    res3.z = t7;
+                }
+            }
+            // Destination coordinates: four consecutive columns per thread, on the unclamped row index.
+            const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
+            const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
+            // The first pixel is known to be inside the image (entry check); the other three are bounds-checked.
+            dst(d_y, d_x) = res0;
+            if (d_x + 1 < dst.cols)
+                dst(d_y, d_x + 1) = res1;
+            if (d_x + 2 < dst.cols)
+                dst(d_y, d_x + 2) = res2;
+            if (d_x + 3 < dst.cols)
+                dst(d_y, d_x + 3) = res3;
+        }
+
+        template <typename D>
+        __global__ void Bayer2BGR_16u(const PtrStepb src, DevMem2D_<D> dst, const bool blue_last, const bool start_with_green)
+        {
+            const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
+            int s_y = blockIdx.y * blockDim.y + threadIdx.y;
+            // Each thread handles one ushort2 group (two 16-bit samples) of a source row and writes up to two dst pixels.
+            if (s_y >= dst.rows || (s_x << 1) >= dst.cols)
+                return;
+            // Clamp the row to [1, dst.rows - 2] so that s_y - 1 and s_y + 1 are valid rows (assuming src has dst.rows rows); the first/last output rows are therefore computed from the adjacent interior row.
+            s_y = ::min(::max(s_y, 1), dst.rows - 2);
+            // 3 x 3 neighbourhood of ushort2 groups; left/right group indices are clamped to the first/last group of the row.
+            ushort2 patch[3][3];
+            patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
+            patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
+            patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
+
+            patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
+            patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
+            patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
+
+            patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
+            patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
+            patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
+            // Every component starts at the ushort maximum; only .x, .y and .z are assigned below, so any further component of D keeps that value.
+            D res0 = VecTraits<D>::all(numeric_limits<ushort>::max());
+            D res1 = VecTraits<D>::all(numeric_limits<ushort>::max());
+            // Row parity XOR start_with_green picks one of two interpolation layouts (presumably whether this row starts on a green sample - confirm with the caller).
+            if ((s_y & 1) ^ start_with_green)
+            {
+                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
+                const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1;
+                // (t0, t1) feed res0 and (t2, t3) feed res1; all are rounded 2- or 4-sample averages.
+                const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2;
+                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2;
+                // The two branches differ only in swapping what goes to .x and .z; .y is the same.
+                if ((s_y & 1) ^ blue_last)
+                {
+                    res0.x = t1;
+                    res0.y = patch[1][1].x;
+                    res0.z = t0;
+
+                    res1.x = patch[1][1].y;
+                    res1.y = t3;
+                    res1.z = t2;
+                }
+                else
+                {
+                    res0.x = t0;
+                    res0.y = patch[1][1].x;
+                    res0.z = t1;
+
+                    res1.x = t2;
+                    res1.y = t3;
+                    res1.z = patch[1][1].y;
+                }
+            }
+            else
+            {
+                const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2;
+                const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;
+
+                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
+                const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1;
+                // As above: blue_last (combined with row parity) swaps the values written to .x and .z.
+                if ((s_y & 1) ^ blue_last)
+                {
+                    res0.x = patch[1][1].x;
+                    res0.y = t1;
+                    res0.z = t0;
+
+                    res1.x = t3;
+                    res1.y = patch[1][1].y;
+                    res1.z = t2;
+                }
+                else
+                {
+                    res0.x = t0;
+                    res0.y = t1;
+                    res0.z = patch[1][1].x;
+
+                    res1.x = t2;
+                    res1.y = patch[1][1].y;
+                    res1.z = t3;
+                }
+            }
+            // Destination coordinates: two consecutive columns per thread, on the unclamped row index.
+            const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
+            const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
+            // The first pixel is known to be inside the image (entry check); the second is bounds-checked.
+            dst(d_y, d_x) = res0;
+            if (d_x + 1 < dst.cols)
+                dst(d_y, d_x + 1) = res1;
+        }
+
+        template <int cn>
+        void Bayer2BGR_8u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream)
+        {
+            // Host launcher for Bayer2BGR_8u; cn selects the destination pixel type (uchar vector with cn components).
+            typedef typename TypeVec<uchar, cn>::vec_type pixel_t;
+            // One thread writes up to four output columns, so a block of 32 x 8 threads spans 4 * 32 columns.
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(dst.cols, 4 * threads.x), divUp(dst.rows, threads.y));
+            cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<pixel_t>, cudaFuncCachePreferL1) );
+            // The byte-typed destination is handed to the kernel as a view of pixel_t elements.
+            Bayer2BGR_8u<pixel_t><<<blocks, threads, 0, stream>>>(src, (DevMem2D_<pixel_t>)dst, blue_last, start_with_green);
+            cudaSafeCall( cudaGetLastError() );
+            // Block the host only when the call was issued on the null stream.
+            if (!stream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+        template <int cn>
+        void Bayer2BGR_16u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream)
+        {
+            // Host launcher for Bayer2BGR_16u; cn selects the destination pixel type (ushort vector with cn components).
+            typedef typename TypeVec<ushort, cn>::vec_type pixel_t;
+            // One thread writes up to two output columns, so a block of 32 x 8 threads spans 2 * 32 columns.
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(dst.cols, 2 * threads.x), divUp(dst.rows, threads.y));
+            cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<pixel_t>, cudaFuncCachePreferL1) );
+            // The byte-typed destination is handed to the kernel as a view of pixel_t elements.
+            Bayer2BGR_16u<pixel_t><<<blocks, threads, 0, stream>>>(src, (DevMem2D_<pixel_t>)dst, blue_last, start_with_green);
+            cudaSafeCall( cudaGetLastError() );
+            // Block the host only when the call was issued on the null stream.
+            if (!stream)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void Bayer2BGR_8u_gpu<3>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template void Bayer2BGR_8u_gpu<4>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template void Bayer2BGR_16u_gpu<3>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+        template void Bayer2BGR_16u_gpu<4>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
+    }
+}}
--- a/modules/gpu/src/graphcuts.cpp
+++ b/modules/gpu/src/graphcuts.cpp
@@ -47,8 +47,76 @@
 void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

+// Build without HAVE_CUDA: these entry points only call throw_nogpu(); parameters stay unnamed because none is used.
+void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_nogpu(); }
+void cv::gpu::labelComponents(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 #else /* !defined (HAVE_CUDA) */

+namespace cv { namespace gpu { namespace device
+{
+    namespace ccl
+    {
+        void labelComponents(const DevMem2D& edges, DevMem2Di comps, cudaStream_t stream);
+        // Declarations only; the definitions are not in this file (presumably in the module's CUDA sources - confirm).
+        template<typename T>
+        void computeEdges(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+    }
+}}}
+
+
+float4 scalarToCudaType(const cv::Scalar& in)
+{
+    // Narrow the four double components of the scalar to float, mapped in order to x, y, z, w.
+    const float4 out = { float(in[0]), float(in[1]), float(in[2]), float(in[3]) };
+    return out;
+}
+
+
+void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
+{
+    CV_Assert(!image.empty());
+    // Fills mask (CV_8UC1, same size as image) by calling the device::ccl::computeEdges<T> instantiation matching the image type.
+    int ch = image.channels();
+    CV_Assert(ch <= 4);
+    // The depth code indexes the rows of the table below; the channel count minus one indexes its columns.
+    int depth = image.depth();
+
+    typedef void (*func_t)(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
+    // Null entries mark depth/channel combinations without an implementation; they are rejected by CV_Assert(f).
+    static const func_t suppotLookup[8][4] =
+    {   // channels: 1, 2, 3, 4
+        { device::ccl::computeEdges<uchar>,  0,  device::ccl::computeEdges<uchar3>,  device::ccl::computeEdges<uchar4>  },// CV_8U
+        { 0,                                 0,  0,                                  0                                  },// CV_8S
+        { device::ccl::computeEdges<ushort>, 0,  device::ccl::computeEdges<ushort3>, device::ccl::computeEdges<ushort4> },// CV_16U
+        { 0,                                 0,  0,                                  0                                  },// CV_16S
+        { device::ccl::computeEdges<int>,    0,  0,                                  0                                  },// CV_32S
+        { device::ccl::computeEdges<float>,  0,  0,                                  0                                  },// CV_32F
+        { 0,                                 0,  0,                                  0                                  },// CV_64F
+        { 0,                                 0,  0,                                  0                                  } // CV_USRTYPE1
+    };
+
+    func_t f = suppotLookup[depth][ch - 1];
+    CV_Assert(f);
+    // (Re)allocate the output only when its size or type does not already match.
+    if (image.size() != mask.size() || mask.type() != CV_8UC1)
+        mask.create(image.size(), CV_8UC1);
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    float4 culo = scalarToCudaType(lo), cuhi = scalarToCudaType(hi);
+    f(image, mask, culo, cuhi, stream);
+}
+
+void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, Stream& s)
+{
+    // The mask must be a non-empty 8-bit single-channel image.
+    CV_Assert(!mask.empty() && mask.type() == CV_8U);
+    // Keep the caller's buffer only if it is already a CV_32SC1 image of the mask's size.
+    const bool reusable = !(mask.size() != components.size()) && components.type() == CV_32SC1;
+    if (!reusable)
+        components.create(mask.size(), CV_32SC1);
+    device::ccl::labelComponents(mask, components, StreamAccessor::getStream(s));
+}
+
 namespace
 {
    typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem);
--- a/modules/gpu/test/test_color.cpp
+++ b/modules/gpu/test/test_color.cpp
@@ -41,6 +41,8 @@

 #include "precomp.hpp"

+#ifdef HAVE_CUDA
+
 namespace {

 ///////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1628,7 +1630,7 @@ TEST_P(CvtColor, BGR2Lab)
    }
    catch (const cv::Exception& e)
    {
-#if (CUDA_VERSION < 5000)
+#if defined (CUDA_VERSION) && (CUDA_VERSION < 5000)
        ASSERT_EQ(CV_StsBadFlag, e.code);
 #else
        FAIL();
@@ -1655,7 +1657,7 @@ TEST_P(CvtColor, RGB2Lab)
    }
    catch (const cv::Exception& e)
    {
-#if (CUDA_VERSION < 5000)
+#if defined (CUDA_VERSION) && (CUDA_VERSION < 5000)
        ASSERT_EQ(CV_StsBadFlag, e.code);
 #else
        FAIL();
@@ -1682,7 +1684,7 @@ TEST_P(CvtColor, BGR2Luv)
    }
    catch (const cv::Exception& e)
    {
-#if (CUDA_VERSION < 5000)
+#if defined (CUDA_VERSION) && (CUDA_VERSION < 5000)
        ASSERT_EQ(CV_StsBadFlag, e.code);
 #else
        FAIL();
@@ -1709,7 +1711,7 @@ TEST_P(CvtColor, RGB2Luv)
    }
    catch (const cv::Exception& e)
    {
-#if (CUDA_VERSION < 5000)
+#if defined (CUDA_VERSION) && (CUDA_VERSION < 5000)
        ASSERT_EQ(CV_StsBadFlag, e.code);
 #else
        FAIL();
@@ -1736,7 +1738,7 @@ TEST_P(CvtColor, RGBA2mRGBA)
    }
    catch (const cv::Exception& e)
    {
-#if (CUDA_VERSION < 5000)
+#if defined (CUDA_VERSION) && (CUDA_VERSION < 5000)
        ASSERT_EQ(CV_StsBadFlag, e.code);
 #else
        FAIL();
@@ -1744,6 +1746,159 @@ TEST_P(CvtColor, RGBA2mRGBA)
    }
 }

+TEST_P(CvtColor, BayerBG2BGR)
+{
+    // Only whole-image (non-ROI) 8-bit and 16-bit unsigned parameter sets are exercised.
+    const bool depthCovered = (depth == CV_8U || depth == CV_16U);
+    if (useRoi || !depthCovered)
+        return;
+    const cv::Mat bayer = randomMat(size, depth);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), dst, cv::COLOR_BayerBG2BGR);
+    // The CPU conversion of the same input is the reference; compared exactly (eps 0), one-pixel border excluded.
+    cv::Mat dst_gold;
+    cv::cvtColor(bayer, dst_gold, cv::COLOR_BayerBG2BGR);
+    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
+}
+
+TEST_P(CvtColor, BayerBG2BGR4)
+{
+    // Only whole-image (non-ROI) 8-bit and 16-bit unsigned parameter sets are exercised.
+    const bool depthCovered = (depth == CV_8U || depth == CV_16U);
+    if (useRoi || !depthCovered)
+        return;
+    const cv::Mat bayer = randomMat(size, depth);
+    cv::gpu::GpuMat dst;
+    // Request a 4-channel destination from the GPU conversion.
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), dst, cv::COLOR_BayerBG2BGR, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    cv::Mat dst_gold;
+    cv::cvtColor(bayer, dst_gold, cv::COLOR_BayerBG2BGR);
+    // Bring the GPU result into a host cv::Mat and drop its 4th channel to match the 3-channel reference.
+    cv::Mat dst4(dst);
+    cv::Mat dst3;
+    cv::cvtColor(dst4, dst3, cv::COLOR_BGRA2BGR);
+
+    // Exact comparison (eps 0) with the one-pixel border excluded.
+    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
+}
+
+TEST_P(CvtColor, BayerGB2BGR)
+{
+    // Only whole-image (non-ROI) 8-bit and 16-bit unsigned parameter sets are exercised.
+    const bool depthCovered = (depth == CV_8U || depth == CV_16U);
+    if (useRoi || !depthCovered)
+        return;
+    const cv::Mat bayer = randomMat(size, depth);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), dst, cv::COLOR_BayerGB2BGR);
+    // The CPU conversion of the same input is the reference; compared exactly (eps 0), one-pixel border excluded.
+    cv::Mat dst_gold;
+    cv::cvtColor(bayer, dst_gold, cv::COLOR_BayerGB2BGR);
+    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
+}
+
+TEST_P(CvtColor, BayerGB2BGR4)
+{
+    // Only whole-image (non-ROI) 8-bit and 16-bit unsigned parameter sets are exercised.
+    const bool depthCovered = (depth == CV_8U || depth == CV_16U);
+    if (useRoi || !depthCovered)
+        return;
+    const cv::Mat bayer = randomMat(size, depth);
+    cv::gpu::GpuMat dst;
+    // Request a 4-channel destination from the GPU conversion.
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), dst, cv::COLOR_BayerGB2BGR, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    cv::Mat dst_gold;
+    cv::cvtColor(bayer, dst_gold, cv::COLOR_BayerGB2BGR);
+    // Bring the GPU result into a host cv::Mat and drop its 4th channel to match the 3-channel reference.
+    cv::Mat dst4(dst);
+    cv::Mat dst3;
+    cv::cvtColor(dst4, dst3, cv::COLOR_BGRA2BGR);
+    // Exact comparison (eps 0) with the one-pixel border excluded.
+    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
+}
+
+TEST_P(CvtColor, BayerRG2BGR)
+{
+    // Only whole-image (non-ROI) 8-bit and 16-bit unsigned parameter sets are exercised.
+    const bool depthCovered = (depth == CV_8U || depth == CV_16U);
+    if (useRoi || !depthCovered)
+        return;
+    const cv::Mat bayer = randomMat(size, depth);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), dst, cv::COLOR_BayerRG2BGR);
+    // The CPU conversion of the same input is the reference; compared exactly (eps 0), one-pixel border excluded.
+    cv::Mat dst_gold;
+    cv::cvtColor(bayer, dst_gold, cv::COLOR_BayerRG2BGR);
+    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
+}
+
+TEST_P(CvtColor, BayerRG2BGR4)
+{
+    // Only whole-image (non-ROI) 8-bit and 16-bit unsigned parameter sets are exercised.
+    const bool depthCovered = (depth == CV_8U || depth == CV_16U);
+    if (useRoi || !depthCovered)
+        return;
+    const cv::Mat bayer = randomMat(size, depth);
+    cv::gpu::GpuMat dst;
+    // Request a 4-channel destination from the GPU conversion.
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), dst, cv::COLOR_BayerRG2BGR, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    cv::Mat dst_gold;
+    cv::cvtColor(bayer, dst_gold, cv::COLOR_BayerRG2BGR);
+    // Bring the GPU result into a host cv::Mat and drop its 4th channel to match the 3-channel reference.
+    cv::Mat dst4(dst);
+    cv::Mat dst3;
+    cv::cvtColor(dst4, dst3, cv::COLOR_BGRA2BGR);
+    // Exact comparison (eps 0) with the one-pixel border excluded.
+    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
+}
+
+TEST_P(CvtColor, BayerGR2BGR)
+{
+    // Only whole-image (non-ROI) 8-bit and 16-bit unsigned parameter sets are exercised.
+    const bool depthCovered = (depth == CV_8U || depth == CV_16U);
+    if (useRoi || !depthCovered)
+        return;
+    const cv::Mat bayer = randomMat(size, depth);
+
+    cv::gpu::GpuMat dst;
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), dst, cv::COLOR_BayerGR2BGR);
+    // The CPU conversion of the same input is the reference; compared exactly (eps 0), one-pixel border excluded.
+    cv::Mat dst_gold;
+    cv::cvtColor(bayer, dst_gold, cv::COLOR_BayerGR2BGR);
+    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
+}
+
+TEST_P(CvtColor, BayerGR2BGR4)
+{
+    // Only whole-image (non-ROI) 8-bit and 16-bit unsigned parameter sets are exercised.
+    const bool depthCovered = (depth == CV_8U || depth == CV_16U);
+    if (useRoi || !depthCovered)
+        return;
+    const cv::Mat bayer = randomMat(size, depth);
+    cv::gpu::GpuMat dst;
+    // Request a 4-channel destination from the GPU conversion.
+    cv::gpu::cvtColor(loadMat(bayer, useRoi), dst, cv::COLOR_BayerGR2BGR, 4);
+    ASSERT_EQ(4, dst.channels());
+
+    cv::Mat dst_gold;
+    cv::cvtColor(bayer, dst_gold, cv::COLOR_BayerGR2BGR);
+    // Bring the GPU result into a host cv::Mat and drop its 4th channel to match the 3-channel reference.
+    cv::Mat dst4(dst);
+    cv::Mat dst3;
+    cv::cvtColor(dst4, dst3, cv::COLOR_BGRA2BGR);
+    // Exact comparison (eps 0) with the one-pixel border excluded.
+    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
+}
+
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@@ -1791,3 +1946,5 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, SwapChannels, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_labeling.cpp
+++ b/modules/gpu/test/test_labeling.cpp
@@ -0,0 +1,205 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+//                          License Agreement
+//               For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+//  * The name of the copyright holders may not be used to endorse or promote products
+//    derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//M*/
+
+#include "precomp.hpp"
+#include <string>
+#include <iostream>
+
+#ifdef HAVE_CUDA
+
+namespace {
+
+    // Host reference labeling: flood-fills 4-connected pixels whose grey-level difference passes InInterval.
+    struct GreedyLabeling
+    {
+        // Pixel coordinate kept on the flood-fill stack.
+        struct dot
+        {
+            int x;
+            int y;
+
+            static dot make(int i, int j)
+            {
+                dot d; d.x = i; d.y = j;
+                return d;
+            }
+        };
+        // Predicate: true when a - b lies in [-_lo, _hi].
+        struct InInterval
+        {
+            InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {}
+            const int lo, hi;
+
+            bool operator() (const unsigned char a, const unsigned char b) const
+            {
+                int d = a - b;
+                return lo <= d && d <= hi;
+            }
+        };
+        // _labels has the image's shape and starts at -1 (unlabeled); cv::Mat's constructor takes (rows, cols, type, value).
+        GreedyLabeling(cv::Mat img)
+        : image(img), _labels(image.rows, image.cols, CV_32SC1, cv::Scalar::all(-1)) {}
+        // Writes a component index (0, 1, ...) into every cell of labels that still holds -1; image is read as 8-bit samples.
+        void operator() (cv::Mat labels) const
+        {
+            InInterval inInt(0, 2);
+            dot* stack = new dot[image.cols * image.rows]; // a pixel is labeled before it is pushed, so it is pushed at most once
+            int cc = -1;
+            int* dist_labels = (int*)labels.data;
+            const int pitch = static_cast<int>(labels.step1());    // label row stride, in int elements
+            const unsigned char* source = image.data;
+            const int srcPitch = static_cast<int>(image.step1());  // image row stride, in image elements
+            const int width = image.cols;
+            const int height = image.rows;
+
+            for (int j = 0; j < height; ++j)
+                for (int i = 0; i < width; ++i)
+                {
+                    if (dist_labels[j * pitch + i] != -1) continue;
+
+                    dot* top = stack;
+                    dot p = dot::make(i, j);
+                    cc++;
+                    dist_labels[j * pitch + i] = cc;
+
+                    for (;;)
+                    {
+                        int* dl = &dist_labels[p.y * pitch + p.x];
+                        const unsigned char* sp = &source[p.y * srcPitch + p.x];
+
+                        //right
+                        if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
+                        {
+                            dl[+1] = cc;
+                            *top++ = dot::make(p.x + 1, p.y);
+                        }
+
+                        //left
+                        if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
+                        {
+                            dl[-1] = cc;
+                            *top++ = dot::make(p.x - 1, p.y);
+                        }
+
+                        //bottom (labels are stepped by the label stride, the image by its own stride)
+                        if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+srcPitch]))
+                        {
+                            dl[+pitch] = cc;
+                            *top++ = dot::make(p.x, p.y + 1);
+                        }
+
+                        //top
+                        if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-srcPitch]))
+                        {
+                            dl[-pitch] = cc;
+                            *top++ = dot::make(p.x, p.y - 1);
+                        }
+                        // Pop only while the stack is non-empty, so nothing below stack[0] is ever read.
+                        if (top == stack) break;
+                        p = *--top;
+                    }
+                }
+            delete[] stack;
+        }
+
+        cv::Mat image;
+        cv::Mat _labels;
+    };
+}
+
+struct Labeling : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    cv::gpu::DeviceInfo devInfo; // test parameter of the current instance, assigned in SetUp()
+    // Stores the test parameter and passes its device id to cv::gpu::setDevice.
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+    // Reads "labeling/label.png" below the test data path with cv::imread's default flags.
+    cv::Mat loat_image()
+    {
+        return cv::imread(std::string( cvtest::TS::ptr()->get_data_path() ) + "labeling/label.png");
+    }
+};
+
+TEST_P(Labeling, ConnectedComponents)
+{
+    cv::Mat image;
+    cvtColor(loat_image(), image, CV_BGR2GRAY);
+    // The host reference reads the image as unsigned char samples, so an 8-bit single-channel image is required.
+    ASSERT_TRUE(image.type() == CV_8UC1);
+    // Host reference labeling, written into host._labels.
+    GreedyLabeling host(image);
+    host(host._labels);
+
+    cv::gpu::GpuMat mask;
+    mask.create(image.rows, image.cols, CV_8UC1);
+
+    cv::gpu::GpuMat components;
+    components.create(image.rows, image.cols, CV_32SC1);
+    // Bounds 0 and 2 in every channel, the same numbers the host reference passes to InInterval (presumably the same tolerance - confirm).
+    cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
+
+    ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
+    // NOTE(review): components is never compared with host._labels; only the absence of exceptions is checked.
+    // for (int j = 0; j + 32 < components.rows; j += 32)
+    //     for (int i = 0; i + 32 < components.cols; i += 32)
+    //     {
+    //         std::cout << "Tile: " << i << " " << j << std::endl;
+    //         std::cout << cv::Mat(host._labels, cv::Rect(i,j,32,32)) << std::endl;
+    //         std::cout << cv::Mat(cv::Mat(components), cv::Rect(i,j,32,32)) << std::endl;
+    //     }
+
+    // for debug
+    // cv::imshow("test", image);
+    // cv::waitKey(0);
+    // cv::imshow("test", host._labels * 50);
+    // cv::waitKey(0);
+    // // cv::imshow("test", cv::Mat(mask) * 10);
+    // // cv::waitKey(0);
+    // cv::imshow("test", cv::Mat(components) * 2);
+    // cv::waitKey(0);
+}
+
+INSTANTIATE_TEST_CASE_P(ConnectedComponents, Labeling, ALL_DEVICES);
+
+#endif
--- a/modules/highgui/doc/reading_and_writing_images_and_video.rst
+++ b/modules/highgui/doc/reading_and_writing_images_and_video.rst
@@ -294,7 +294,7 @@ The methods/functions grab the next frame from video file or camera and return t

 The primary use of the function is in multi-camera environments, especially when the cameras do not have hardware synchronization. That is, you call ``VideoCapture::grab()`` for each camera and after that call the slower method ``VideoCapture::retrieve()`` to decode and get frame from each camera. This way the overhead on demosaicing or motion jpeg decompression etc. is eliminated and the retrieved frames from different cameras will be closer in time.

-Also, when a connected camera is multi-head (for example, a stereo camera or a Kinect device), the correct way of retrieving data from it is to call `VideoCapture::grab` first and then call :ocv:func:`VideoCapture::retrieve` one or more times with different values of the ``channel`` parameter. See http://code.opencv.org/svn/opencv/trunk/opencv/samples/cpp/kinect_maps.cpp
+Also, when a connected camera is multi-head (for example, a stereo camera or a Kinect device), the correct way of retrieving data from it is to call :ocv:func:`VideoCapture::grab` first and then call :ocv:func:`VideoCapture::retrieve` one or more times with different values of the ``channel`` parameter. See http://code.opencv.org/projects/opencv/repository/revisions/master/entry/samples/cpp/kinect_maps.cpp


 VideoCapture::retrieve
--- a/modules/highgui/doc/user_interface.rst
+++ b/modules/highgui/doc/user_interface.rst
@@ -203,7 +203,7 @@ Sets mouse handler for the specified window

    :param winname: Window name

-    :param onMouse: Mouse callback. See OpenCV samples, such as  http://code.opencv.org/svn/opencv/trunk/opencv/samples/cpp/ffilldemo.cpp, on how to specify and use the callback.
+    :param onMouse: Mouse callback. See OpenCV samples, such as  http://code.opencv.org/projects/opencv/repository/revisions/master/entry/samples/cpp/ffilldemo.cpp, on how to specify and use the callback.

    :param userdata: The optional parameter passed to the callback.

--- a/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst
+++ b/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst
@@ -202,7 +202,7 @@ Approximates a polygonal curve(s) with the specified precision.
 The functions ``approxPolyDP`` approximate a curve or a polygon with another curve/polygon with less vertices so that the distance between them is less or equal to the specified precision. It uses the Douglas-Peucker algorithm
 http://en.wikipedia.org/wiki/Ramer-Douglas-Peucker_algorithm

-See http://code.opencv.org/svn/opencv/trunk/opencv/samples/cpp/contours.cpp for the function usage model.
+See http://code.opencv.org/projects/opencv/repository/revisions/master/entry/samples/cpp/contours.cpp for the function usage model.


 ApproxChains
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -1298,17 +1298,17 @@ public:
        maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
    {
    }
-    
+
    virtual void operator() (const Range& range) const
    {
        int i, j, cn = dest->channels(), k;
        Size size = dest->size();
-        
+
        for( i = range.start; i < range.end; i++ )
        {
            const uchar* sptr = temp->ptr(i+radius) + radius*cn;
            uchar* dptr = dest->ptr(i);
-            
+
            if( cn == 1 )
            {
                for( j = 0; j < size.width; j++ )
@@ -1351,10 +1351,10 @@ public:
            }
        }
    }
-    
+
 private:
-    const Mat *temp;
    Mat *dest;
+    const Mat *temp;
    int radius, maxk, *space_ofs;
    float *space_weight, *color_weight;
 };
@@ -1367,40 +1367,40 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
    int cn = src.channels();
    int i, j, maxk, radius;
    Size size = src.size();
-    
+
    CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) &&
              src.type() == dst.type() && src.size() == dst.size() &&
              src.data != dst.data );
-    
+
    if( sigma_color <= 0 )
        sigma_color = 1;
    if( sigma_space <= 0 )
        sigma_space = 1;
-    
+
    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
-    
+
    if( d <= 0 )
        radius = cvRound(sigma_space*1.5);
    else
        radius = d/2;
    radius = MAX(radius, 1);
    d = radius*2 + 1;
-    
+
    Mat temp;
    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-    
+
    vector<float> _color_weight(cn*256);
    vector<float> _space_weight(d*d);
    vector<int> _space_ofs(d*d);
    float* color_weight = &_color_weight[0];
    float* space_weight = &_space_weight[0];
    int* space_ofs = &_space_ofs[0];
-    
+
    // initialize color-related bilateral filter coefficients
    for( i = 0; i < 256*cn; i++ )
        color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
-    
+
    // initialize space-related bilateral filter coefficients
    for( i = -radius, maxk = 0; i <= radius; i++ )
        for( j = -radius; j <= radius; j++ )
@@ -1411,7 +1411,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
            space_ofs[maxk++] = (int)(i*temp.step + j*cn);
        }
-    
+
    BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
    parallel_for_(Range(0, size.height), body);
 }
--- a/modules/objdetect/doc/cascade_classification.rst
+++ b/modules/objdetect/doc/cascade_classification.rst
@@ -21,7 +21,7 @@ The word "cascade" in the classifier name means that the resultant classifier co
 The feature used in a particular classifier is specified by its shape (1a, 2b etc.), position within the region of interest and the scale (this scale is not the same as the scale used at the detection stage, though these two scales are multiplied). For example, in the case of the third line feature (2c) the response is calculated as the difference between the sum of image pixels under the rectangle covering the whole feature (including the two white stripes and the black stripe in the middle) and the sum of the image pixels under the black stripe multiplied by 3 in order to compensate for the differences in the size of areas. The sums of pixel values over a rectangular regions are calculated rapidly using integral images (see below and the :ocv:func:`integral` description).

 To see the object detector at work, have a look at the facedetect demo:
-http://code.opencv.org/svn/opencv/trunk/opencv/samples/cpp/facedetect.cpp
+http://code.opencv.org/projects/opencv/repository/revisions/master/entry/samples/cpp/facedetect.cpp

 The following reference is for the detection part only. There is a separate application called  ``opencv_traincascade`` that can train a cascade of boosted classifiers from a set of samples.

--- a/modules/objdetect/src/cascadedetect.hpp
+++ b/modules/objdetect/src/cascadedetect.hpp
@@ -444,7 +444,7 @@ inline int predictCategoricalStump( CascadeClassifier& cascade, Ptr<FeatureEvalu
    CascadeClassifier::Data::Stage* cascadeStages = &cascade.data.stages[0];

 #ifdef HAVE_TEGRA_OPTIMIZATION
-    float tmp; // float accumulator -- float operations are quicker
+    float tmp = 0; // float accumulator -- float operations are quicker
 #endif
    for( int si = 0; si < nstages; si++ )
    {
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -29,6 +29,14 @@ if (HAVE_OPENCL)
  if(OPENCL_INCLUDE_DIR)
      ocv_include_directories(${OPENCL_INCLUDE_DIR})
  endif()
+  if (HAVE_CLAMDFFT)
+	set(ocl_link_libs ${ocl_link_libs} ${CLAMDFFT_LIBRARIES})
+	ocv_include_directories(${CLAMDFFT_INCLUDE_DIR})
+  endif()
+  if (HAVE_CLAMDBLAS)
+	set(ocl_link_libs ${ocl_link_libs} ${CLAMDBLAS_LIBRARIES})
+	ocv_include_directories(${CLAMDBLAS_INCLUDE_DIR})
+  endif()
 endif()

 ocv_set_module_sources(
--- a/modules/ocl/doc/ocl.rst
+++ b/modules/ocl/doc/ocl.rst
@@ -1,19 +1,20 @@
-************************************
+***************************************
 ocl. OpenCL-accelerated Computer Vision
-************************************
+***************************************

 .. toctree::
    :maxdepth: 1

    introduction
-    initalization_and_information
-    data_structures
-    operations_on_matrices
-    per_element_operations
-    image_processing
-    matrix_reductions
-    object_detection
-    feature_detection_and_description
-    image_filtering
-    camera_calibration_and_3d_reconstruction
-    video
+    structures_and_functions
+..    initalization_and_information
+..    data_structures
+..    operations_on_matrices
+..    per_element_operations
+..    image_processing
+..    matrix_reductions
+..    object_detection
+..    feature_detection_and_description
+..    image_filtering
+..    camera_calibration_and_3d_reconstruction
+..    video
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -858,7 +858,72 @@ namespace cv
        void benchmark_copy_vectorize(const oclMat &src, oclMat &dst);
        void benchmark_copy_offset_stride(const oclMat &src, oclMat &dst);
        void benchmark_ILP();
-        
+
+		//! computes vertical sum, supports only CV_32FC1 images
+		CV_EXPORTS void columnSum(const oclMat& src, oclMat& sum);
+
+		//! performs linear blending of two images
+		//! to avoid accuracy errors, the sum of weights shouldn't be very close to zero
+		// supports only CV_8UC1 source type
+		CV_EXPORTS void blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, oclMat& result);
+
+		/////////////////////////////// Pyramid /////////////////////////////////////
+		CV_EXPORTS void pyrDown(const oclMat& src, oclMat& dst);
+
+		//! upsamples the source image and then smoothes it
+		CV_EXPORTS void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst);
+
+		///////////////////////////////////////// match_template /////////////////////////////////////////////////////////////
+		struct CV_EXPORTS MatchTemplateBuf // buffer object taken by the matchTemplate overload that has a MatchTemplateBuf& parameter
+		{
+			Size user_block_size;
+			oclMat imagef, templf;
+			std::vector<oclMat> images;
+			std::vector<oclMat> image_sums;
+			std::vector<oclMat> image_sqsums;
+		};
+
+
+		//! computes the proximity map for the raster template and the image where the template is searched for
+		// Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
+		// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
+		CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method);
+		
+		//! computes the proximity map for the raster template and the image where the template is searched for
+		// Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
+		// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
+		CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf);
+
+#ifdef HAVE_CLAMDFFT
+            ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
+            // these two functions must be called before/after running any fft library functions.
+            CV_EXPORTS void fft_setup();    // this will be implicitly invoked
+            CV_EXPORTS void fft_teardown(); // you need to teardown fft library manually
+
+		    /////////////////////////////////////// DFT /////////////////////////////////////////////////////
+		    //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
+		    //! Param dft_size is the size of DFT transform.
+		    //!
+		    //! For complex-to-real transform it is assumed that the source matrix is packed in CLFFT's format.
+		    // supported src types: CV_32FC1, CV_32FC2
+		    // supported flags: DFT_INVERSE, DFT_REAL_OUTPUT, DFT_COMPLEX_OUTPUT, DFT_ROWS
+		    // dft_size is the size of the original input, which is used for the transformation from complex to real.
+		    // the area of dft_size must be a product of powers of 2, 3 and 5
+		    // real to complex dft requires at least v1.8 of clAmdFft
+		    // real to complex dft output is not the same as that of the cpu version
+		    // real to complex and complex to real do not support DFT_ROWS
+		    CV_EXPORTS void dft(const oclMat& src, oclMat& dst, Size dft_size = Size(0, 0), int flags = 0);
+#endif // HAVE_CLAMDFFT
+
+#ifdef HAVE_CLAMDBLAS
+		//! implements generalized matrix product algorithm GEMM from BLAS
+		// The functionality requires clAmdBlas library
+		// only support type CV_32FC1
+		// flag GEMM_3_T is not supported
+		CV_EXPORTS void gemm(const oclMat& src1, const oclMat& src2, double alpha,
+		const oclMat& src3, double beta, oclMat& dst, int flags = 0);
+#endif
+
    }
 }
 #include "opencv2/ocl/matrix_operations.hpp"
--- a/modules/ocl/src/blend.cpp
+++ b/modules/ocl/src/blend.cpp
@@ -0,0 +1,98 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#if !defined (HAVE_OPENCL)
+// Build without OpenCL: the entry point only forwards to throw_nogpu().
+void cv::ocl::blendLinear(const oclMat& /*img1*/, const oclMat& /*img2*/,
+                          const oclMat& /*weights1*/, const oclMat& /*weights2*/,
+                          oclMat& /*result*/)
+{
+    throw_nogpu();
+}
+#else
+// OpenCL kernel string used by blendLinear(); only declared here.
+namespace cv { namespace ocl {
+    extern const char *blend_linear;
+}}
+
+// Blends img1 and img2 with the per-pixel weights weights1 / weights2 by launching
+// the "BlendLinear" OpenCL kernel.
+//
+// img1, img2         - source images; must share size and type
+// weights1, weights2 - weight maps; must have the same size as the images
+// result             - output, allocated here with img1's size and type
+//
+// All inputs are expected to live in the same OpenCL context (checked with assert).
+void cv::ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2,
+                            oclMat& result)
+{
+	cv::ocl::Context *ctx = img1.clCxt;
+	assert(ctx == img2.clCxt && ctx == weights1.clCxt && ctx == weights2.clCxt);
+
+	// Only img1's and weights1's geometry is handed to the kernel, so the other
+	// operands have to agree with it or the kernel would index them out of range.
+	CV_Assert(img1.size() == img2.size() && img1.type() == img2.type());
+	CV_Assert(weights1.size() == img1.size() && weights2.size() == img1.size());
+
+	int channels = img1.channels();
+	int depth = img1.depth();
+	int rows = img1.rows;
+	int cols = img1.cols;
+
+	// The destination was never allocated before; make sure the kernel has a
+	// buffer of the right size and type to write to.
+	result.create(rows, cols, img1.type());
+
+	int istep = img1.step;
+	int wstep = weights1.step;
+	// Explicit casts: int -> size_t inside a brace initializer is a narrowing conversion.
+	size_t globalSize[] = {static_cast<size_t>(cols * channels), static_cast<size_t>(rows), 1};
+	size_t localSize[] = {16, 16, 1};
+
+	vector< pair<size_t, const void *> > args;
+
+	// Nothing to launch for an empty image.
+	if(globalSize[0]!=0)
+	{
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data ));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&img1.data ));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&img2.data ));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&weights1.data ));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&weights2.data ));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&rows ));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&istep ));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&wstep ));
+		std::string kernelName = "BlendLinear";
+
+		openCLExecuteKernel(ctx, &blend_linear, kernelName, globalSize, localSize, args, channels, depth);
+	}
+}
+#endif
--- a/modules/ocl/src/columnsum.cpp
+++ b/modules/ocl/src/columnsum.cpp
@@ -0,0 +1,91 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Chunpeng Zhang, chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <iomanip>
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+
+#if !defined(HAVE_OPENCL)
+
+// Build without OpenCL: the entry point only forwards to throw_nogpu().
+void cv::ocl::columnSum(const oclMat& /*src*/, oclMat& /*dst*/)
+{
+    throw_nogpu();
+}
+
+#else /*!HAVE_OPENCL */
+
+// OpenCL kernel string used by columnSum(); only declared here.
+namespace cv { namespace ocl {
+    extern const char* imgproc_columnsum;
+}}
+
+// Computes the vertical sum of src by launching the "columnSum" OpenCL kernel.
+//
+// src - input, must be CV_32FC1
+// dst - output, allocated here as CV_32FC1 with the size of src
+void cv::ocl::columnSum(const oclMat& src,oclMat& dst)
+{
+	CV_Assert(src.type() == CV_32FC1);
+
+	// Allocate the output instead of demanding a pre-sized matrix from the caller.
+	// NOTE(review): create() is expected to leave an already matching dst untouched,
+	// which keeps existing callers working -- confirm against oclMat::create.
+	dst.create(src.rows, src.cols, CV_32FC1);
+
+	Context *clCxt = src.clCxt;
+
+	const std::string kernelName = "columnSum";
+
+	// The scalar arguments are registered with sizeof(cl_int), so hand the kernel
+	// real int objects rather than the address of the matrix's step field
+	// (same approach as blendLinear).
+	int cols = src.cols;
+	int rows = src.rows;
+	int src_step = src.step;
+	int dst_step = dst.step;
+
+	std::vector< pair<size_t, const void *> > args;
+
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step));
+
+	// Explicit casts: int -> size_t inside a brace initializer is a narrowing conversion.
+	size_t globalThreads[3] = {static_cast<size_t>(dst.cols), static_cast<size_t>(dst.rows), 1};
+	size_t localThreads[3]  = {16, 16, 1};
+
+	openCLExecuteKernel(clCxt, &imgproc_columnsum, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
+
+}
+#endif 
--- a/modules/ocl/src/fft.cpp
+++ b/modules/ocl/src/fft.cpp
@@ -0,0 +1,302 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <iomanip>
+#include "precomp.hpp"
+
+#ifdef HAVE_CLAMDFFT
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#if !defined (HAVE_OPENCL)
+// Stub for builds without OpenCL. The parameter list has to match the dft() declaration
+// (src, dst, dft_size, flags); the previous three-parameter form defined an undeclared overload.
+void cv::ocl::dft(const oclMat& src, oclMat& dst, Size dft_size, int flags) { throw_nogpu(); }
+#else
+
+#include <clAmdFft.h>
+
+namespace cv{ namespace ocl {
+    enum FftType // value = (complex input ? 1 : 0) | (complex output ? 2 : 0), as computed in dft()
+    {
+        C2R = 1, // opencl HERMITIAN_INTERLEAVED to real
+        R2C = 2, // real to opencl HERMITIAN_INTERLEAVED
+        C2C = 3  // complex to complex
+    };
+    struct FftPlan // one baked clAmdFft plan plus the parameters it was created for
+    {
+        friend void fft_setup();
+        friend void fft_teardown();
+        ~FftPlan(); // removes the plan from planStore and destroys plHandle
+    protected:
+        FftPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type); // only reachable through getPlan()
+        const Size dft_size; // the five const members are the lookup key compared in getPlan()
+        const int src_step, dst_step;
+        const int flags;
+        const FftType type;
+        clAmdFftPlanHandle plHandle; // handle created and baked by the constructor
+        static vector<FftPlan*> planStore; // every plan created so far; entries are deleted in fft_teardown()
+        static bool started; // set by fft_setup(), cleared by fft_teardown()
+        static clAmdFftSetupData * setupData; // allocated in fft_setup(), released in fft_teardown()
+    public:
+        // return a baked plan:
+        // if there is one matched plan, return it
+        // if not, bake a new one, put it into the planStore and return it.
+        static clAmdFftPlanHandle getPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type);
+    };
+}}
+// Definitions of FftPlan's static state: not started, empty plan cache, no setup data.
+bool cv::ocl::FftPlan::started = false;
+vector<cv::ocl::FftPlan*> cv::ocl::FftPlan::planStore;
+clAmdFftSetupData * cv::ocl::FftPlan::setupData = NULL;
+
+// Allocates and initializes the clAmdFft setup data on the first call;
+// later calls do nothing while the started flag is set.
+void cv::ocl::fft_setup()
+{
+    if(!FftPlan::started)
+    {
+        FftPlan::setupData = new clAmdFftSetupData;
+        openCLSafeCall(clAmdFftInitSetupData( FftPlan::setupData ));
+        FftPlan::started = true;
+    }
+}
+// Releases everything fft_setup() and FftPlan::getPlan() created: the setup data,
+// every cached plan, and finally the clAmdFft library state. Does nothing when the
+// library was never started.
+void cv::ocl::fft_teardown()
+{
+    if(!FftPlan::started)
+    {
+        return;
+    }
+    delete FftPlan::setupData;
+    FftPlan::setupData = 0; // do not keep a dangling pointer around
+
+    // ~FftPlan() erases the plan from planStore. Deleting while indexing planStore
+    // therefore shifted the remaining entries and skipped every second plan, which
+    // was then dropped by clear() without being destroyed. Detach the list first so
+    // each plan is deleted exactly once.
+    vector<FftPlan*> plans;
+    plans.swap(FftPlan::planStore);
+    for(size_t i = 0; i < plans.size(); i ++)
+    {
+        delete plans[i];
+    }
+    FftPlan::planStore.clear();
+    openCLSafeCall( clAmdFftTeardown( ) );
+    FftPlan::started = false;
+}
+
+// bake a new plan
+//
+// Creates a clAmdFft plan for the given transform and bakes it for the current
+// context's command queue.
+//   _dft_size - lengths of the transform
+//   _src_step - source row pitch; divided by the element size to get the stride in elements
+//   _dst_step - destination row pitch; converted the same way
+//   _flags    - DFT_* flags; only DFT_ROWS affects the plan layout here
+//   _type     - C2C, R2C or C2R; any other value throws std::exception
+cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type)
+    : dft_size(_dft_size), src_step(_src_step), dst_step(_dst_step), flags(_flags), type(_type), plHandle(0)
+{
+    if(!FftPlan::started)
+    {
+        // implicitly do fft setup
+        fft_setup();
+    }
+
+    bool is_1d_input = (_dft_size.height == 1);
+    int is_row_dft   = flags & DFT_ROWS;
+
+    clAmdFftLayout inLayout;
+    clAmdFftLayout outLayout;
+    clAmdFftDim    dim = is_1d_input||is_row_dft ? CLFFT_1D : CLFFT_2D;
+
+    // With DFT_ROWS every row is an independent 1D transform, i.e. one batch entry per row.
+    size_t batchSize         = is_row_dft?dft_size.height : 1;
+    size_t clLengthsIn[ 3 ]  = {1, 1, 1};
+    size_t clStridesIn[ 3 ]  = {1, 1, 1};
+    size_t clStridesOut[ 3 ] = {1, 1, 1};
+    clLengthsIn[0]           = dft_size.width;
+    clLengthsIn[1]           = is_row_dft ? 1 : dft_size.height;
+    clStridesIn[0]           = 1;
+    clStridesOut[0]          = 1;
+
+    switch(_type)
+    {
+    case C2C:
+        inLayout        = CLFFT_COMPLEX_INTERLEAVED;
+        outLayout       = CLFFT_COMPLEX_INTERLEAVED;
+        clStridesIn[1]  = src_step / sizeof(std::complex<float>);
+        // Use the destination's own pitch: dst is allocated separately and its
+        // row pitch need not equal the source's.
+        clStridesOut[1] = dst_step / sizeof(std::complex<float>);
+        break;
+    case R2C:
+        CV_Assert(!is_row_dft); // this is not supported yet
+        inLayout        = CLFFT_REAL;
+        outLayout       = CLFFT_HERMITIAN_INTERLEAVED;
+        clStridesIn[1]  = src_step / sizeof(float);
+        clStridesOut[1] = dst_step / sizeof(std::complex<float>);
+        break;
+    case C2R:
+        CV_Assert(!is_row_dft); // this is not supported yet
+        inLayout        = CLFFT_HERMITIAN_INTERLEAVED;
+        outLayout       = CLFFT_REAL;
+        clStridesIn[1]  = src_step / sizeof(std::complex<float>);
+        clStridesOut[1] = dst_step / sizeof(float);
+        break;
+    default:
+        //std::runtime_error("does not support this convertion!");
+        cout << "Does not support this convertion!" << endl;
+        throw exception();
+        break;
+    }
+
+    clStridesIn[2]  = is_row_dft ? clStridesIn[1]  : dft_size.width * clStridesIn[1];
+    clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];
+
+    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, Context::getContext()->impl->clContext, dim, clLengthsIn ) );
+
+    openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
+    openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
+    openCLSafeCall( clAmdFftSetPlanBatchSize( plHandle, batchSize ) );
+
+    openCLSafeCall( clAmdFftSetPlanInStride  ( plHandle, dim, clStridesIn ) );
+    openCLSafeCall( clAmdFftSetPlanOutStride ( plHandle, dim, clStridesOut ) );
+    // The distance between batch entries is taken from the matching stride array:
+    // input distance from the input strides, output distance from the output strides.
+    openCLSafeCall( clAmdFftSetPlanDistance  ( plHandle, clStridesIn[ dim ], clStridesOut[ dim ]) );
+    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &(Context::getContext()->impl->clCmdQueue), NULL, NULL ) );
+}
+// Unregisters this plan from planStore and destroys the underlying clAmdFft plan.
+cv::ocl::FftPlan::~FftPlan()
+{
+    for(size_t i = 0; i < planStore.size(); i ++)
+    {
+        if(planStore[i]->plHandle == plHandle)
+        {
+            planStore.erase(planStore.begin()+ i);
+            // getPlan() stores each plan once, so stop here instead of continuing
+            // with an index that no longer matches the shifted vector.
+            break;
+        }
+    }
+    openCLSafeCall( clAmdFftDestroyPlan( &plHandle ) );
+}
+
+// Returns the handle of a baked plan for the given parameters, reusing a cached
+// plan when all of them match and baking (and caching) a new one otherwise.
+clAmdFftPlanHandle cv::ocl::FftPlan::getPlan(Size _dft_size, int _src_step, int _dst_step, int _flags, FftType _type)
+{
+    for(vector<FftPlan*>::iterator it = planStore.begin(); it != planStore.end(); ++it)
+    {
+        const FftPlan * cached = *it;
+        const bool sameGeometry = cached->dft_size.width == _dft_size.width &&
+                                  cached->dft_size.height == _dft_size.height &&
+                                  cached->src_step == _src_step &&
+                                  cached->dst_step == _dst_step;
+        if(sameGeometry && cached->flags == _flags && cached->type == _type)
+        {
+            return cached->plHandle;
+        }
+    }
+    // nothing cached for these parameters: bake, remember, return
+    FftPlan *baked = new FftPlan(_dft_size, _src_step, _dst_step, _flags, _type);
+    planStore.push_back(baked);
+    return baked->plHandle;
+}
+
+// Forward or inverse discrete Fourier transform of a floating point matrix via clAmdFft.
+//
+// src      - CV_32FC1 (real) or CV_32FC2 (complex) input
+// dst      - output, allocated here: CV_32FC2 for complex output, CV_32FC1 for complex-to-real
+// dft_size - size of the transform; Size(0,0) selects src.size(). Its area must pass the
+//            getOptimalDFTSize() check below.
+// flags    - DFT_INVERSE selects the backward transform, DFT_REAL_OUTPUT requests real
+//            output, DFT_ROWS is only accepted for complex-to-complex; DFT_SCALE is rejected.
+//
+// Real-to-real transforms are not supported. The call blocks until the transform has finished.
+void cv::ocl::dft(const oclMat& src, oclMat& dst, Size dft_size, int flags)
+{
+    if(dft_size == Size(0,0))
+    {
+        dft_size = src.size();
+    }
+    // check if the given dft size is of optimal dft size
+    CV_Assert(dft_size.area() == getOptimalDFTSize(dft_size.area()));
+
+    // similar assertions with cuda module
+    CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);
+
+    // we don't support DFT_SCALE flag
+    CV_Assert(!(DFT_SCALE & flags));
+
+    int is_row_dft         = flags & DFT_ROWS;
+    int is_inverse         = flags & DFT_INVERSE;
+    bool is_complex_input  = src.channels() == 2;
+    bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
+
+    // We don't support real-to-real transform
+    CV_Assert(is_complex_input || is_complex_output);
+    // bit 0: complex input, bit 1: complex output -- matches the FftType enumerators
+    FftType type = (FftType)(is_complex_input << 0 | is_complex_output << 1);
+
+    switch(type)
+    {
+    case C2C:
+        dst.create(src.rows, src.cols, CV_32FC2);
+        break;
+    case R2C:
+        CV_Assert(!is_row_dft); // this is not supported yet
+        dst.create(src.rows, src.cols/2 + 1, CV_32FC2);
+        break;
+    case C2R:
+        CV_Assert(dft_size.width / 2 + 1 == src.cols && dft_size.height == src.rows);
+        CV_Assert(!is_row_dft); // this is not supported yet
+        dst.create(src.rows, dft_size.width, CV_32FC1);
+        break;
+    default:
+        //std::runtime_error("does not support this convertion!");
+        cout << "Does not support this convertion!" << endl;
+        throw exception();
+        break;
+    }
+    clAmdFftPlanHandle plHandle = FftPlan::getPlan(dft_size, src.step, dst.step, flags, type);
+
+    //get the buffersize
+    size_t buffersize=0;
+    openCLSafeCall( clAmdFftGetTmpBufSize(plHandle, &buffersize ) );
+
+    //allocate the intermediate buffer (only when the plan asks for one)
+    cl_mem clMedBuffer=NULL;
+    if (buffersize)
+    {
+        cl_int medstatus;
+        clMedBuffer = clCreateBuffer ( src.clCxt->impl->clContext, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+        openCLSafeCall( medstatus );
+    }
+    openCLSafeCall( clAmdFftEnqueueTransform( plHandle,
+        is_inverse?CLFFT_BACKWARD:CLFFT_FORWARD,
+        1,
+        &src.clCxt->impl->clCmdQueue,
+        0, NULL, NULL,
+        (cl_mem*)&src.data, (cl_mem*)&dst.data, clMedBuffer ) );
+    // wait for completion so the temporary buffer can be released safely
+    openCLSafeCall( clFinish(src.clCxt->impl->clCmdQueue) );
+    if(clMedBuffer)
+    {
+        openCLFree(clMedBuffer);
+    }
+}
+
+#endif
+#endif //HAVE_CLAMDFFT
--- a/modules/ocl/src/gemm.cpp
+++ b/modules/ocl/src/gemm.cpp
@@ -0,0 +1,161 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <iomanip>
+#include "precomp.hpp"
+
+#ifdef HAVE_CLAMDBLAS
+
+#include "clAmdBlas.h"
+
+#if !defined (HAVE_OPENCL)
+// Fallback used when the library is built without OpenCL support: this file implements
+// gemm(), so the stub must define gemm() (it previously defined dft() by mistake).
+void cv::ocl::gemm(const oclMat&, const oclMat&, double, const oclMat&, double, oclMat&, int) { throw_nogpu(); }
+#else
+
+using namespace cv;
+
+	// Matrix product on the OpenCL device, delegated to the clAmdBlas *gemmEx routines.
+	//
+	// dst is first seeded with src3 (or zero-filled when src3 is empty) and then handed to
+	// clAmdBlas as the C operand together with alpha and beta, with src1/src2 as the A/B
+	// operands. GEMM_1_T / GEMM_2_T in flags request transposition of src1 / src2;
+	// GEMM_3_T is rejected. Supported element types: CV_32FC1, CV_64FC1, CV_32FC2, CV_64FC2.
+	void cv::ocl::gemm(const oclMat& src1, const oclMat& src2, double alpha,
+		const oclMat& src3, double beta, oclMat& dst, int flags)
+	{
+		CV_Assert(src1.cols == src2.rows &&
+			(src3.empty() || (src1.rows == src3.rows && src2.cols == src3.cols)));
+		CV_Assert(!(cv::GEMM_3_T & flags)); // cv::GEMM_3_T is not supported
+
+		// Reject unsupported or mismatched element types up front instead of silently
+		// returning the seeded dst (the switch below has no case for other types).
+		const int type = src1.type();
+		CV_Assert(type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2);
+		CV_Assert(src2.type() == type && (src3.empty() || src3.type() == type));
+
+		if(src3.empty())
+		{
+			dst.create(src1.rows, src2.cols, type);
+			dst.setTo(Scalar::all(0));
+		}
+		else
+		{
+			src3.copyTo(dst);
+		}
+		openCLSafeCall( clAmdBlasSetup() );
+
+		const clAmdBlasTranspose transA = (cv::GEMM_1_T & flags)?clAmdBlasTrans:clAmdBlasNoTrans;
+		const clAmdBlasTranspose transB = (cv::GEMM_2_T & flags)?clAmdBlasTrans:clAmdBlasNoTrans;
+		const clAmdBlasOrder     order  = clAmdBlasRowMajor;
+
+		const int M = src1.rows;
+		const int N = src2.cols;
+		const int K = src1.cols;
+
+		// step/offset are divided by the element size below, i.e. they are converted
+		// from byte counts into the element counts passed to clAmdBlas.
+		const int lda  = src1.step;
+		const int ldb  = src2.step;
+		const int ldc  = dst.step;
+		const int offa = src1.offset;
+		const int offb = src2.offset;
+		const int offc = dst.offset;
+
+		// The status is checked only after clAmdBlasTeardown() so that a failing call
+		// does not leave the library set up.
+		clAmdBlasStatus status = clAmdBlasSuccess;
+		switch(type)
+		{
+		case CV_32FC1:
+			{
+				const int esz = sizeof(float);
+				status = clAmdBlasSgemmEx(order, transA, transB, M, N, K,
+					static_cast<cl_float>(alpha), (const cl_mem)src1.data, offa / esz, lda / esz,
+					(const cl_mem)src2.data, offb / esz, ldb / esz,
+					static_cast<cl_float>(beta), (cl_mem)dst.data, offc / esz, ldc / esz,
+					1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL);
+			}
+			break;
+		case CV_64FC1:
+			{
+				const int esz = sizeof(double);
+				status = clAmdBlasDgemmEx(order, transA, transB, M, N, K,
+					alpha, (const cl_mem)src1.data, offa / esz, lda / esz,
+					(const cl_mem)src2.data, offb / esz, ldb / esz,
+					beta, (cl_mem)dst.data, offc / esz, ldc / esz,
+					1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL);
+			}
+			break;
+		case CV_32FC2:
+			{
+				const int esz = sizeof(std::complex<float>);
+				// Real scalars are promoted to complex values with a zero imaginary part.
+				cl_float2 alpha_2 = {{static_cast<cl_float>(alpha), 0}};
+				cl_float2 beta_2  = {{static_cast<cl_float>(beta), 0}};
+				status = clAmdBlasCgemmEx(order, transA, transB, M, N, K,
+					alpha_2, (const cl_mem)src1.data, offa / esz, lda / esz,
+					(const cl_mem)src2.data, offb / esz, ldb / esz,
+					beta_2, (cl_mem)dst.data, offc / esz, ldc / esz,
+					1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL);
+			}
+			break;
+		case CV_64FC2:
+			{
+				const int esz = sizeof(std::complex<double>);
+				cl_double2 alpha_2 = {{alpha, 0}};
+				cl_double2 beta_2  = {{beta, 0}};
+				status = clAmdBlasZgemmEx(order, transA, transB, M, N, K,
+					alpha_2, (const cl_mem)src1.data, offa / esz, lda / esz,
+					(const cl_mem)src2.data, offb / esz, ldb / esz,
+					beta_2, (cl_mem)dst.data, offc / esz, ldc / esz,
+					1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL);
+			}
+			break;
+		default:
+			// Unreachable: the element type was validated above.
+			break;
+		}
+		clAmdBlasTeardown();
+		openCLSafeCall( status );
+	}
+#endif
+#endif
--- a/modules/ocl/src/kernels/blend_linear.cl
+++ b/modules/ocl/src/kernels/blend_linear.cl
@@ -0,0 +1,196 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, MulticoreWare Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Liu Liujun, liujun@multicorewareinc.com 
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+// Weighted blend of two single-channel uchar images, one work-item per pixel:
+//   dst = (img1 * weight1 + img2 * weight2) / (weight1 + weight2 + 1e-5f)
+// istep is used as-is as the image row stride; wstep is divided by sizeof(float)
+// to obtain the row stride of the float weight maps.
+__kernel void BlendLinear_C1_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Work-items that fall outside the image do nothing.
+	if (col >= cols || row >= rows)
+		return;
+
+	const int pix  = row * istep + col;
+	const int wIdx = row * (wstep / sizeof(float)) + col;
+	const float a = weight1[wIdx];
+	const float b = weight2[wIdx];
+
+	// The small constant keeps the divisor non-zero when both weights are zero.
+	dst[pix] = (img1[pix] * a + img2[pix] * b) / (a + b + 1e-5f);
+}
+
+// Weighted blend of two 3-channel uchar images, one work-item per channel sample:
+//   dst = (img1 * weight1 + img2 * weight2) / (weight1 + weight2 + 1e-5f)
+// Global id 0 indexes samples within a row; dividing it by 3 gives the pixel column
+// used for the bounds check and for the per-pixel weight lookup.
+__kernel void BlendLinear_C3_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	const int sample = get_global_id(0);
+	const int row    = get_global_id(1);
+	const int col    = sample / 3;
+
+	// Work-items that fall outside the image do nothing.
+	if (col >= cols || row >= rows)
+		return;
+
+	const int pix  = row * istep + sample;
+	const int wIdx = row * (wstep / sizeof(float)) + col;
+	const float a = weight1[wIdx];
+	const float b = weight2[wIdx];
+
+	// The small constant keeps the divisor non-zero when both weights are zero.
+	dst[pix] = (img1[pix] * a + img2[pix] * b) / (a + b + 1e-5f);
+}
+
+// Weighted blend of two 4-channel uchar images, one work-item per channel sample:
+//   dst = (img1 * weight1 + img2 * weight2) / (weight1 + weight2 + 1e-5f)
+// Global id 0 indexes samples within a row; dividing it by 4 gives the pixel column
+// used for the bounds check and for the per-pixel weight lookup.
+__kernel void BlendLinear_C4_D0(
+	__global uchar *dst,
+	__global uchar *img1,
+	__global uchar *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	const int sample = get_global_id(0);
+	const int row    = get_global_id(1);
+	const int col    = sample / 4;
+
+	// Work-items that fall outside the image do nothing.
+	if (col >= cols || row >= rows)
+		return;
+
+	const int pix  = row * istep + sample;
+	const int wIdx = row * (wstep / sizeof(float)) + col;
+	const float a = weight1[wIdx];
+	const float b = weight2[wIdx];
+
+	// The small constant keeps the divisor non-zero when both weights are zero.
+	dst[pix] = (img1[pix] * a + img2[pix] * b) / (a + b + 1e-5f);
+}
+
+// Weighted blend of two single-channel float images, one work-item per pixel:
+//   dst = (img1 * weight1 + img2 * weight2) / (weight1 + weight2 + 1e-5f)
+// Both istep and wstep are divided by sizeof(float) to obtain row strides in elements.
+__kernel void BlendLinear_C1_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Work-items that fall outside the image do nothing.
+	if (col >= cols || row >= rows)
+		return;
+
+	const int pix  = row * (istep / sizeof(float)) + col;
+	const int wIdx = row * (wstep / sizeof(float)) + col;
+	const float a = weight1[wIdx];
+	const float b = weight2[wIdx];
+
+	// The small constant keeps the divisor non-zero when both weights are zero.
+	dst[pix] = (img1[pix] * a + img2[pix] * b) / (a + b + 1e-5f);
+}
+
+// Weighted blend of two 3-channel float images, one work-item per channel sample:
+//   dst = (img1 * weight1 + img2 * weight2) / (weight1 + weight2 + 1e-5f)
+// Global id 0 indexes samples within a row; dividing it by 3 gives the pixel column
+// used for the bounds check and for the per-pixel weight lookup.
+__kernel void BlendLinear_C3_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	const int sample = get_global_id(0);
+	const int row    = get_global_id(1);
+	const int col    = sample / 3;
+
+	// Work-items that fall outside the image do nothing.
+	if (col >= cols || row >= rows)
+		return;
+
+	const int pix  = row * (istep / sizeof(float)) + sample;
+	const int wIdx = row * (wstep / sizeof(float)) + col;
+	const float a = weight1[wIdx];
+	const float b = weight2[wIdx];
+
+	// The small constant keeps the divisor non-zero when both weights are zero.
+	dst[pix] = (img1[pix] * a + img2[pix] * b) / (a + b + 1e-5f);
+}
+
+// Weighted blend of two 4-channel float images, one work-item per channel sample:
+//   dst = (img1 * weight1 + img2 * weight2) / (weight1 + weight2 + 1e-5f)
+// Global id 0 indexes samples within a row; dividing it by 4 gives the pixel column
+// used for the bounds check and for the per-pixel weight lookup.
+__kernel void BlendLinear_C4_D5(
+	__global float *dst,
+	__global float *img1,
+	__global float *img2,
+	__global float *weight1,
+	__global float *weight2,
+	int rows,
+	int cols,
+	int istep,
+	int wstep
+	)
+{
+	const int sample = get_global_id(0);
+	const int row    = get_global_id(1);
+	const int col    = sample / 4;
+
+	// Work-items that fall outside the image do nothing.
+	if (col >= cols || row >= rows)
+		return;
+
+	const int pix  = row * (istep / sizeof(float)) + sample;
+	const int wIdx = row * (wstep / sizeof(float)) + col;
+	const float a = weight1[wIdx];
+	const float b = weight2[wIdx];
+
+	// The small constant keeps the divisor non-zero when both weights are zero.
+	dst[pix] = (img1[pix] * a + img2[pix] * b) / (a + b + 1e-5f);
+}
--- a/modules/ocl/src/kernels/imgproc_columnsum.cl
+++ b/modules/ocl/src/kernels/imgproc_columnsum.cl
@@ -0,0 +1,80 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Chunpeng Zhang chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#if defined (__ATI__)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+
+#elif defined (__NVIDIA__)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+////////////////////////////////////////////////////////////////////
+///////////////////////// columnSum ////////////////////////////////
+////////////////////////////////////////////////////////////////////
+/// CV_32FC1
+// Running (prefix) sum down each column of a float image: one work-item per column,
+// dst(y, x) = src(0, x) + ... + src(y, x). srcStep/dstStep are shifted right by 2
+// (divided by 4) to turn them into float-element row strides.
+__kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep)
+{
+	const int col = get_global_id(0);
+
+	// Columns beyond the image width have no work.
+	if (col >= srcCols)
+		return;
+
+	const int srcStride = srcStep >> 2;
+	const int dstStride = dstStep >> 2;
+
+	__global float* in  = src + col;
+	__global float* out = dst + col;
+	float running = 0;
+
+	for (int row = 0; row < srcRows; ++row, in += srcStride, out += dstStride)
+	{
+		running += *in;
+		*out = running;
+	}
+}
--- a/modules/ocl/src/kernels/match_template.cl
+++ b/modules/ocl/src/kernels/match_template.cl
@@ -0,0 +1,824 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+#if defined (__ATI__)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+
+#elif defined (__NVIDIA__)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+// Element type of the img_sqsums buffers read by the kernels below: double when
+// USE_SQR_INTEGRAL is not defined and __ATI__ or __NVIDIA__ is defined, ulong otherwise.
+#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__))
+#define TYPE_IMAGE_SQSUM double
+#else
+#define TYPE_IMAGE_SQSUM ulong
+#endif
+
+//////////////////////////////////////////////////
+// utilities
+// Index helpers: element index at column offset ox / row offset oy from the current
+// work-item. They are not self-contained: they expand to the identifiers gidx, gidy and
+// img_sqsums_step/img_sqsums_offset (resp. img_sums_step/img_sums_offset), which must
+// exist in the calling kernel. The arguments are not parenthesized in the expansion.
+#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox)
+#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
+// normAcc* are accurate normalization routines which make GPU matchTemplate
+// consistent with CPU one
+// Guarded division num / denum:
+//   |num| < denum                    -> num / denum
+//   denum <= |num| < 1.125 * denum   -> +1 or -1, following the sign of num
+//   otherwise                        -> 0
+float normAcc(float num, float denum)
+{
+	const float magnitude = fabs(num);
+
+	if(magnitude < denum)
+		return num / denum;
+
+	// Slightly out-of-range ratios are clamped to +/-1; anything else maps to 0.
+	return (magnitude < denum * 1.125f) ? (num > 0 ? 1 : -1) : 0;
+}
+
+// Guarded division num / denum, variant used by the SQDIFF_NORMED kernel:
+//   |num| < denum                    -> num / denum
+//   denum <= |num| < 1.125 * denum   -> +1 or -1, following the sign of num
+//   otherwise                        -> 1
+float normAcc_SQDIFF(float num, float denum)
+{
+	const float magnitude = fabs(num);
+
+	if(magnitude < denum)
+		return num / denum;
+
+	// Slightly out-of-range ratios are clamped to +/-1; anything else maps to 1.
+	return (magnitude < denum * 1.125f) ? (num > 0 ? 1 : -1) : 1;
+}
+//////////////////////////////////////////////////////////////////////
+// normalize
+
+// Normalizes res in place: each entry is divided (through normAcc) by
+// sqrt(window_sqsum * tpl_sqsum), where window_sqsum is the four-corner difference of
+// img_sqsums over a tpl_cols x tpl_rows window anchored at the work-item
+// (img_sqsums is presumably an integral image of squared pixel values).
+__kernel 
+void normalizeKernel_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	// gidx/gidy and the step/offset names are consumed by SQSUMS_PTR, so they keep their names.
+	const int gidx = get_global_id(0);
+	const int gidy = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(gidx >= res_cols || gidy >= res_rows)
+		return;
+
+	const TYPE_IMAGE_SQSUM far_far   = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
+	const TYPE_IMAGE_SQSUM far_near  = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
+	const TYPE_IMAGE_SQSUM near_far  = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
+	const TYPE_IMAGE_SQSUM near_near = img_sqsums[SQSUMS_PTR(0, 0)];
+	const float window_sqsum = (float)((far_far - far_near) - (near_far - near_near));
+
+	const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	res[res_idx] = normAcc(res[res_idx], sqrt(window_sqsum * tpl_sqsum));
+}
+
+// Rewrites res in place as window_sqsum - 2 * res + tpl_sqsum, where window_sqsum is the
+// four-corner difference of img_sqsums over a tpl_cols x tpl_rows window anchored at the
+// work-item (img_sqsums is presumably an integral image of squared pixel values, and res
+// presumably holds a cross-correlation on entry - confirm against the host code).
+__kernel 
+void matchTemplate_Prepared_SQDIFF_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	// gidx/gidy and the step/offset names are consumed by SQSUMS_PTR, so they keep their names.
+	const int gidx = get_global_id(0);
+	const int gidy = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(gidx >= res_cols || gidy >= res_rows)
+		return;
+
+	const TYPE_IMAGE_SQSUM far_far   = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
+	const TYPE_IMAGE_SQSUM far_near  = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
+	const TYPE_IMAGE_SQSUM near_far  = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
+	const TYPE_IMAGE_SQSUM near_near = img_sqsums[SQSUMS_PTR(0, 0)];
+	const float window_sqsum = (float)((far_far - far_near) - (near_far - near_near));
+
+	const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	res[res_idx] = window_sqsum - 2.f * res[res_idx] + tpl_sqsum;
+}
+
+// Rewrites res in place as
+//   normAcc_SQDIFF(window_sqsum - 2 * res + tpl_sqsum, sqrt(window_sqsum * tpl_sqsum)),
+// where window_sqsum is the four-corner difference of img_sqsums over a
+// tpl_cols x tpl_rows window anchored at the work-item (img_sqsums is presumably an
+// integral image of squared pixel values).
+__kernel 
+void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
+(
+	__global const TYPE_IMAGE_SQSUM * img_sqsums,
+	__global float * res,
+	ulong tpl_sqsum,
+	int res_rows,
+	int res_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	int res_offset,
+	int res_step
+)
+{
+	// gidx/gidy and the step/offset names are consumed by SQSUMS_PTR, so they keep their names.
+	const int gidx = get_global_id(0);
+	const int gidy = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	img_sqsums_step /= sizeof(*img_sqsums);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(gidx >= res_cols || gidy >= res_rows)
+		return;
+
+	const TYPE_IMAGE_SQSUM far_far   = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
+	const TYPE_IMAGE_SQSUM far_near  = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
+	const TYPE_IMAGE_SQSUM near_far  = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
+	const TYPE_IMAGE_SQSUM near_near = img_sqsums[SQSUMS_PTR(0, 0)];
+	const float window_sqsum = (float)((far_far - far_near) - (near_far - near_near));
+
+	const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	res[res_idx] = normAcc_SQDIFF(window_sqsum - 2.f * res[res_idx] + tpl_sqsum,
+									sqrt(window_sqsum * tpl_sqsum));
+}
+
+//////////////////////////////////////////////////
+// SQDIFF
+// Brute-force sum of squared differences between a single-channel uchar template and
+// the image window whose top-left corner is at the work-item; one result per work-item.
+// The sum is accumulated in an int and stored to res as float.
+__kernel 
+void matchTemplate_Naive_SQDIFF_C1_D0
+(
+	__global const uchar * img,
+	__global const uchar * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Only the result step/offset need converting; they are divided by the float size.
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(col >= res_cols || row >= res_rows)
+		return;
+
+	int acc = 0;
+	for(int ty = 0; ty < tpl_rows; ty ++)
+	{
+		// Start of the current window row and of the matching template row.
+		__global const uchar * img_row = img + mad24(row + ty, img_step, col + img_offset);
+		__global const uchar * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
+		for(int tx = 0; tx < tpl_cols; tx ++)
+		{
+			const int diff = img_row[tx] - tpl_row[tx];
+			acc = mad24(diff, diff, acc);
+		}
+	}
+	res[mad24(row, res_step, res_offset + col)] = acc;
+}
+
+// Brute-force sum of squared differences between a single-channel float template and
+// the image window whose top-left corner is at the work-item; one result per work-item.
+__kernel 
+void matchTemplate_Naive_SQDIFF_C1_D5
+(
+	__global const float * img,
+	__global const float * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_offset /= sizeof(*img);
+	img_step   /= sizeof(*img);
+	tpl_offset /= sizeof(*tpl);
+	tpl_step   /= sizeof(*tpl);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(col >= res_cols || row >= res_rows)
+		return;
+
+	float acc = 0;
+	for(int ty = 0; ty < tpl_rows; ty ++)
+	{
+		// Start of the current window row and of the matching template row.
+		__global const float * img_row = img + mad24(row + ty, img_step, col + img_offset);
+		__global const float * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
+		for(int tx = 0; tx < tpl_cols; tx ++)
+		{
+			const float diff = img_row[tx] - tpl_row[tx];
+			acc = mad(diff, diff, acc);
+		}
+	}
+	res[mad24(row, res_step, res_offset + col)] = acc;
+}
+
+// Brute-force sum of squared differences between a 4-channel uchar template and the
+// image window whose top-left corner is at the work-item. Per-channel sums are kept in
+// an int4 and added together (x + y + z + w) before being stored to res as float.
+__kernel 
+void matchTemplate_Naive_SQDIFF_C4_D0
+(
+	__global const uchar4 * img,
+	__global const uchar4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_offset /= sizeof(*img);
+	img_step   /= sizeof(*img);
+	tpl_offset /= sizeof(*tpl);
+	tpl_step   /= sizeof(*tpl);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(col >= res_cols || row >= res_rows)
+		return;
+
+	int4 acc = (int4)(0, 0, 0, 0);
+	for(int ty = 0; ty < tpl_rows; ty ++)
+	{
+		// Start of the current window row and of the matching template row.
+		__global const uchar4 * img_row = img + mad24(row + ty, img_step, col + img_offset);
+		__global const uchar4 * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
+		for(int tx = 0; tx < tpl_cols; tx ++)
+		{
+			// Widen to int BEFORE subtracting: a uchar4 subtraction would wrap around
+			// instead of producing negative differences.
+			const int4 diff = convert_int4(img_row[tx]) - convert_int4(tpl_row[tx]);
+			acc = mad24(diff, diff, acc);
+		}
+	}
+	res[mad24(row, res_step, res_offset + col)] = acc.x + acc.y + acc.z + acc.w;
+}
+
+// Brute-force sum of squared differences between a 4-channel float template and the
+// image window whose top-left corner is at the work-item. Per-channel sums are kept in
+// a float4 and added together (x + y + z + w) before being stored to res.
+__kernel 
+void matchTemplate_Naive_SQDIFF_C4_D5
+(
+	__global const float4 * img,
+	__global const float4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_offset /= sizeof(*img);
+	img_step   /= sizeof(*img);
+	tpl_offset /= sizeof(*tpl);
+	tpl_step   /= sizeof(*tpl);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(col >= res_cols || row >= res_rows)
+		return;
+
+	float4 acc = (float4)(0, 0, 0, 0);
+	for(int ty = 0; ty < tpl_rows; ty ++)
+	{
+		// Start of the current window row and of the matching template row.
+		__global const float4 * img_row = img + mad24(row + ty, img_step, col + img_offset);
+		__global const float4 * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
+		for(int tx = 0; tx < tpl_cols; tx ++)
+		{
+			// Component-wise float difference.
+			const float4 diff = img_row[tx] - tpl_row[tx];
+			acc = mad(diff, diff, acc);
+		}
+	}
+	res[mad24(row, res_step, res_offset + col)] = acc.x + acc.y + acc.z + acc.w;
+}
+
+//////////////////////////////////////////////////
+// CCORR
+// Brute-force cross-correlation (sum of products) between a single-channel uchar
+// template and the image window whose top-left corner is at the work-item.
+// The sum is accumulated in an int and stored to res as float.
+__kernel 
+void matchTemplate_Naive_CCORR_C1_D0
+(
+	__global const uchar * img,
+	__global const uchar * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Only the result step/offset need converting; they are divided by the float size.
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(col >= res_cols || row >= res_rows)
+		return;
+
+	int acc = 0;
+	for(int ty = 0; ty < tpl_rows; ty ++)
+	{
+		// Start of the current window row and of the matching template row.
+		__global const uchar * img_row = img + mad24(row + ty, img_step, col + img_offset);
+		__global const uchar * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
+		for(int tx = 0; tx < tpl_cols; tx ++)
+		{
+			acc = mad24(img_row[tx], tpl_row[tx], acc);
+		}
+	}
+	res[mad24(row, res_step, res_offset + col)] = acc;
+}
+
+// Brute-force cross-correlation (sum of products) between a single-channel float
+// template and the image window whose top-left corner is at the work-item.
+__kernel 
+void matchTemplate_Naive_CCORR_C1_D5
+(
+	__global const float * img,
+	__global const float * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_offset /= sizeof(*img);
+	img_step   /= sizeof(*img);
+	tpl_offset /= sizeof(*tpl);
+	tpl_step   /= sizeof(*tpl);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(col >= res_cols || row >= res_rows)
+		return;
+
+	float acc = 0;
+	for(int ty = 0; ty < tpl_rows; ty ++)
+	{
+		// Start of the current window row and of the matching template row.
+		__global const float * img_row = img + mad24(row + ty, img_step, col + img_offset);
+		__global const float * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
+		for(int tx = 0; tx < tpl_cols; tx ++)
+		{
+			acc = mad(img_row[tx], tpl_row[tx], acc);
+		}
+	}
+	res[mad24(row, res_step, res_offset + col)] = acc;
+}
+
+// Brute-force cross-correlation (sum of products) between a 4-channel uchar template
+// and the image window whose top-left corner is at the work-item. Per-channel sums are
+// kept in an int4 and added together (x + y + z + w) before being stored to res as float.
+__kernel 
+void matchTemplate_Naive_CCORR_C4_D0
+(
+	__global const uchar4 * img,
+	__global const uchar4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_offset /= sizeof(*img);
+	img_step   /= sizeof(*img);
+	tpl_offset /= sizeof(*tpl);
+	tpl_step   /= sizeof(*tpl);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(col >= res_cols || row >= res_rows)
+		return;
+
+	int4 acc = (int4)(0, 0, 0, 0);
+	for(int ty = 0; ty < tpl_rows; ty ++)
+	{
+		// Start of the current window row and of the matching template row.
+		__global const uchar4 * img_row = img + mad24(row + ty, img_step, col + img_offset);
+		__global const uchar4 * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
+		for(int tx = 0; tx < tpl_cols; tx ++)
+		{
+			// Widen both operands to int4 before the multiply-accumulate.
+			const int4 img_px = convert_int4(img_row[tx]);
+			const int4 tpl_px = convert_int4(tpl_row[tx]);
+			acc = mad24(img_px, tpl_px, acc);
+		}
+	}
+	res[mad24(row, res_step, res_offset + col)] = acc.x + acc.y + acc.z + acc.w;
+}
+
+// Brute-force cross-correlation (sum of products) between a 4-channel float template
+// and the image window whose top-left corner is at the work-item. Per-channel sums are
+// kept in a float4 and added together (x + y + z + w) before being stored to res.
+__kernel 
+void matchTemplate_Naive_CCORR_C4_D5
+(
+	__global const float4 * img,
+	__global const float4 * tpl,
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int img_offset,
+	int tpl_offset,
+	int res_offset,
+	int img_step,
+	int tpl_step,
+	int res_step
+)
+{
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_offset /= sizeof(*img);
+	img_step   /= sizeof(*img);
+	tpl_offset /= sizeof(*tpl);
+	tpl_step   /= sizeof(*tpl);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(col >= res_cols || row >= res_rows)
+		return;
+
+	float4 acc = (float4)(0, 0, 0, 0);
+	for(int ty = 0; ty < tpl_rows; ty ++)
+	{
+		// Start of the current window row and of the matching template row.
+		__global const float4 * img_row = img + mad24(row + ty, img_step, col + img_offset);
+		__global const float4 * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
+		for(int tx = 0; tx < tpl_cols; tx ++)
+		{
+			// The operands are already float4, so they are used directly.
+			acc = mad(img_row[tx], tpl_row[tx], acc);
+		}
+	}
+	res[mad24(row, res_step, res_offset + col)] = acc.x + acc.y + acc.z + acc.w;
+}
+
+//////////////////////////////////////////////////
+// CCOFF
+// Subtracts window_sum * tpl_sum from each res entry in place, where window_sum is the
+// four-corner difference of img_sums over a tpl_cols x tpl_rows window anchored at the
+// work-item (img_sums is presumably an integral image of the source - confirm against
+// the host code).
+__kernel 
+void matchTemplate_Prepared_CCOFF_C1_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	__global const uint * img_sums,
+	int img_sums_offset,
+	int img_sums_step,
+	float tpl_sum
+)
+{
+	// gidx/gidy and the step/offset names are consumed by SUMS_PTR, so they keep their names.
+	const int gidx = get_global_id(0);
+	const int gidy = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_sums_step     /= sizeof(*img_sums);
+	img_sums_offset   /= sizeof(*img_sums);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(gidx >= res_cols || gidy >= res_rows)
+		return;
+
+	const uint far_far   = img_sums[SUMS_PTR(tpl_cols, tpl_rows)];
+	const uint far_near  = img_sums[SUMS_PTR(tpl_cols, 0)];
+	const uint near_far  = img_sums[SUMS_PTR(0, tpl_rows)];
+	const uint near_near = img_sums[SUMS_PTR(0, 0)];
+	const float window_sum = (float)((far_far - far_near) - (near_far - near_near));
+
+	const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	res[res_idx] -= window_sum * tpl_sum;
+}
+// Four-channel variant: for each channel c, subtracts tpl_sum_c * window_sum_c from the
+// res entry, where window_sum_c is the four-corner difference of img_sums_c over a
+// tpl_cols x tpl_rows window anchored at the work-item. All four channel buffers share
+// the same offset and step.
+__kernel 
+void matchTemplate_Prepared_CCOFF_C4_D0
+(
+	__global float * res,
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	__global const uint * img_sums_c0,
+	__global const uint * img_sums_c1,
+	__global const uint * img_sums_c2,
+	__global const uint * img_sums_c3,
+	int img_sums_offset,
+	int img_sums_step,
+	float tpl_sum_c0,
+	float tpl_sum_c1,
+	float tpl_sum_c2,
+	float tpl_sum_c3
+)
+{
+	// gidx/gidy and the step/offset names are consumed by SUMS_PTR, so they keep their names.
+	const int gidx = get_global_id(0);
+	const int gidy = get_global_id(1);
+
+	// Steps and offsets are divided by the element size to get element counts.
+	img_sums_step     /= sizeof(*img_sums_c0);
+	img_sums_offset   /= sizeof(*img_sums_c0);
+	res_offset /= sizeof(*res);
+	res_step   /= sizeof(*res);
+
+	if(gidx >= res_cols || gidy >= res_rows)
+		return;
+
+	// The four corner indices are identical for every channel buffer.
+	const int far_far   = SUMS_PTR(tpl_cols, tpl_rows);
+	const int far_near  = SUMS_PTR(tpl_cols, 0);
+	const int near_far  = SUMS_PTR(0, tpl_rows);
+	const int near_near = SUMS_PTR(0, 0);
+
+	const float window_c0 = (float)(
+		(img_sums_c0[far_far] - img_sums_c0[far_near]) - (img_sums_c0[near_far] - img_sums_c0[near_near]));
+	const float window_c1 = (float)(
+		(img_sums_c1[far_far] - img_sums_c1[far_near]) - (img_sums_c1[near_far] - img_sums_c1[near_near]));
+	const float window_c2 = (float)(
+		(img_sums_c2[far_far] - img_sums_c2[far_near]) - (img_sums_c2[near_far] - img_sums_c2[near_near]));
+	const float window_c3 = (float)(
+		(img_sums_c3[far_far] - img_sums_c3[far_near]) - (img_sums_c3[near_far] - img_sums_c3[near_near]));
+
+	const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	float ccorr = res[res_idx];
+	ccorr -= tpl_sum_c0 * window_c0;
+	ccorr -= tpl_sum_c1 * window_c1;
+	ccorr -= tpl_sum_c2 * window_c2;
+	ccorr -= tpl_sum_c3 * window_c3;
+	res[res_idx] = ccorr;
+}
+
+__kernel // Single-channel normalized CCOFF: res = normAcc(res - image_sum * tpl_sum, sqrt(tpl_sqsum * (image_sqsum - weight * image_sum^2))).
+void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
+(
+	__global float * res, // read and overwritten in place
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	float weight, // multiplies image_sum^2 in the denominator; its value is chosen by the caller
+	__global const uint * img_sums, // indexed only through SUMS_PTR (defined outside this chunk)
+	int img_sums_offset,
+	int img_sums_step,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums, // element type comes from TYPE_IMAGE_SQSUM, defined outside this chunk
+	int img_sqsums_offset,
+	int img_sqsums_step,
+	float tpl_sum,
+	float tpl_sqsum
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	// The divisions below rescale offsets/steps by the element size (presumably bytes -> elements; confirm with the host code).
+	img_sqsums_step   /= sizeof(*img_sqsums);
+	img_sqsums_offset /= sizeof(*img_sqsums);
+	img_sums_offset   /= sizeof(*img_sums);
+	img_sums_step     /= sizeof(*img_sums);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	// Linear index of this work-item's result element: gidy * res_step + res_offset + gidx.
+	
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	// Only work-items inside the res_cols x res_rows area touch res.
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sum_ =  (float)( // (A - B) - (C - D) over four SUMS_PTR entries; presumably an integral-image window sum -- TODO confirm against SUMS_PTR
+			(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+		// Same four-corner pattern on the squared-sums buffer, indexed through SQSUMS_PTR.
+		float image_sqsum_ = (float)(
+			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+		res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum, // normAcc is defined outside this chunk; presumably a guarded num / denum
+							   sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
+	}
+}
+__kernel // Four-channel normalized CCOFF: res = normAcc(num, denum), with num and denum accumulated over the four channels (see body).
+void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
+(
+	__global float * res, // read and overwritten in place
+	int img_rows,
+	int img_cols,
+	int tpl_rows,
+	int tpl_cols, 
+	int res_rows,
+	int res_cols,
+	int res_offset,
+	int res_step,
+	float weight, // multiplies each channel's image_sum^2 in the denominator; its value is chosen by the caller
+	__global const uint * img_sums_c0, // one sums buffer per channel, all indexed through SUMS_PTR
+	__global const uint * img_sums_c1,
+	__global const uint * img_sums_c2,
+	__global const uint * img_sums_c3,
+	int img_sums_offset, // a single offset/step pair is shared by the four sums buffers
+	int img_sums_step,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c0, // one squared-sums buffer per channel, indexed through SQSUMS_PTR
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c1,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c2,
+	__global const TYPE_IMAGE_SQSUM * img_sqsums_c3,
+	int img_sqsums_offset, // a single offset/step pair is shared by the four squared-sums buffers
+	int img_sqsums_step,
+	float tpl_sum_c0,
+	float tpl_sum_c1,
+	float tpl_sum_c2,
+	float tpl_sum_c3,
+	float tpl_sqsum
+)
+{
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	// The divisions below rescale offsets/steps by the element size (presumably bytes -> elements; confirm with the host code).
+	img_sqsums_step   /= sizeof(*img_sqsums_c0);
+	img_sqsums_offset /= sizeof(*img_sqsums_c0);
+	img_sums_offset   /= sizeof(*img_sums_c0);
+	img_sums_step     /= sizeof(*img_sums_c0);
+	res_step   /= sizeof(*res);
+	res_offset /= sizeof(*res);
+	// Linear index of this work-item's result element: gidy * res_step + res_offset + gidx.
+	int res_idx = mad24(gidy, res_step, res_offset + gidx);
+	// Only work-items inside the res_cols x res_rows area touch res.
+	if(gidx < res_cols && gidy < res_rows)
+	{
+		float image_sum_c0 =  (float)( // per channel: (A - B) - (C - D) over four SUMS_PTR entries; presumably an integral-image window sum -- TODO confirm against SUMS_PTR
+			(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+		float image_sum_c1 =  (float)(
+			(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+		float image_sum_c2 =  (float)(
+			(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+		float image_sum_c3 =  (float)(
+			(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+		  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+		// Same four-corner pattern on the squared-sums buffers, indexed through SQSUMS_PTR.
+		float image_sqsum_c0 = (float)(
+			(img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c1 = (float)(
+			(img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c2 = (float)(
+			(img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
+		float image_sqsum_c3 = (float)(
+			(img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
+			(img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
+		// Numerator: stored correlation minus every channel's image_sum * tpl_sum.
+		float num = res[res_idx] - 
+			image_sum_c0 * tpl_sum_c0 -
+			image_sum_c1 * tpl_sum_c1 -
+			image_sum_c2 * tpl_sum_c2 -
+			image_sum_c3 * tpl_sum_c3;
+		float denum = sqrt( tpl_sqsum * ( // denominator: each channel contributes sqsum - weight * (its own sum)^2
+			image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
+			image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
+			image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
+			image_sqsum_c3 - weight * image_sum_c3 * image_sum_c3) // channel 3 squares its own sum, like channels 0-2
+			);
+		res[res_idx] = normAcc(num, denum); // normAcc is defined outside this chunk; presumably a guarded num / denum
+	}
+}
+
--- a/modules/ocl/src/kernels/pyr_down.cl
+++ b/modules/ocl/src/kernels/pyr_down.cl
@@ -0,0 +1,500 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+
+uchar round_uchar_uchar(uchar v) // identity: returns v unchanged, since a uchar needs no rounding or saturation
+{ 
+	return v;
+}
+
+uchar round_uchar_int(int v) // saturates a signed int to the uchar range: below 0 -> 0, above 255 -> 255, otherwise v
+{
+    return (uchar)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}
+
+uchar round_uchar_float(float v) // converts v to uchar: round to nearest (halves pushed upward, see below), then saturate via round_uchar_int
+{ 
+	if(v - convert_int_sat_rte(v) > 1e-6 || v - convert_int_sat_rte(v) < -1e-6) // v differs from its round-to-nearest-even integer by more than 1e-6
+	{
+		if(((int)v + 1) - (v + 0.5f) < 1e-6 && ((int)v + 1) - (v + 0.5f) > -1e-6) // v is within 1e-6 of (int)v + 0.5, i.e. sits on a rounding tie
+		{
+			v = (int)v + 0.51f; // move just past the tie so the conversion below rounds up instead of to the even neighbour
+		}
+	}
+    int iv = convert_int_sat_rte(v);
+    return round_uchar_int(iv); 
+}
+
+uchar4 round_uchar4_uchar4(uchar4 v) // identity: returns v unchanged, since a uchar4 needs no rounding or saturation
+{ 
+	return v;
+}
+
+uchar4 round_uchar4_int4(int4 v) // saturates each component of v to [0, 255]: negative -> 0, above 255 -> 255
+{ 
+	uchar4 result; // the (uint) casts below make negative inputs fail the range test, so they clamp to 0 instead of wrapping
+	result.x = (uchar)((uint)v.x <= 255 ? v.x : v.x > 0 ? 255 : 0); 
+	result.y = (uchar)((uint)v.y <= 255 ? v.y : v.y > 0 ? 255 : 0); 
+	result.z = (uchar)((uint)v.z <= 255 ? v.z : v.z > 0 ? 255 : 0); 
+	result.w = (uchar)((uint)v.w <= 255 ? v.w : v.w > 0 ? 255 : 0); 
+    return result; 
+}
+
+uchar4 round_uchar4_float4(float4 v) // converts each component to uchar: round to nearest (halves pushed upward), then saturate via round_uchar4_int4
+{ 
+	if(v.x - convert_int_sat_rte(v.x) > 1e-6 || v.x - convert_int_sat_rte(v.x) < -1e-6) // v.x differs from its round-to-nearest-even integer by more than 1e-6
+	{
+		if(((int)(v.x) + 1) - (v.x + 0.5f) < 1e-6 && ((int)(v.x) + 1) - (v.x + 0.5f) > -1e-6) // v.x is within 1e-6 of (int)v.x + 0.5, i.e. sits on a rounding tie
+		{
+			v.x = (int)(v.x) + 0.51f; // move just past the tie so the conversion below rounds up instead of to the even neighbour
+		}
+	}
+	if(v.y - convert_int_sat_rte(v.y) > 1e-6 || v.y - convert_int_sat_rte(v.y) < -1e-6) // same tie adjustment for the y component
+	{
+		if(((int)(v.y) + 1) - (v.y + 0.5f) < 1e-6 && ((int)(v.y) + 1) - (v.y + 0.5f) > -1e-6)
+		{
+			v.y = (int)(v.y) + 0.51f;
+		}
+	}
+	if(v.z - convert_int_sat_rte(v.z) > 1e-6 || v.z - convert_int_sat_rte(v.z) < -1e-6) // same tie adjustment for the z component
+	{
+		if(((int)(v.z) + 1) - (v.z + 0.5f) < 1e-6 && ((int)(v.z) + 1) - (v.z + 0.5f) > -1e-6)
+		{
+			v.z = (int)(v.z) + 0.51f;
+		}
+	}
+	if(v.w - convert_int_sat_rte(v.w) > 1e-6 || v.w - convert_int_sat_rte(v.w) < -1e-6) // same tie adjustment for the w component
+	{
+		if(((int)(v.w) + 1) - (v.w + 0.5f) < 1e-6 && ((int)(v.w) + 1) - (v.w + 0.5f) > -1e-6)
+		{
+			v.w = (int)(v.w) + 0.51f;
+		}
+	}
+    int4 iv = convert_int4_sat_rte(v); // saturating round-to-nearest-even conversion of all four components
+    return round_uchar4_int4(iv); 
+}
+
+
+
+
+// Folds a negative row index onto its magnitude (-y), then reduces the
+// value modulo the row count (last_row + 1).
+int idx_row_low(int y, int last_row)
+{
+    const int row_count = last_row + 1;
+    const int folded = (y < 0) ? -y : y;
+    return folded % row_count;
+}
+
+// Mirrors a row index back into the image using two absolute-difference steps:
+//   dist   = |last_row - y|      distance of y from the last row
+//   mapped = |last_row - dist|   that distance measured back from the last row
+// A y already inside [0, last_row] comes back unchanged, a y beyond last_row is
+// reflected about the last row, and a negative y comes back as -y.
+// The closing modulo by (last_row + 1) bounds whatever the two steps produced.
+// No check is made on last_row itself: last_row == -1 would make the modulo
+// divide by zero.
+int idx_row_high(int y, int last_row)
+{
+    const int to_edge = last_row - y;
+    int dist = to_edge;
+    if (to_edge < 0)
+        dist = y - last_row;
+
+    const int back = last_row - dist;
+    int mapped = back;
+    if (back < 0)
+        mapped = dist - last_row;
+
+    return mapped % (last_row + 1);
+}
+
+int idx_row(int y, int last_row) // row index used for border handling: y passed through idx_row_high, then idx_row_low
+{
+    return idx_row_low(idx_row_high(y, last_row), last_row);
+}
+
+// Folds a negative column index onto its magnitude (-x), then reduces the
+// value modulo the column count (last_col + 1).
+int idx_col_low(int x, int last_col)
+{
+    const int col_count = last_col + 1;
+    const int folded = (x < 0) ? -x : x;
+    return folded % col_count;
+}
+
+// Mirrors a column index back into the image using two absolute-difference steps:
+//   dist   = |last_col - x|      distance of x from the last column
+//   mapped = |last_col - dist|   that distance measured back from the last column
+// An x already inside [0, last_col] comes back unchanged, an x beyond last_col is
+// reflected about the last column, and a negative x comes back as -x.
+// The closing modulo by (last_col + 1) bounds whatever the two steps produced.
+// No check is made on last_col itself: last_col == -1 would make the modulo
+// divide by zero.
+// The arithmetic is the same as in idx_row_high, applied to columns.
+int idx_col_high(int x, int last_col)
+{
+    // Distance of x from the last column (absolute value of last_col - x).
+    const int forward = last_col - x;
+    const int dist = (forward < 0) ? (x - last_col) : forward;
+
+    // That distance stepped back from the last column, again as an absolute value.
+    const int backward = last_col - dist;
+    const int mapped = (backward < 0) ? (dist - last_col) : backward;
+
+    // Bound the result by the column count.
+    return mapped % (last_col + 1);
+}
+
+int idx_col(int x, int last_col) // column index used for border handling: x passed through idx_col_high, then idx_col_low
+{
+    return idx_col_low(idx_col_high(x, last_col), last_col);
+}
+
+__kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstOffset, int dstCols)
+{ // uchar, 1 channel: 5-tap vertical pass into local memory, then 5-tap horizontal pass that writes one output pixel per two source columns.
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+    // x is the source column of this work-item; y (group id in dimension 1) is the destination row.
+    __local float smem[256 + 4];
+    // smem: 256 column sums plus 2 halo entries per side -- NOTE(review): this sizing assumes a work-group of 256 work-items; confirm in the host code.
+    float sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+    // srcOffset and dstOffset are not used anywhere in this body -- NOTE(review): confirm callers never pass a non-zero offset.
+    sum = 0;
+    // Vertical pass at column x: weights 1/16, 4/16, 6/16, 4/16, 1/16 on rows src_y-2 .. src_y+2, indices remapped by idx_row/idx_col.
+    sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]);
+    sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]);
+    // Stored at slot 2 + local id, leaving slots 0 and 1 for the left halo.
+    smem[2 + get_local_id(0)] = sum;
+    // The first two work-items also compute the left halo (columns x - 2).
+    if (get_local_id(0) < 2)
+    {
+        const int left_x = x - 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
+		sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
+
+        smem[get_local_id(0)] = sum;
+    }
+    // Work-items with local id above 253 compute the right halo (columns x + 2); the threshold matches the 256-entry tile.
+    if (get_local_id(0) > 253)
+    {
+        const int right_x = x + 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
+		sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
+
+        smem[4 + get_local_id(0)] = sum;
+    }
+    // Every column sum must be in local memory before the horizontal pass reads its neighbours.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Horizontal pass: work-items 0..127 each combine five neighbouring column sums centred on even tile column tid2.
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * smem[2 + tid2 - 2];
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];
+        sum = sum + 0.375f  * smem[2 + tid2    ];
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+        // Source column of the tile's even element, halved to give the destination column.
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep + dst_x] = round_uchar_float(sum); // dstStep is used directly as the row stride of the uchar destination
+    }
+}
+
+__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstOffset, int dstCols)
+{ // uchar4 (4 channels): 5-tap vertical pass into local memory, then 5-tap horizontal pass that writes one output pixel per two source columns.
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+    // x is the source column of this work-item; y (group id in dimension 1) is the destination row.
+    __local float4 smem[256 + 4];
+    // smem: 256 column sums plus 2 halo entries per side -- NOTE(review): this sizing assumes a work-group of 256 work-items; confirm in the host code.
+    float4 sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+    // Filter weights broadcast to all four channels: co1 = 6/16 (centre), co2 = 4/16, co3 = 1/16 (outermost taps).
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+    // srcOffset and dstOffset are not used anywhere in this body -- NOTE(review): confirm callers never pass a non-zero offset.
+    sum = 0;
+    // Vertical pass at column x over rows src_y-2 .. src_y+2; the row offset is srcStep / 4 because the pointer is stepped in 4-byte char4 units.
+	sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+	sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
+    // Stored at slot 2 + local id, leaving slots 0 and 1 for the left halo.
+	smem[2 + get_local_id(0)] = sum;
+    // The first two work-items also compute the left halo (columns x - 2).
+	if (get_local_id(0) < 2)
+	{
+		const int left_x = x - 2;
+
+		sum = 0;
+
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
+
+		smem[get_local_id(0)] = sum;
+	}
+    // Work-items with local id above 253 compute the right halo (columns x + 2); the threshold matches the 256-entry tile.
+	if (get_local_id(0) > 253)
+	{
+		const int right_x = x + 2;
+
+		sum = 0;
+
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
+
+		smem[4 + get_local_id(0)] = sum;
+	}
+    // Every column sum must be in local memory before the horizontal pass reads its neighbours.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Horizontal pass: work-items 0..127 each combine five neighbouring column sums centred on even tile column tid2.
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + co3 * smem[2 + tid2 - 2];
+        sum = sum + co2   * smem[2 + tid2 - 1];
+        sum = sum + co1  * smem[2 + tid2    ];
+        sum = sum + co2   * smem[2 + tid2 + 1];
+        sum = sum + co3 * smem[2 + tid2 + 2];
+        // Source column of the tile's even element, halved to give the destination column.
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 4 + dst_x] = round_uchar4_float4(sum); // dstStep / 4: row stride in uchar4 elements (dstStep presumably in bytes)
+    }
+}
+
+__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float *dst, int dstStep, int dstOffset, int dstCols)
+{ // float, 1 channel: 5-tap vertical pass into local memory, then 5-tap horizontal pass that writes one output pixel per two source columns.
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+    // x is the source column of this work-item; y (group id in dimension 1) is the destination row.
+    __local float smem[256 + 4];
+    // smem: 256 column sums plus 2 halo entries per side -- NOTE(review): this sizing assumes a work-group of 256 work-items; confirm in the host code.
+    float sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+    // srcOffset and dstOffset are not used anywhere in this body -- NOTE(review): confirm callers never pass a non-zero offset.
+    sum = 0;
+    // Vertical pass at column x: weights 1/16, 4/16, 6/16, 4/16, 1/16 on rows src_y-2 .. src_y+2; rows are addressed as a char offset of row * srcStep.
+    sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)];
+    sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)];
+    // Stored at slot 2 + local id, leaving slots 0 and 1 for the left halo.
+    smem[2 + get_local_id(0)] = sum;
+    // The first two work-items also compute the left halo (columns x - 2).
+    if (get_local_id(0) < 2)
+    {
+        const int left_x = x - 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)];
+		sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)];
+
+        smem[get_local_id(0)] = sum;
+    }
+    // Work-items with local id above 253 compute the right halo (columns x + 2); the threshold matches the 256-entry tile.
+    if (get_local_id(0) > 253)
+    {
+        const int right_x = x + 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)];
+		sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)];
+
+        smem[4 + get_local_id(0)] = sum;
+    }
+    // Every column sum must be in local memory before the horizontal pass reads its neighbours.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Horizontal pass: work-items 0..127 each combine five neighbouring column sums centred on even tile column tid2.
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + 0.0625f * smem[2 + tid2 - 2];
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];
+        sum = sum + 0.375f  * smem[2 + tid2    ];
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+        // Source column of the tile's even element, halved to give the destination column.
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 4 + dst_x] = sum; // dstStep / 4: row stride in float elements (dstStep presumably in bytes); no rounding for float output
+    }
+}
+
+__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstOffset, int dstCols)
+{ // float4 (4 channels): 5-tap vertical pass into local memory, then 5-tap horizontal pass that writes one output pixel per two source columns.
+    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
+    const int y = get_group_id(1);
+    // x is the source column of this work-item; y (group id in dimension 1) is the destination row.
+    __local float4 smem[256 + 4];
+    // smem: 256 column sums plus 2 halo entries per side -- NOTE(review): this sizing assumes a work-group of 256 work-items; confirm in the host code.
+    float4 sum;
+
+    const int src_y = 2*y;
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+    // Filter weights broadcast to all four channels: co1 = 6/16 (centre), co2 = 4/16, co3 = 1/16 (outermost taps).
+	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
+	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
+	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+    // srcOffset and dstOffset are not used anywhere in this body -- NOTE(review): confirm callers never pass a non-zero offset.
+    sum = 0;
+    // Vertical pass at column x over rows src_y-2 .. src_y+2; the row offset is srcStep / 4 because the pointer is stepped in 4-byte char4 units.
+	sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)];
+	sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)];
+    // Stored at slot 2 + local id, leaving slots 0 and 1 for the left halo.
+	smem[2 + get_local_id(0)] = sum;
+    // The first two work-items also compute the left halo (columns x - 2).
+	if (get_local_id(0) < 2)
+	{
+		const int left_x = x - 2;
+
+		sum = 0;
+
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
+
+		smem[get_local_id(0)] = sum;
+	}
+    // Work-items with local id above 253 compute the right halo (columns x + 2); the threshold matches the 256-entry tile.
+	if (get_local_id(0) > 253)
+	{
+		const int right_x = x + 2;
+
+		sum = 0;
+
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
+
+		smem[4 + get_local_id(0)] = sum;
+	}
+    // Every column sum must be in local memory before the horizontal pass reads its neighbours.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Horizontal pass: work-items 0..127 each combine five neighbouring column sums centred on even tile column tid2.
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2;
+
+        sum = 0;
+
+        sum = sum + co3 * smem[2 + tid2 - 2];
+        sum = sum + co2   * smem[2 + tid2 - 1];
+        sum = sum + co1  * smem[2 + tid2    ];
+        sum = sum + co2   * smem[2 + tid2 + 1];
+        sum = sum + co3 * smem[2 + tid2 + 2];
+        // Source column of the tile's even element, halved to give the destination column.
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+
+        if (dst_x < dstCols)
+            dst[y * dstStep / 16 + dst_x] = sum; // dstStep / 16: row stride in float4 elements (dstStep presumably in bytes); no rounding for float output
+    }
+}
--- a/modules/ocl/src/kernels/pyr_up.cl
+++ b/modules/ocl/src/kernels/pyr_up.cl
@@ -0,0 +1,750 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Chunpeng	chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+// Converts a filtered float value to uchar, rounding to nearest (ties to even)
+// and saturating to the uchar range.
+// The parameter is float on purpose: the callers pass float components, and
+// narrowing to uchar before the range check would make saturation impossible.
+uchar get_valid_uchar(float data)
+{
+	return convert_uchar_sat_rte(data);
+}
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_8UC1  //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	// Pyramid up-sampling for single-channel 8-bit images. Each work-item
+	// produces one destination pixel with a separable 5-tap filter
+	// (0.0625, 0.25, 0.375, 0.25, 0.0625) and scales the result by 4.
+	//
+	//   src, dst          source / destination pixels
+	//   srcRows, srcCols  source size, used to clamp the patch reads
+	//   dstRows, dstCols  destination size, used to guard the final store
+	//   srcStep, dstStep  row stride, used directly as a uchar index stride
+	//   srcOffset, dstOffset  accepted but not used for addressing
+	//
+	// NOTE(review): the tile sizes and the `ly > 13` test only line up for a
+	// 16x16 work-group -- confirm against the host launch code.
+	const int lx = get_local_id(0);
+	const int ly = get_local_id(1);
+	const int gx = get_global_id(0);
+	const int gy = get_global_id(1);
+
+	__local float srcTile[10][10];   // source patch with a 1-pixel border
+	__local float rowPass[20][16];   // horizontally filtered rows, 2-row border
+
+	// Stage 1: the first 10x10 work-items load the source patch.
+	if (lx < 10 && ly < 10)
+	{
+		int sx = (int)(get_group_id(0) * get_local_size(0) / 2 + lx) - 1;
+		int sy = (int)(get_group_id(1) * get_local_size(1) / 2 + ly) - 1;
+
+		// Mirror negative coordinates, then clamp to the last column/row.
+		sx = min(srcCols - 1, (int)abs(sx));
+		sy = min(srcRows - 1, (int)abs(sy));
+
+		srcTile[ly][lx] = (float)src[sx + sy * srcStep];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 2: horizontal pass. Even columns use the outer/centre taps, odd
+	// columns use the inner taps; the unused taps are multiplied by 0.
+	const int oddCol  = lx & 1;
+	const int evenCol = oddCol ^ 1;
+	const bool evenRow = (ly & 1) == 0;
+
+	const float wOuter  = evenCol * 0.0625f;
+	const float wInner  = oddCol * 0.25f;
+	const float wCentre = evenCol * 0.375f;
+
+	const int colL2 = 1 + ((lx - 2) >> 1);
+	const int colL1 = 1 + ((lx - 1) >> 1);
+	const int colC  = 1 + (lx >> 1);
+	const int colR1 = 1 + ((lx + 1) >> 1);
+	const int colR2 = 1 + ((lx + 2) >> 1);
+
+	// Rows with an odd local y store 0 in the intermediate buffer.
+	float h = 0;
+	if (evenRow)
+	{
+		const int r = 1 + (ly >> 1);
+		h = h + wOuter  * srcTile[r][colL2];
+		h = h + wInner  * srcTile[r][colL1];
+		h = h + wCentre * srcTile[r][colC];
+		h = h + wInner  * srcTile[r][colR1];
+		h = h + wOuter  * srcTile[r][colR2];
+	}
+	rowPass[2 + ly][lx] = h;
+
+	// Top border rows of the intermediate buffer come from source row 0.
+	if (ly < 2)
+	{
+		h = 0;
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[0][colL2];
+			h = h + wInner  * srcTile[0][colL1];
+			h = h + wCentre * srcTile[0][colC];
+			h = h + wInner  * srcTile[0][colR1];
+			h = h + wOuter  * srcTile[0][colR2];
+		}
+		rowPass[ly][lx] = h;
+	}
+
+	// Bottom border rows of the intermediate buffer come from source row 9.
+	if (ly > 13)
+	{
+		h = 0;
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[9][colL2];
+			h = h + wInner  * srcTile[9][colL1];
+			h = h + wCentre * srcTile[9][colC];
+			h = h + wInner  * srcTile[9][colR1];
+			h = h + wOuter  * srcTile[9][colR2];
+		}
+		rowPass[4 + ly][lx] = h;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 3: vertical pass over the five intermediate rows centred on
+	// row 2 + ly.
+	float v = 0;
+	v = v + 0.0625f * rowPass[ly    ][lx];
+	v = v + 0.25f   * rowPass[ly + 1][lx];
+	v = v + 0.375f  * rowPass[ly + 2][lx];
+	v = v + 0.25f   * rowPass[ly + 3][lx];
+	v = v + 0.0625f * rowPass[ly + 4][lx];
+
+	// Round and saturate explicitly: a plain float -> uchar assignment
+	// truncates and gives no guarantee for out-of-range values.
+	if ((gx < dstCols) && (gy < dstRows))
+		dst[gx + gy * dstStep] = convert_uchar_sat_rte(4.0f * v);
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16UC1  /////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	// Pyramid up-sampling for single-channel 16-bit unsigned images. Each
+	// work-item produces one destination pixel with a separable 5-tap filter
+	// (0.0625, 0.25, 0.375, 0.25, 0.0625) and scales the result by 4.
+	//
+	//   src, dst          source / destination pixels
+	//   srcRows, srcCols  source size, used to clamp the patch reads
+	//   dstRows, dstCols  destination size, used to guard the final store
+	//   srcStep, dstStep  row stride; halved before indexing ushort elements,
+	//                     so presumably given in bytes -- confirm with caller
+	//   srcOffset, dstOffset  accepted but not used for addressing
+	//
+	// NOTE(review): the tile sizes and the `ly > 13` test only line up for a
+	// 16x16 work-group -- confirm against the host launch code.
+	const int lx = get_local_id(0);
+	const int ly = get_local_id(1);
+	const int gx = get_global_id(0);
+	const int gy = get_global_id(1);
+
+	const int srcPitch = srcStep >> 1;
+	const int dstPitch = dstStep >> 1;
+
+	__local float srcTile[10][10];   // source patch with a 1-pixel border
+	__local float rowPass[20][16];   // horizontally filtered rows, 2-row border
+
+	// Stage 1: the first 10x10 work-items load the source patch.
+	if (lx < 10 && ly < 10)
+	{
+		int sx = (int)(get_group_id(0) * get_local_size(0) / 2 + lx) - 1;
+		int sy = (int)(get_group_id(1) * get_local_size(1) / 2 + ly) - 1;
+
+		// Mirror negative coordinates, then clamp to the last column/row.
+		sx = min(srcCols - 1, (int)abs(sx));
+		sy = min(srcRows - 1, (int)abs(sy));
+
+		srcTile[ly][lx] = (float)src[sx + sy * srcPitch];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 2: horizontal pass. Even columns use the outer/centre taps, odd
+	// columns use the inner taps; the unused taps are multiplied by 0.
+	const int oddCol  = lx & 1;
+	const int evenCol = oddCol ^ 1;
+	const bool evenRow = (ly & 1) == 0;
+
+	const float wOuter  = evenCol * 0.0625f;
+	const float wInner  = oddCol * 0.25f;
+	const float wCentre = evenCol * 0.375f;
+
+	const int colL2 = 1 + ((lx - 2) >> 1);
+	const int colL1 = 1 + ((lx - 1) >> 1);
+	const int colC  = 1 + (lx >> 1);
+	const int colR1 = 1 + ((lx + 1) >> 1);
+	const int colR2 = 1 + ((lx + 2) >> 1);
+
+	// Rows with an odd local y store 0 in the intermediate buffer.
+	float h = 0;
+	if (evenRow)
+	{
+		const int r = 1 + (ly >> 1);
+		h = h + wOuter  * srcTile[r][colL2];
+		h = h + wInner  * srcTile[r][colL1];
+		h = h + wCentre * srcTile[r][colC];
+		h = h + wInner  * srcTile[r][colR1];
+		h = h + wOuter  * srcTile[r][colR2];
+	}
+	rowPass[2 + ly][lx] = h;
+
+	// Top border rows of the intermediate buffer come from source row 0.
+	if (ly < 2)
+	{
+		h = 0;
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[0][colL2];
+			h = h + wInner  * srcTile[0][colL1];
+			h = h + wCentre * srcTile[0][colC];
+			h = h + wInner  * srcTile[0][colR1];
+			h = h + wOuter  * srcTile[0][colR2];
+		}
+		rowPass[ly][lx] = h;
+	}
+
+	// Bottom border rows of the intermediate buffer come from source row 9.
+	if (ly > 13)
+	{
+		h = 0;
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[9][colL2];
+			h = h + wInner  * srcTile[9][colL1];
+			h = h + wCentre * srcTile[9][colC];
+			h = h + wInner  * srcTile[9][colR1];
+			h = h + wOuter  * srcTile[9][colR2];
+		}
+		rowPass[4 + ly][lx] = h;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 3: vertical pass over the five intermediate rows centred on
+	// row 2 + ly.
+	float v = 0;
+	v = v + 0.0625f * rowPass[ly    ][lx];
+	v = v + 0.25f   * rowPass[ly + 1][lx];
+	v = v + 0.375f  * rowPass[ly + 2][lx];
+	v = v + 0.25f   * rowPass[ly + 3][lx];
+	v = v + 0.0625f * rowPass[ly + 4][lx];
+
+	// Round and saturate explicitly: a plain float -> ushort assignment
+	// truncates and gives no guarantee for out-of-range values.
+	if ((gx < dstCols) && (gy < dstRows))
+		dst[gx + gy * dstPitch] = convert_ushort_sat_rte(4.0f * v);
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_32FC1  /////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	// Pyramid up-sampling for single-channel float images. Each work-item
+	// produces one destination pixel with a separable 5-tap filter
+	// (0.0625, 0.25, 0.375, 0.25, 0.0625) and scales the result by 4.
+	//
+	//   src, dst          source / destination pixels
+	//   srcRows, srcCols  source size, used to clamp the patch reads
+	//   dstRows, dstCols  destination size, used to guard the final store
+	//   srcStep, dstStep  row stride; divided by 4 before indexing float
+	//                     elements, so presumably given in bytes -- confirm
+	//   srcOffset, dstOffset  accepted but not used for addressing
+	//
+	// NOTE(review): the tile sizes and the `ly > 13` test only line up for a
+	// 16x16 work-group -- confirm against the host launch code.
+	const int lx = get_local_id(0);
+	const int ly = get_local_id(1);
+	const int gx = get_global_id(0);
+	const int gy = get_global_id(1);
+
+	const int srcPitch = srcStep >> 2;
+	const int dstPitch = dstStep >> 2;
+
+	__local float srcTile[10][10];   // source patch with a 1-pixel border
+	__local float rowPass[20][16];   // horizontally filtered rows, 2-row border
+
+	// Stage 1: the first 10x10 work-items load the source patch.
+	if (lx < 10 && ly < 10)
+	{
+		int sx = (int)(get_group_id(0) * get_local_size(0) / 2 + lx) - 1;
+		int sy = (int)(get_group_id(1) * get_local_size(1) / 2 + ly) - 1;
+
+		// Mirror negative coordinates, then clamp to the last column/row.
+		sx = min(srcCols - 1, (int)abs(sx));
+		sy = min(srcRows - 1, (int)abs(sy));
+
+		srcTile[ly][lx] = src[sx + sy * srcPitch];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 2: horizontal pass. Even columns use the outer/centre taps, odd
+	// columns use the inner taps; the unused taps are multiplied by 0.
+	const int oddCol  = lx & 1;
+	const int evenCol = oddCol ^ 1;
+	const bool evenRow = (ly & 1) == 0;
+
+	const float wOuter  = evenCol * 0.0625f;
+	const float wInner  = oddCol * 0.25f;
+	const float wCentre = evenCol * 0.375f;
+
+	const int colL2 = 1 + ((lx - 2) >> 1);
+	const int colL1 = 1 + ((lx - 1) >> 1);
+	const int colC  = 1 + (lx >> 1);
+	const int colR1 = 1 + ((lx + 1) >> 1);
+	const int colR2 = 1 + ((lx + 2) >> 1);
+
+	// Rows with an odd local y store 0 in the intermediate buffer.
+	float h = 0;
+	if (evenRow)
+	{
+		const int r = 1 + (ly >> 1);
+		h = h + wOuter  * srcTile[r][colL2];
+		h = h + wInner  * srcTile[r][colL1];
+		h = h + wCentre * srcTile[r][colC];
+		h = h + wInner  * srcTile[r][colR1];
+		h = h + wOuter  * srcTile[r][colR2];
+	}
+	rowPass[2 + ly][lx] = h;
+
+	// Top border rows of the intermediate buffer come from source row 0.
+	if (ly < 2)
+	{
+		h = 0;
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[0][colL2];
+			h = h + wInner  * srcTile[0][colL1];
+			h = h + wCentre * srcTile[0][colC];
+			h = h + wInner  * srcTile[0][colR1];
+			h = h + wOuter  * srcTile[0][colR2];
+		}
+		rowPass[ly][lx] = h;
+	}
+
+	// Bottom border rows of the intermediate buffer come from source row 9.
+	if (ly > 13)
+	{
+		h = 0;
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[9][colL2];
+			h = h + wInner  * srcTile[9][colL1];
+			h = h + wCentre * srcTile[9][colC];
+			h = h + wInner  * srcTile[9][colR1];
+			h = h + wOuter  * srcTile[9][colR2];
+		}
+		rowPass[4 + ly][lx] = h;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 3: vertical pass over the five intermediate rows centred on
+	// row 2 + ly.
+	float v = 0;
+	v = v + 0.0625f * rowPass[ly    ][lx];
+	v = v + 0.25f   * rowPass[ly + 1][lx];
+	v = v + 0.375f  * rowPass[ly + 2][lx];
+	v = v + 0.25f   * rowPass[ly + 3][lx];
+	v = v + 0.0625f * rowPass[ly + 4][lx];
+
+	if ((gx < dstCols) && (gy < dstRows))
+		dst[gx + gy * dstPitch] = 4.0f * v;
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_8UC4  //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+// Widens the four uchar components of `data` to float, component by component.
+float4 covert_uchar4_to_float4(uchar4 data)
+{
+	return convert_float4(data);
+}
+
+
+// Narrows each float component of `data` to uchar through get_valid_uchar().
+uchar4 convert_float4_to_uchar4(float4 data)
+{
+	return (uchar4)(get_valid_uchar(data.x),
+	                get_valid_uchar(data.y),
+	                get_valid_uchar(data.z),
+	                get_valid_uchar(data.w));
+}
+
+// Scales every component of `rightOpr` by the integer `leftOpr`
+// (the integer is converted to float before the multiplication).
+float4 int_x_float4(int leftOpr,float4 rightOpr)
+{
+	return rightOpr * (float)leftOpr;
+}
+
+// Component-wise product of two float4 vectors.
+float4 float4_x_float4(float4 leftOpr,float4 rightOpr)
+{
+	return leftOpr * rightOpr;
+}
+
+__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	// Pyramid up-sampling for 4-channel 8-bit images. Each work-item produces
+	// one destination pixel with a separable 5-tap filter
+	// (0.0625, 0.25, 0.375, 0.25, 0.0625) applied to all four channels and
+	// scales the result by 4.
+	//
+	//   src, dst          source / destination pixels
+	//   srcRows, srcCols  source size, used to clamp the patch reads
+	//   dstRows, dstCols  destination size, used to guard the final store
+	//   srcStep, dstStep  row stride; divided by 4 before indexing uchar4
+	//                     elements, so presumably given in bytes -- confirm
+	//   srcOffset, dstOffset  accepted but not used for addressing
+	//
+	// NOTE(review): the tile sizes and the `ly > 13` test only line up for a
+	// 16x16 work-group -- confirm against the host launch code.
+	const int lx = get_local_id(0);
+	const int ly = get_local_id(1);
+	const int gx = get_global_id(0);
+	const int gy = get_global_id(1);
+
+	const int srcPitch = srcStep >> 2;
+	const int dstPitch = dstStep >> 2;
+
+	__local float4 srcTile[10][10];   // source patch with a 1-pixel border
+	__local float4 rowPass[20][16];   // horizontally filtered rows, 2-row border
+
+	// Stage 1: the first 10x10 work-items load the source patch.
+	if (lx < 10 && ly < 10)
+	{
+		int sx = (int)(get_group_id(0) * get_local_size(0) / 2 + lx) - 1;
+		int sy = (int)(get_group_id(1) * get_local_size(1) / 2 + ly) - 1;
+
+		// Mirror negative coordinates, then clamp to the last column/row.
+		sx = min(srcCols - 1, (int)abs(sx));
+		sy = min(srcRows - 1, (int)abs(sy));
+
+		srcTile[ly][lx] = covert_uchar4_to_float4(src[sx + sy * srcPitch]);
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 2: horizontal pass. Even columns use the outer/centre taps, odd
+	// columns use the inner taps; the unused taps are multiplied by 0.
+	const int oddCol  = lx & 1;
+	const int evenCol = oddCol ^ 1;
+	const bool evenRow = (ly & 1) == 0;
+
+	const float4 tapCentre = (float4)(0.375f);
+	const float4 tapInner  = (float4)(0.25f);
+	const float4 tapOuter  = (float4)(0.0625f);
+
+	const float4 wOuter  = tapOuter  * (float)evenCol;
+	const float4 wInner  = tapInner  * (float)oddCol;
+	const float4 wCentre = tapCentre * (float)evenCol;
+
+	const int colL2 = 1 + ((lx - 2) >> 1);
+	const int colL1 = 1 + ((lx - 1) >> 1);
+	const int colC  = 1 + (lx >> 1);
+	const int colR1 = 1 + ((lx + 1) >> 1);
+	const int colR2 = 1 + ((lx + 2) >> 1);
+
+	// Rows with an odd local y store 0 in the intermediate buffer.
+	float4 h = (float4)(0.0f);
+	if (evenRow)
+	{
+		const int r = 1 + (ly >> 1);
+		h = h + wOuter  * srcTile[r][colL2];
+		h = h + wInner  * srcTile[r][colL1];
+		h = h + wCentre * srcTile[r][colC];
+		h = h + wInner  * srcTile[r][colR1];
+		h = h + wOuter  * srcTile[r][colR2];
+	}
+	rowPass[2 + ly][lx] = h;
+
+	// Top border rows of the intermediate buffer come from source row 0.
+	if (ly < 2)
+	{
+		h = (float4)(0.0f);
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[0][colL2];
+			h = h + wInner  * srcTile[0][colL1];
+			h = h + wCentre * srcTile[0][colC];
+			h = h + wInner  * srcTile[0][colR1];
+			h = h + wOuter  * srcTile[0][colR2];
+		}
+		rowPass[ly][lx] = h;
+	}
+
+	// Bottom border rows of the intermediate buffer come from source row 9.
+	if (ly > 13)
+	{
+		h = (float4)(0.0f);
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[9][colL2];
+			h = h + wInner  * srcTile[9][colL1];
+			h = h + wCentre * srcTile[9][colC];
+			h = h + wInner  * srcTile[9][colR1];
+			h = h + wOuter  * srcTile[9][colR2];
+		}
+		rowPass[4 + ly][lx] = h;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 3: vertical pass over the five intermediate rows centred on
+	// row 2 + ly.
+	float4 v = (float4)(0.0f);
+	v = v + tapOuter  * rowPass[ly    ][lx];
+	v = v + tapInner  * rowPass[ly + 1][lx];
+	v = v + tapCentre * rowPass[ly + 2][lx];
+	v = v + tapInner  * rowPass[ly + 3][lx];
+	v = v + tapOuter  * rowPass[ly + 4][lx];
+
+	if ((gx < dstCols) && (gy < dstRows))
+	{
+		dst[gx + gy * dstPitch] = convert_float4_to_uchar4(v * 4.0f);
+	}
+}
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16UC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+// Widens the four ushort components of `data` to float, component by component.
+float4 covert_ushort4_to_float4(ushort4 data)
+{
+	return convert_float4(data);
+}
+
+
+// Narrows each float component of `data` to ushort, rounding to nearest
+// (ties to even) and saturating to the ushort range.
+// The explicit conversion is needed because a plain float -> ushort
+// assignment truncates and gives no guarantee for out-of-range values.
+ushort4 convert_float4_to_ushort4(float4 data)
+{
+	return convert_ushort4_sat_rte(data);
+}
+
+
+__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	// Pyramid up-sampling for 4-channel 16-bit unsigned images. Each
+	// work-item produces one destination pixel with a separable 5-tap filter
+	// (0.0625, 0.25, 0.375, 0.25, 0.0625) applied to all four channels and
+	// scales the result by 4.
+	//
+	//   src, dst          source / destination pixels
+	//   srcRows, srcCols  source size, used to clamp the patch reads
+	//   dstRows, dstCols  destination size, used to guard the final store
+	//   srcStep, dstStep  row stride; divided by 8 before indexing ushort4
+	//                     elements, so presumably given in bytes -- confirm
+	//   srcOffset, dstOffset  accepted but not used for addressing
+	//
+	// NOTE(review): the tile sizes and the `ly > 13` test only line up for a
+	// 16x16 work-group -- confirm against the host launch code.
+	const int lx = get_local_id(0);
+	const int ly = get_local_id(1);
+	const int gx = get_global_id(0);
+	const int gy = get_global_id(1);
+
+	const int srcPitch = srcStep >> 3;
+	const int dstPitch = dstStep >> 3;
+
+	__local float4 srcTile[10][10];   // source patch with a 1-pixel border
+	__local float4 rowPass[20][16];   // horizontally filtered rows, 2-row border
+
+	// Stage 1: the first 10x10 work-items load the source patch.
+	if (lx < 10 && ly < 10)
+	{
+		int sx = (int)(get_group_id(0) * get_local_size(0) / 2 + lx) - 1;
+		int sy = (int)(get_group_id(1) * get_local_size(1) / 2 + ly) - 1;
+
+		// Mirror negative coordinates, then clamp to the last column/row.
+		sx = min(srcCols - 1, (int)abs(sx));
+		sy = min(srcRows - 1, (int)abs(sy));
+
+		srcTile[ly][lx] = covert_ushort4_to_float4(src[sx + sy * srcPitch]);
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 2: horizontal pass. Even columns use the outer/centre taps, odd
+	// columns use the inner taps; the unused taps are multiplied by 0.
+	const int oddCol  = lx & 1;
+	const int evenCol = oddCol ^ 1;
+	const bool evenRow = (ly & 1) == 0;
+
+	const float4 tapCentre = (float4)(0.375f);
+	const float4 tapInner  = (float4)(0.25f);
+	const float4 tapOuter  = (float4)(0.0625f);
+
+	const float4 wOuter  = tapOuter  * (float)evenCol;
+	const float4 wInner  = tapInner  * (float)oddCol;
+	const float4 wCentre = tapCentre * (float)evenCol;
+
+	const int colL2 = 1 + ((lx - 2) >> 1);
+	const int colL1 = 1 + ((lx - 1) >> 1);
+	const int colC  = 1 + (lx >> 1);
+	const int colR1 = 1 + ((lx + 1) >> 1);
+	const int colR2 = 1 + ((lx + 2) >> 1);
+
+	// Rows with an odd local y store 0 in the intermediate buffer.
+	float4 h = (float4)(0.0f);
+	if (evenRow)
+	{
+		const int r = 1 + (ly >> 1);
+		h = h + wOuter  * srcTile[r][colL2];
+		h = h + wInner  * srcTile[r][colL1];
+		h = h + wCentre * srcTile[r][colC];
+		h = h + wInner  * srcTile[r][colR1];
+		h = h + wOuter  * srcTile[r][colR2];
+	}
+	rowPass[2 + ly][lx] = h;
+
+	// Top border rows of the intermediate buffer come from source row 0.
+	if (ly < 2)
+	{
+		h = (float4)(0.0f);
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[0][colL2];
+			h = h + wInner  * srcTile[0][colL1];
+			h = h + wCentre * srcTile[0][colC];
+			h = h + wInner  * srcTile[0][colR1];
+			h = h + wOuter  * srcTile[0][colR2];
+		}
+		rowPass[ly][lx] = h;
+	}
+
+	// Bottom border rows of the intermediate buffer come from source row 9.
+	if (ly > 13)
+	{
+		h = (float4)(0.0f);
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[9][colL2];
+			h = h + wInner  * srcTile[9][colL1];
+			h = h + wCentre * srcTile[9][colC];
+			h = h + wInner  * srcTile[9][colR1];
+			h = h + wOuter  * srcTile[9][colR2];
+		}
+		rowPass[4 + ly][lx] = h;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 3: vertical pass over the five intermediate rows centred on
+	// row 2 + ly.
+	float4 v = (float4)(0.0f);
+	v = v + tapOuter  * rowPass[ly    ][lx];
+	v = v + tapInner  * rowPass[ly + 1][lx];
+	v = v + tapCentre * rowPass[ly + 2][lx];
+	v = v + tapInner  * rowPass[ly + 3][lx];
+	v = v + tapOuter  * rowPass[ly + 4][lx];
+
+	if ((gx < dstCols) && (gy < dstRows))
+	{
+		dst[gx + gy * dstPitch] = convert_float4_to_ushort4(v * 4.0f);
+	}
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_32FC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+__kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
+				int srcRows,int dstRows,int srcCols,int dstCols,
+				int srcOffset,int dstOffset,int srcStep,int dstStep)
+{
+	// Pyramid up-sampling for 4-channel float images. Each work-item produces
+	// one destination pixel with a separable 5-tap filter
+	// (0.0625, 0.25, 0.375, 0.25, 0.0625) applied to all four channels and
+	// scales the result by 4.
+	//
+	//   src, dst          source / destination pixels
+	//   srcRows, srcCols  source size, used to clamp the patch reads
+	//   dstRows, dstCols  destination size, used to guard the final store
+	//   srcStep, dstStep  row stride; divided by 16 before indexing float4
+	//                     elements, so presumably given in bytes -- confirm
+	//   srcOffset, dstOffset  accepted but not used for addressing
+	//
+	// NOTE(review): the tile sizes and the `ly > 13` test only line up for a
+	// 16x16 work-group -- confirm against the host launch code.
+	const int lx = get_local_id(0);
+	const int ly = get_local_id(1);
+	const int gx = get_global_id(0);
+	const int gy = get_global_id(1);
+
+	const int srcPitch = srcStep >> 4;
+	const int dstPitch = dstStep >> 4;
+
+	__local float4 srcTile[10][10];   // source patch with a 1-pixel border
+	__local float4 rowPass[20][16];   // horizontally filtered rows, 2-row border
+
+	// Stage 1: the first 10x10 work-items load the source patch.
+	if (lx < 10 && ly < 10)
+	{
+		int sx = (int)(get_group_id(0) * get_local_size(0) / 2 + lx) - 1;
+		int sy = (int)(get_group_id(1) * get_local_size(1) / 2 + ly) - 1;
+
+		// Mirror negative coordinates, then clamp to the last column/row.
+		sx = min(srcCols - 1, (int)abs(sx));
+		sy = min(srcRows - 1, (int)abs(sy));
+
+		srcTile[ly][lx] = src[sx + sy * srcPitch];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 2: horizontal pass. Even columns use the outer/centre taps, odd
+	// columns use the inner taps; the unused taps are multiplied by 0.
+	const int oddCol  = lx & 1;
+	const int evenCol = oddCol ^ 1;
+	const bool evenRow = (ly & 1) == 0;
+
+	const float4 tapCentre = (float4)(0.375f);
+	const float4 tapInner  = (float4)(0.25f);
+	const float4 tapOuter  = (float4)(0.0625f);
+
+	const float4 wOuter  = tapOuter  * (float)evenCol;
+	const float4 wInner  = tapInner  * (float)oddCol;
+	const float4 wCentre = tapCentre * (float)evenCol;
+
+	const int colL2 = 1 + ((lx - 2) >> 1);
+	const int colL1 = 1 + ((lx - 1) >> 1);
+	const int colC  = 1 + (lx >> 1);
+	const int colR1 = 1 + ((lx + 1) >> 1);
+	const int colR2 = 1 + ((lx + 2) >> 1);
+
+	// Rows with an odd local y store 0 in the intermediate buffer.
+	float4 h = (float4)(0.0f);
+	if (evenRow)
+	{
+		const int r = 1 + (ly >> 1);
+		h = h + wOuter  * srcTile[r][colL2];
+		h = h + wInner  * srcTile[r][colL1];
+		h = h + wCentre * srcTile[r][colC];
+		h = h + wInner  * srcTile[r][colR1];
+		h = h + wOuter  * srcTile[r][colR2];
+	}
+	rowPass[2 + ly][lx] = h;
+
+	// Top border rows of the intermediate buffer come from source row 0.
+	if (ly < 2)
+	{
+		h = (float4)(0.0f);
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[0][colL2];
+			h = h + wInner  * srcTile[0][colL1];
+			h = h + wCentre * srcTile[0][colC];
+			h = h + wInner  * srcTile[0][colR1];
+			h = h + wOuter  * srcTile[0][colR2];
+		}
+		rowPass[ly][lx] = h;
+	}
+
+	// Bottom border rows of the intermediate buffer come from source row 9.
+	if (ly > 13)
+	{
+		h = (float4)(0.0f);
+		if (evenRow)
+		{
+			h = h + wOuter  * srcTile[9][colL2];
+			h = h + wInner  * srcTile[9][colL1];
+			h = h + wCentre * srcTile[9][colC];
+			h = h + wInner  * srcTile[9][colR1];
+			h = h + wOuter  * srcTile[9][colR2];
+		}
+		rowPass[4 + ly][lx] = h;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Stage 3: vertical pass over the five intermediate rows centred on
+	// row 2 + ly.
+	float4 v = (float4)(0.0f);
+	v = v + tapOuter  * rowPass[ly    ][lx];
+	v = v + tapInner  * rowPass[ly + 1][lx];
+	v = v + tapCentre * rowPass[ly + 2][lx];
+	v = v + tapInner  * rowPass[ly + 3][lx];
+	v = v + tapOuter  * rowPass[ly + 4][lx];
+
+	if ((gx < dstCols) && (gy < dstRows))
+	{
+		dst[gx + gy * dstPitch] = 4.0f * v;
+	}
+}
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@@ -0,0 +1,560 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include <iomanip>
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#define EXT_FP64 0
+
+#if !defined (HAVE_OPENCL)
+// Stubs for builds without OpenCL. Their parameter lists mirror the two
+// matchTemplate overloads that are defined when HAVE_OPENCL is set.
+void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&, int) { throw_nogpu(); }
+void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&, int, MatchTemplateBuf&) { throw_nogpu(); }
+#else
+//helper routines
+namespace cv
+{
+	namespace ocl
+	{
+		///////////////////////////OpenCL kernel strings///////////////////////////
+		extern const char *match_template;
+	}
+}
+
+namespace cv { namespace ocl
+{
+	void matchTemplate_SQDIFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_SQDIFF_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_CCORR(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_CCORR_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_CCOFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+	void matchTemplate_CCOFF_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
+
+
+	void matchTemplateNaive_SQDIFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, int cn);
+
+	void matchTemplateNaive_CCORR(
+		const oclMat& image, const oclMat& templ, oclMat& result, int cn);
+
+	// Returns the template-area threshold for the given method and depth.
+	// Templates whose area is below the threshold are matched with the naive
+	// kernels; larger ones are meant for an FFT-based version (if available).
+	// Raises CV_StsBadArg for any method/depth pair not listed here.
+	int getTemplateThreshold(int method, int depth)
+	{
+		const bool is8u  = (depth == CV_8U);
+		const bool is32f = (depth == CV_32F);
+
+		if (method == CV_TM_CCORR)
+		{
+			if (is32f) return 250;
+			if (is8u)  return 300;
+		}
+		else if (method == CV_TM_SQDIFF)
+		{
+			if (is32f) return MAXSHORT; // do naive SQDIFF for CV_32F
+			if (is8u)  return 300;
+		}
+
+		CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
+		return 0;
+	}
+
+
+	//////////////////////////////////////////////////////////////////////
+	// SQDIFF
+	// Squared-difference matching. `result` is (re)allocated as CV_32F with one
+	// value per valid template placement. `buf` is not used by this variant.
+	void matchTemplate_SQDIFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		const int resultRows = image.rows - templ.rows + 1;
+		const int resultCols = image.cols - templ.cols + 1;
+		result.create(resultRows, resultCols, CV_32F);
+
+		const int naiveLimit = getTemplateThreshold(CV_TM_SQDIFF, image.depth());
+		if (templ.size().area() >= naiveLimit)
+		{
+			// TODO
+			CV_Error(CV_StsBadArg, "Not supported yet for this size template");
+			return;
+		}
+
+		// Small enough template: use the naive kernel.
+		matchTemplateNaive_SQDIFF(image, templ, result, image.channels());
+	}
+
+	// Normalized squared-difference matching.
+	// Runs matchTemplate_CCORR into `result`, builds the integral images of
+	// `image` into `buf`, computes the template's sum of squares on the host and
+	// launches the "matchTemplate_Prepared_SQDIFF_NORMED" kernel on `result`.
+	void matchTemplate_SQDIFF_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		matchTemplate_CCORR(image,templ,result,buf);
+		buf.image_sums.resize(1);
+		buf.image_sqsums.resize(1);
+
+		integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
+
+#if EXT_FP64 && SQRSUM_FIXED
+		unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
+#else
+		// Mat::dot accumulates in double. Evaluating m.mul(m) instead produces the
+		// squares in the template's own depth, so for CV_8U every square above
+		// 255 would be saturated before being summed.
+		Mat templ_mat = templ.reshape(1);
+		unsigned long long templ_sqsum = (unsigned long long)templ_mat.dot(templ_mat);
+#endif
+
+		Context *clCxt = image.clCxt;
+		string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
+		vector< pair<size_t, const void *> > args;
+
+		// NOTE(review): this passes image_sums[0], whereas matchTemplate_CCORR_NORMED
+		// passes image_sqsums[0] in the same position -- confirm against the kernel.
+		// The push order below must match the kernel's parameter order.
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+		args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+		// One work-item per result element.
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
+	}
+
+	// Launches the "matchTemplate_Naive_SQDIFF" kernel.
+	// image/templ: both CV_8U or both CV_32F, same channel count (1 or 4).
+	// result:      single-channel CV_32F, already sized to
+	//              (image.rows - templ.rows + 1) x (image.cols - templ.cols + 1).
+	// cn:          not used; the channel count is taken from `image`.
+	void matchTemplateNaive_SQDIFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, int cn)
+	{
+		// The depth alternatives are parenthesised as a group so that the result
+		// depth is checked for both of them; without the extra parentheses `&&`
+		// binds tighter than `||` and the CV_8U case skips the result check.
+		CV_Assert(((image.depth() == CV_8U && templ.depth() == CV_8U )
+			|| (image.depth() == CV_32F && templ.depth() == CV_32F)) && result.depth() == CV_32F);
+		CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
+		CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
+
+		Context *clCxt = image.clCxt;
+		string kernelName = "matchTemplate_Naive_SQDIFF";
+
+		vector< pair<size_t, const void *> > args;
+
+		// The push order below must match the kernel's parameter order.
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+		// One work-item per result element.
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+	}
+
+	//////////////////////////////////////////////////////////////////////
+	// CCORR
+	// Plain cross-correlation. `result` is (re)allocated as CV_32F with one value
+	// per valid template placement. Only the naive kernel path is implemented;
+	// templates at or above the area threshold raise CV_StsBadArg.
+	// `buf` is not used by this variant.
+	void matchTemplate_CCORR(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+
+		// NOTE(review): the threshold is looked up for CV_TM_SQDIFF, not CV_TM_CCORR.
+		// Kept as is, because the CCORR threshold for CV_32F is lower (250) and
+		// would send more templates to the unsupported branch -- confirm the intent.
+		if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+		{
+			matchTemplateNaive_CCORR(image, templ, result, image.channels());
+			return;
+		}
+
+		// TODO: non-naive path for large templates. CV_Error does not return, so
+		// nothing may follow it; the filter2D draft that used to sit here was
+		// unreachable and built its result ROI with rows and cols swapped.
+		CV_Error(CV_StsBadArg, "Not supported yet for this size template");
+	}
+
+	// Normalized cross-correlation.
+	// Runs matchTemplate_CCORR into `result`, builds the integral images of
+	// `image` into `buf`, computes the template's sum of squares and launches
+	// the "normalizeKernel" kernel on `result`.
+	void matchTemplate_CCORR_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		matchTemplate_CCORR(image,templ,result,buf);
+		buf.image_sums.resize(1);
+		buf.image_sqsums.resize(1);
+
+		integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
+#if EXT_FP64 && SQRSUM_FIXED
+		unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
+#elif EXT_FP64
+		oclMat templ_c1 = templ.reshape(1);
+		multiply(templ_c1, templ_c1, templ_c1);
+		unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0];
+#else
+		// Mat::dot accumulates in double. An in-place multiply() keeps the
+		// template's own depth, so for CV_8U every square above 255 would be
+		// saturated before being summed.
+		Mat m_templ_c1 = templ.reshape(1);
+		unsigned long long templ_sqsum = (unsigned long long)m_templ_c1.dot(m_templ_c1);
+#endif
+		Context *clCxt = image.clCxt;
+		string kernelName = "normalizeKernel";
+		vector< pair<size_t, const void *> > args;
+
+		// The push order below must match the kernel's parameter order.
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+		args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+		// One work-item per result element.
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
+	}
+
+	// Launches the "matchTemplate_Naive_CCORR" kernel.
+	// image/templ: both CV_8U or both CV_32F, same channel count (1 or 4).
+	// result:      single-channel CV_32F, already sized to
+	//              (image.rows - templ.rows + 1) x (image.cols - templ.cols + 1).
+	// cn:          not used; the channel count is taken from `image`.
+	void matchTemplateNaive_CCORR(
+		const oclMat& image, const oclMat& templ, oclMat& result, int cn)
+	{
+		// The depth alternatives are parenthesised as a group so that the result
+		// depth is checked for both of them; without the extra parentheses `&&`
+		// binds tighter than `||` and the CV_8U case skips the result check.
+		CV_Assert(((image.depth() == CV_8U && templ.depth() == CV_8U )
+			|| (image.depth() == CV_32F && templ.depth() == CV_32F)) && result.depth() == CV_32F);
+		CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
+		CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
+
+		Context *clCxt = image.clCxt;
+		string kernelName = "matchTemplate_Naive_CCORR";
+
+		vector< pair<size_t, const void *> > args;
+
+		// The push order below must match the kernel's parameter order.
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+		// One work-item per result element.
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+	}
+	//////////////////////////////////////////////////////////////////////
+	// CCOFF
+	// Correlation-coefficient matching for CV_8U inputs.
+	// Runs matchTemplate_CCORR into `result`, then launches the
+	// "matchTemplate_Prepared_CCOFF" kernel with the integral image(s) of `image`
+	// and the template sum divided by the template area (per channel).
+	// Only 1- and 4-channel images are handled; other counts raise CV_StsBadArg.
+	void matchTemplate_CCOFF(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
+
+		matchTemplate_CCORR(image,templ,result,buf);
+
+		Context *clCxt = image.clCxt;
+		string kernelName;
+
+		kernelName = "matchTemplate_Prepared_CCOFF";
+		// One work-item per result element.
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+
+		// Arguments common to the 1- and 4-channel cases; the push order must
+		// match the kernel's parameter order.
+		vector< pair<size_t, const void *> > args;
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+		// to be continued in the following section
+		if(image.channels() == 1)
+		{
+			buf.image_sums.resize(1);
+			// FIXME: temp fix for incorrect integral kernel
+			// (the squared-sum output goes to a throw-away matrix)
+			oclMat tmp_oclmat;
+			integral(image, buf.image_sums[0], tmp_oclmat);
+
+			// Template sum divided by its area, i.e. the template mean.
+			float templ_sum = 0;
+#if EXT_FP64
+			templ_sum = (float)sum(templ)[0] / templ.size().area();
+#else
+			// The template is converted to a host Mat and summed there.
+			Mat o_templ = templ;
+			templ_sum = (float)sum(o_templ)[0] / o_templ.size().area(); // temp fix for non-double supported machine
+#endif
+			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
+		}
+		else
+		{
+			// Per-channel template sum divided by the template area.
+			Vec4f templ_sum = Vec4f::all(0);
+#if EXT_FP64
+			split(image,buf.images);
+			templ_sum = sum(templ) / templ.size().area();
+#else 
+			// temp fix for non-double supported machine
+			// Image and template are converted to host Mats; the image is split
+			// there and each channel is wrapped in a new oclMat.
+			Mat o_templ = templ, o_image = image;
+			vector<Mat> o_mat_vector;
+			o_mat_vector.resize(image.channels());
+			buf.images.resize(image.channels());
+			split(o_image, o_mat_vector);
+			for(int i = 0; i < o_mat_vector.size(); i ++)
+			{
+				buf.images[i] = oclMat(o_mat_vector[i]);
+			}
+			templ_sum = sum(o_templ) / templ.size().area();
+#endif
+			buf.image_sums.resize(buf.images.size());
+
+			// One integral image per channel.
+			for(int i = 0; i < image.channels(); i ++)
+			{
+				// FIXME: temp fix for incorrect integral kernel
+				oclMat omat_temp;
+				integral(buf.images[i], buf.image_sums[i], omat_temp);
+			}
+			switch(image.channels())
+			{
+			case 4:
+				// Only the offset/step of channel 0 are passed for all four sums.
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
+				break;
+			default:
+				CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
+				break;
+			}
+		}
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+	}
+
+	// Normalized correlation-coefficient matching.
+	// Correlates float copies of image/template through matchTemplate_CCORR,
+	// then launches "matchTemplate_Prepared_CCOFF_NORMED" with the integral
+	// images of `image`, the template sum scaled by 1/area and the template's
+	// sum of squares minus sum^2/area.
+	// Only 1- and 4-channel images are handled; other counts raise CV_StsBadArg.
+	void matchTemplate_CCOFF_NORMED(
+		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
+	{
+		image.convertTo(buf.imagef, CV_32F);
+		templ.convertTo(buf.templf, CV_32F);
+
+		matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
+		float scale = 1.f/templ.size().area();
+
+		Context *clCxt = image.clCxt;
+		string kernelName;
+
+		kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
+		// One work-item per result element.
+		size_t globalThreads[3] = {result.cols, result.rows, 1};
+		size_t localThreads[3]  = {32, 8, 1};
+
+		// Arguments common to the 1- and 4-channel cases; the push order must
+		// match the kernel's parameter order.
+		vector< pair<size_t, const void *> > args;
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+		args.push_back( make_pair( sizeof(cl_float),(void *)&scale) );
+		// to be continued in the following section
+		if(image.channels() == 1)
+		{
+			buf.image_sums.resize(1);
+			buf.image_sqsums.resize(1);
+			integral(image, buf.image_sums[0], buf.image_sqsums[0]);
+			float templ_sum = 0;
+			float templ_sqsum = 0;
+#if EXT_FP64
+			templ_sum   = (float)sum(templ)[0];
+#if SQRSUM_FIXED
+			templ_sqsum = sqrSum(templ);
+#else
+			oclMat templ_sqr = templ;
+			multiply(templ,templ, templ_sqr);
+			templ_sqsum  = sum(templ_sqr)[0];
+#endif //SQRSUM_FIXED
+			templ_sqsum -= scale * templ_sum * templ_sum;
+			templ_sum   *= scale;
+#else
+			// temp fix for non-double supported machine
+			Mat o_templ = templ;
+			templ_sum   = (float)sum(o_templ)[0];
+			// Mat::dot accumulates the squares in double; o_templ.mul(o_templ)
+			// would saturate each square at 255 for a CV_8U template.
+			templ_sqsum = (float)(o_templ.dot(o_templ) - scale * templ_sum * templ_sum);
+			templ_sum  *= scale;
+#endif
+			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
+			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
+			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
+			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum) );
+		}
+		else
+		{
+			Vec4f templ_sum   = Vec4f::all(0);
+			Vec4f templ_sqsum = Vec4f::all(0);
+#if EXT_FP64
+			split(image,buf.images);
+			templ_sum   = sum(templ);
+#if SQRSUM_FIXED
+			templ_sqsum = sqrSum(templ);
+#else
+			oclMat templ_sqr = templ;
+			multiply(templ,templ, templ_sqr);
+			templ_sqsum  = sum(templ_sqr);
+#endif //SQRSUM_FIXED
+			templ_sqsum -= scale * templ_sum * templ_sum;
+
+#else
+			// temp fix for non-double supported machine
+			Mat o_templ = templ, o_image = image;
+
+			// Square a float copy of the template: squaring in the template's
+			// own depth would saturate each square at 255 for CV_8U input.
+			Mat o_templ_f;
+			o_templ.convertTo(o_templ_f, CV_32F);
+
+			vector<Mat> o_mat_vector;
+			o_mat_vector.resize(image.channels());
+			buf.images.resize(image.channels());
+			split(o_image, o_mat_vector);
+			for(int i = 0; i < (int)o_mat_vector.size(); i ++)
+			{
+				buf.images[i] = oclMat(o_mat_vector[i]);
+			}
+			templ_sum    = sum(o_templ);
+			templ_sqsum  = sum(o_templ_f.mul(o_templ_f));
+#endif
+			// Sum over channels of (sum of squares - sum^2 / area).
+			float templ_sqsum_sum = 0;
+			for(int i = 0; i < image.channels(); i ++)
+			{
+				templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
+			}
+			templ_sum   *= scale;
+			buf.image_sums.resize(buf.images.size());
+			buf.image_sqsums.resize(buf.images.size());
+
+			// One pair of integral images per channel.
+			for(int i = 0; i < image.channels(); i ++)
+			{
+				integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
+			}
+
+			switch(image.channels())
+			{
+			case 4:
+				// Only the offset/step of channel 0 are passed for all four planes.
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
+				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
+				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
+				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum_sum) );
+				break;
+			default:
+				CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
+				break;
+			}
+		}
+		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
+	}
+
+}/*ocl*/} /*cv*/
+
+// Convenience overload: forwards to the buffer-taking overload with a
+// temporary MatchTemplateBuf that lives only for this call.
+void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
+{
+	MatchTemplateBuf scratch;
+	matchTemplate(image, templ, result, method, scratch);
+}
+// Dispatches to the implementation selected by `method`.
+// image/templ must share the same type and the template must fit inside the
+// image. `method` indexes the table below and must be in [0, 6); anything
+// else fails an assertion instead of reading past the end of the table.
+void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf)
+{
+	CV_Assert(image.type() == templ.type());
+	CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
+
+	typedef void (*Caller)(const oclMat&, const oclMat&, oclMat&, MatchTemplateBuf&);
+
+	// Indexed by `method`: SQDIFF, SQDIFF_NORMED, CCORR, CCORR_NORMED, CCOFF, CCOFF_NORMED.
+	const Caller callers[] = { 
+		::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED, 
+		::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED, 
+		::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
+	};
+	const int callersCount = (int)(sizeof(callers) / sizeof(callers[0]));
+
+	// Reject out-of-range method ids before indexing the table.
+	CV_Assert(method >= 0 && method < callersCount);
+
+	Caller caller = callers[method];
+	CV_Assert(caller);
+	caller(image, templ, result, buf);
+}
+#endif //
--- a/modules/ocl/src/pyrdown.cpp
+++ b/modules/ocl/src/pyrdown.cpp
@@ -0,0 +1,115 @@
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+using std::cout;
+using std::endl;
+
+namespace cv
+{
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char *pyr_down;
+
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+///////////////////////////// pyrDown launcher ///////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+// Launches the "pyrDown" kernel that writes the down-sampled `src` into `dst`.
+// `dst` must already be allocated with the same type as `src` and with the
+// rounded-up half size, i.e. ((src.rows + 1) / 2) x ((src.cols + 1) / 2).
+// The template parameter T is not used by the launcher itself.
+template<typename T>
+void pyrdown_run(const oclMat &src, const oclMat &dst)
+{
+    // Same rounding as cv::ocl::pyrDown() uses when it allocates dst; a plain
+    // `src.cols / 2 == dst.cols` check rejects every odd-sized source.
+    CV_Assert((src.cols + 1) / 2 == dst.cols && (src.rows + 1) / 2 == dst.rows);
+
+    CV_Assert(src.type() == dst.type());
+    CV_Assert(src.depth() != CV_8S);
+
+    Context  *clCxt = src.clCxt;
+
+    string kernelName = "pyrDown";
+
+    // The global size spans the source columns and the destination rows.
+    size_t localThreads[3]  = { 256, 1, 1 };
+    size_t globalThreads[3] = { src.cols, dst.rows, 1};
+
+    // The push order below must match the kernel's parameter order.
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
+
+    openCLExecuteKernel(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
+}
+// Picks the pyrdown_run<T> instantiation that corresponds to the depth of
+// `src`. Depths outside CV_8U..CV_64F are silently ignored.
+void pyrdown_run(const oclMat &src, const oclMat &dst)
+{
+	const int depth = src.depth();
+
+	if (depth == CV_8U)
+		pyrdown_run<unsigned char>(src, dst);
+	else if (depth == CV_8S)
+		pyrdown_run<char>(src, dst);
+	else if (depth == CV_16U)
+		pyrdown_run<unsigned short>(src, dst);
+	else if (depth == CV_16S)
+		pyrdown_run<short>(src, dst);
+	else if (depth == CV_32S)
+		pyrdown_run<int>(src, dst);
+	else if (depth == CV_32F)
+		pyrdown_run<float>(src, dst);
+	else if (depth == CV_64F)
+		pyrdown_run<double>(src, dst);
+}
+//////////////////////////////////////////////////////////////////////////////
+// pyrDown
+
+// Down-samples `src` into `dst`. `dst` is (re)allocated with the type of `src`
+// and half its size in each dimension, rounded up.
+// Requires a depth up to CV_32F and at most 4 channels.
+void cv::ocl::pyrDown(const oclMat& src, oclMat& dst)
+{
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+
+    const int halfRows = (src.rows + 1) / 2;
+    const int halfCols = (src.cols + 1) / 2;
+    dst.create(halfRows, halfCols, src.type());
+
+    pyrdown_run(src, dst);
+}
+
--- a/modules/ocl/src/pyrup.cpp
+++ b/modules/ocl/src/pyrup.cpp
@@ -0,0 +1,88 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//		Zhang Chunpeng chunpeng@multicorewareinc.com
+//    
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/* Image pyramid up-sampling (pyrUp) */
+//#define EMU
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#ifndef HAVE_OPENCL
+// Stub for builds without OpenCL; same parameter list as the real pyrUp.
+void cv::ocl::pyrUp(const oclMat&, oclMat&) { throw_nogpu(); }
+#else
+
+namespace cv { namespace ocl 
+{ 
+	extern const char *pyr_up;
+	// Up-samples `src` into `dst` by launching the "pyrUp" kernel.
+	// `dst` is (re)allocated with the type of `src` and twice its rows and cols.
+	void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst)
+	{
+		dst.create(src.rows * 2, src.cols * 2, src.type());
+		Context *clCxt = src.clCxt;
+		
+		const std::string kernelName = "pyrUp";
+  
+		// The push order below must match the kernel's parameter order.
+		std::vector< pair<size_t, const void *> > args;
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
+		
+		// One work-item per destination element, in 16x16 work-groups.
+		size_t globalThreads[3] = {dst.cols, dst.rows, 1};
+		size_t localThreads[3]  = {16, 16, 1};
+	    
+		openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
+	}
+}};
+#endif // HAVE_OPENCL
--- a/modules/ocl/test/test_blend.cpp
+++ b/modules/ocl/test/test_blend.cpp
@@ -0,0 +1,83 @@
+#include "precomp.hpp"
+#include <iomanip>
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+template <typename T>
+void blendLinearGold(const cv::Mat& img1, const cv::Mat& img2, const cv::Mat& weights1, const cv::Mat& weights2, cv::Mat& result_gold)
+{
+    // CPU reference: per-element weighted average of img1 and img2, using one float weight per pixel.
+    result_gold.create(img1.size(), img1.type());
+    const int channels = img1.channels();
+    const int rowLen = img1.cols * channels; // interleaved elements per row
+
+    for (int row = 0; row < img1.rows; ++row)
+    {
+        const T* a = img1.ptr<T>(row);
+        const T* b = img2.ptr<T>(row);
+        const float* wa = weights1.ptr<float>(row);
+        const float* wb = weights2.ptr<float>(row);
+        T* out = result_gold.ptr<T>(row);
+        for (int i = 0; i < rowLen; ++i)
+        {
+            const float w1 = wa[i / channels], w2 = wb[i / channels]; // weights are indexed per pixel, not per channel
+            // The 1e-5f term keeps the division defined when both weights are zero.
+            out[i] = static_cast<T>((a[i] * w1 + b[i] * w2) / (w1 + w2 + 1e-5f));
+        }
+    }
+}
+
+PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/)
+{
+    std::vector<cv::ocl::Info> oclinfo; // filled by getDevice() in SetUp
+    cv::Size size; // parameter 0: image size
+    int type;      // parameter 1: matrix type
+    bool useRoi;   // never assigned: the UseRoi parameter is commented out
+
+    virtual void SetUp()
+    {
+        //devInfo = GET_PARAM(0);
+        size = GET_PARAM(0);
+        type = GET_PARAM(1);
+        /*useRoi = GET_PARAM(3);*/
+
+        int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+        CV_Assert(devnums > 0); // stop the setup unless getDevice reports a positive value
+    }
+};
+
+TEST_P(Blend, Accuracy)
+{
+    // Compares cv::ocl::blendLinear with the CPU reference blendLinearGold.
+    const bool is8u = CV_MAT_DEPTH(type) == CV_8U;
+    const double maxVal = is8u ? 255.0 : 1.0;
+    cv::Mat img1 = randomMat(size, type, 0.0, maxVal);
+    cv::Mat img2 = randomMat(size, type, 0.0, maxVal);
+    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
+    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
+
+    // Device side: upload the inputs, blend, then download the result.
+    cv::ocl::oclMat gimg1(size, type), gimg2(size, type), gweights1(size, CV_32F), gweights2(size, CV_32F);
+    cv::ocl::oclMat dst(size, type);
+    gimg1.upload(img1);
+    gimg2.upload(img2);
+    gweights1.upload(weights1);
+    gweights2.upload(weights2);
+    cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst);
+    cv::Mat result;
+    dst.download(result);
+
+    cv::Mat result_gold;
+    if (is8u) blendLinearGold<uchar>(img1, img2, weights1, weights2, result_gold);
+    else blendLinearGold<float>(img1, img2, weights1, weights2, result_gold);
+    EXPECT_MAT_NEAR(result_gold, result, is8u ? 1 : 1e-5f, NULL)
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine( // every size paired with every listed type
+	DIFFERENT_SIZES,
+	testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4))
+));
--- a/modules/ocl/test/test_columnsum.cpp
+++ b/modules/ocl/test/test_columnsum.cpp
@@ -0,0 +1,108 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//	   Chunpeng Zhang chunpeng@multicorewareinc.com
+//    
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+
+///////////////////////////////////////////////////////////////////////////////
+/// ColumnSum
+
+#ifdef HAVE_OPENCL
+
+////////////////////////////////////////////////////////////////////////
+// ColumnSum
+
+PARAM_TEST_CASE(ColumnSum, cv::Size, bool )
+{
+    cv::Size size; // parameter 0: size of the random input
+    cv::Mat src;   // NOTE(review): shadowed by the local `src` in the Accuracy test
+	bool useRoi; // parameter 1: forwarded to loadMat() by the test
+	std::vector<cv::ocl::Info> oclinfo;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+		useRoi = GET_PARAM(1);
+        int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+        CV_Assert(devnums > 0);
+    }
+};
+
+TEST_P(ColumnSum, Accuracy)
+{
+    // Checks cv::ocl::columnSum against a running per-column sum computed on the host.
+    cv::Mat src = randomMat(size, CV_32FC1);
+
+    // d_dst starts out as a device copy of src and is then used as the output.
+    cv::ocl::oclMat d_dst = loadMat(src,useRoi);
+    cv::ocl::oclMat d_src = loadMat(src,useRoi);
+    cv::ocl::columnSum(d_src, d_dst);
+
+    // Bring the device result back to the host for element-wise checks.
+    cv::Mat dst(d_dst);
+
+    for (int row = 0; row < src.rows; ++row)
+    {
+        for (int col = 0; col < src.cols; ++col)
+        {
+            // Rows after the first add the row above in place, so src
+            // itself becomes the table of expected column sums.
+            if (row > 0)
+            {
+                src.at<float>(row, col) += src.at<float>(row - 1, col);
+            }
+            const float expected = src.at<float>(row, col);
+            const float actual = dst.at<float>(row, col);
+            ASSERT_NEAR(actual, expected, 1e-5);
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ColumnSum, testing::Combine(
+						DIFFERENT_SIZES,testing::Values(false,true))); // second parameter is the fixture's bool useRoi
+
+
+#endif 
--- a/modules/ocl/test/test_fft.cpp
+++ b/modules/ocl/test/test_fft.cpp
@@ -0,0 +1,97 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+using namespace std;
+#ifdef HAVE_CLAMDFFT
+////////////////////////////////////////////////////////////////////////////
+// Dft
+PARAM_TEST_CASE(Dft, cv::Size, bool)
+{
+	cv::Size dft_size; // parameter 0: size of the random input matrix
+	bool	 dft_rows; // parameter 1: adds cv::DFT_ROWS to the flags in the C2C test
+	std::vector<cv::ocl::Info> oclinfo;
+    virtual void SetUp()
+    {
+	    int devnums = getDevice(oclinfo);
+        CV_Assert(devnums > 0);
+		dft_size = GET_PARAM(0);
+		dft_rows = GET_PARAM(1);
+    }
+};
+
+TEST_P(Dft, C2C)
+{
+	// Forward transform of a two-channel float matrix: cv::ocl::dft is compared with cv::dft.
+	cv::Mat a = randomMat(dft_size, CV_32FC2, 0.0, 10.0);
+	const int flags = dft_rows ? cv::DFT_ROWS : 0;
+
+	cv::Mat b_gold;
+	cv::dft(a, b_gold, flags);
+
+	cv::ocl::oclMat d_b;
+	cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags);
+	EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4, "");
+}
+
+
+TEST_P(Dft, R2CthenC2R)
+{
+	// Round trip: forward DFT of a single-channel input, then an inverse DFT with real output;
+	// the test expects the result to match the input. DFT_ROWS is left out ("not supported yet").
+	cv::Mat a = randomMat(dft_size, CV_32FC1, 0.0, 10.0);
+	const int fwdFlags = 0;
+	const int invFlags = fwdFlags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT;
+	cv::ocl::oclMat d_b, d_c;
+	cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), fwdFlags);
+	cv::ocl::dft(d_b, d_c, a.size(), invFlags);
+	EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
+}
+
+INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine(
+    testing::Values(cv::Size(5, 4), cv::Size(20, 20)), // dft_size
+    testing::Values(false, true))); // dft_rows
+
+#endif // HAVE_CLAMDFFT
--- a/modules/ocl/test/test_gemm.cpp
+++ b/modules/ocl/test/test_gemm.cpp
@@ -0,0 +1,85 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "precomp.hpp"
+using namespace std;
+#ifdef HAVE_CLAMDBLAS
+////////////////////////////////////////////////////////////////////////////
+// GEMM
+PARAM_TEST_CASE(Gemm, int, cv::Size, int)
+{
+	int      type;     // parameter 0: matrix type
+	cv::Size mat_size; // parameter 1: size shared by all three input matrices
+	int		 flags;    // parameter 2: flags value passed to both gemm calls
+	vector<cv::ocl::Info> info;
+    virtual void SetUp()
+    {
+		type     = GET_PARAM(0);
+		mat_size = GET_PARAM(1);
+		flags    = GET_PARAM(2);
+		CV_Assert(cv::ocl::getDevice(info) > 0); // stop early when getDevice reports nothing, like the other ocl fixtures
+    }
+};
+
+TEST_P(Gemm, Accuracy)
+{
+	// Runs gemm with alpha = beta = 1 on the host and on the device and compares the outputs.
+	cv::Mat a = randomMat(mat_size, type, 0.0, 10.0);
+	cv::Mat b = randomMat(mat_size, type, 0.0, 10.0);
+	cv::Mat c = randomMat(mat_size, type, 0.0, 10.0);
+
+	cv::Mat dst;
+	cv::gemm(a, b, 1.0, c, 1.0, dst, flags);
+
+	cv::ocl::oclMat ocl_dst;
+	cv::ocl::gemm(cv::ocl::oclMat(a), cv::ocl::oclMat(b), 1.0, cv::ocl::oclMat(c), 1.0, ocl_dst, flags);
+	EXPECT_MAT_NEAR(dst, ocl_dst, mat_size.area() * 1e-4, "");
+}
+
+INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
+	testing::Values(CV_32FC1, CV_32FC2/*, CV_64FC1, CV_64FC2*/), // type
+    testing::Values(cv::Size(20, 20), cv::Size(300, 300)), // mat_size
+    testing::Values(0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_1_T + cv::GEMM_2_T))); // flags
+#endif
--- a/modules/ocl/test/test_match_template.cpp
+++ b/modules/ocl/test/test_match_template.cpp
@@ -0,0 +1,172 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "precomp.hpp"
+#define PERF_TEST 0
+
+////////////////////////////////////////////////////////////////////////////////
+// MatchTemplate
+#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
+// TemplateSize: presumably a distinct parameter type wrapping cv::Size, so it differs from the image-size parameter.
+IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
+// Printable names looked up as TEMPLATE_METHOD_NAMES[method]; assumes the cv::TM_* values are 0..5 in this order - TODO confirm.
+const char* TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
+
+PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMethod)
+{
+    cv::Size size;       // parameter 0: image size
+    cv::Size templ_size; // parameter 1: template size
+    int cn;              // parameter 2: channel count
+    int method;          // parameter 3: method value passed to matchTemplate
+	std::vector<cv::ocl::Info> oclinfo;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+        templ_size = GET_PARAM(1);
+        cn = GET_PARAM(2);
+        method = GET_PARAM(3);
+        int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+        CV_Assert(devnums > 0);
+    }
+};
+
+TEST_P(MatchTemplate8U, Accuracy)
+{
+    // Compares cv::ocl::matchTemplate with cv::matchTemplate on random 8-bit inputs.
+    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
+    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
+
+    cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ);
+    cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
+
+    cv::Mat dst_gold;
+    cv::matchTemplate(image, templ, dst_gold, method);
+
+    char sss [100] = "";
+
+    cv::Mat mat_dst;
+    dst.download(mat_dst);
+
+    // The tolerance scales with the template area.
+    EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss);
+
+#if PERF_TEST
+	{
+		// The parameter banner is printed only next to the timings, as in MatchTemplate32F.
+		std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
+		std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
+		std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
+		std::cout << "Channels: " << cn << std::endl;
+		P_TEST_FULL({}, {cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);}, {});
+		P_TEST_FULL({}, {cv::matchTemplate(image, templ, dst_gold, method);}, {});
+	}
+#endif // PERF_TEST
+}
+
+PARAM_TEST_CASE(MatchTemplate32F, cv::Size, TemplateSize, Channels, TemplateMethod)
+{
+    cv::Size size;       // parameter 0: image size
+    cv::Size templ_size; // parameter 1: template size
+    int cn;              // parameter 2: channel count
+    int method;          // parameter 3: method value passed to matchTemplate
+	std::vector<cv::ocl::Info> oclinfo;
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+        templ_size = GET_PARAM(1);
+        cn = GET_PARAM(2);
+        method = GET_PARAM(3);
+        int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+        CV_Assert(devnums > 0);
+    }
+};
+
+TEST_P(MatchTemplate32F, Accuracy)
+{
+    // Compares cv::ocl::matchTemplate with cv::matchTemplate on random 32-bit float inputs.
+    const int type = CV_MAKETYPE(CV_32F, cn);
+    cv::Mat image = randomMat(size, type);
+    cv::Mat templ = randomMat(templ_size, type);
+
+    cv::ocl::oclMat d_result, d_image(image), d_templ(templ);
+    cv::ocl::matchTemplate(d_image, d_templ, d_result, method);
+    cv::Mat actual;
+    d_result.download(actual);
+
+    cv::Mat expected;
+    cv::matchTemplate(image, templ, expected, method);
+
+    char msg[100] = "";
+    EXPECT_MAT_NEAR(expected, actual, templ_size.area() * 1e-1, msg);
+
+#if PERF_TEST
+    {
+        std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
+        std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
+        std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
+        std::cout << "Channels: " << cn << std::endl;
+        P_TEST_FULL({}, {cv::ocl::matchTemplate(d_image, d_templ, d_result, method);}, {});
+        P_TEST_FULL({}, {cv::matchTemplate(image, templ, expected, method);}, {});
+    }
+#endif // PERF_TEST
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, 
+	testing::Combine(
+    DIFFERENT_SIZES, // image sizes
+    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), // template sizes
+    testing::Values(Channels(1), Channels(4)), // channel counts
+	ALL_TEMPLATE_METHODS // all six methods listed in the macro
+	)
+);
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
+    DIFFERENT_SIZES, // image sizes
+    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), // template sizes
+    testing::Values(Channels(1), Channels(4)), // channel counts
+    testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))); // 32F runs only TM_SQDIFF and TM_CCORR
+
--- a/modules/ocl/test/test_pyrdown.cpp
+++ b/modules/ocl/test/test_pyrdown.cpp
@@ -0,0 +1,295 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+//#define PRINT_CPU_TIME 1000
+//#define PRINT_TIME
+
+
+#include "precomp.hpp"
+#include <iomanip>
+
+#ifdef HAVE_OPENCL
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+PARAM_TEST_CASE(PyrDown, MatType, bool)
+{ // Fixture: random host matrices built in SetUp, plus the ROI views and device copies made by random_roi().
+    int type; // parameter 0; the bool parameter 1 is never read
+    cv::Scalar val;
+
+    //src mat
+    cv::Mat mat1;
+    cv::Mat mat2;
+    cv::Mat mask;
+    cv::Mat dst;
+    cv::Mat dst1; //bak, for two outputs
+
+    // set up roi
+    int roicols;
+    int roirows;
+    int src1x;
+    int src1y;
+    int src2x;
+    int src2y;
+    int dstx;
+    int dsty;
+    int maskx;
+    int masky;
+
+
+    //src mat with roi
+    cv::Mat mat1_roi;
+    cv::Mat mat2_roi;
+    cv::Mat mask_roi;
+    cv::Mat dst_roi;
+    cv::Mat dst1_roi; //bak
+    std::vector<cv::ocl::Info> oclinfo;
+    //ocl dst mat for testing
+    cv::ocl::oclMat gdst_whole;
+    cv::ocl::oclMat gdst1_whole; //bak
+
+    //ocl mat with roi
+    cv::ocl::oclMat gmat1;
+    cv::ocl::oclMat gmat2;
+    cv::ocl::oclMat gdst;
+    cv::ocl::oclMat gdst1;   //bak
+    cv::ocl::oclMat gmask;
+
+    virtual void SetUp()
+    {
+        type = GET_PARAM(0);
+
+        cv::RNG &rng = TS::ptr()->get_rng();
+
+        cv::Size size(MWIDTH, MHEIGHT);
+
+        mat1 = randomMat(rng, size, type, 5, 16, false);
+        mat2 = randomMat(rng, size, type, 5, 16, false);
+        dst  = randomMat(rng, size, type, 5, 16, false);
+        dst1  = randomMat(rng, size, type, 5, 16, false);
+        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+
+        // The last argument is the threshold type, not a matrix type; THRESH_BINARY equals the 0 that CV_8UC1 supplied before.
+        cv::threshold(mask, mask, 0.5, 255., cv::THRESH_BINARY);
+
+        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+
+        int devnums = getDevice(oclinfo);
+        CV_Assert(devnums > 0);
+        //if you want to use undefault device, set it here
+        //setDevice(oclinfo[0]);
+    }
+
+	void Cleanup() // releases every host and device matrix held by the fixture
+	{
+		mat1.release();
+		mat2.release();
+		mask.release();
+		dst.release();
+		dst1.release();
+		mat1_roi.release();
+		mat2_roi.release();
+		mask_roi.release();
+		dst_roi.release();
+		dst1_roi.release();
+
+		gdst_whole.release();
+		gdst1_whole.release();
+		gmat1.release();
+		gmat2.release();
+		gdst.release();
+		gdst1.release();
+		gmask.release();
+	}
+
+    void random_roi() // picks the ROI rectangles, then refreshes the ROI views and the device matrices
+    {
+        cv::RNG &rng = TS::ptr()->get_rng();
+
+#ifdef RANDOMROI
+        //randomize ROI
+        roicols = rng.uniform(1, mat1.cols);
+        roirows = rng.uniform(1, mat1.rows);
+        src1x   = rng.uniform(0, mat1.cols - roicols);
+        src1y   = rng.uniform(0, mat1.rows - roirows);
+        dstx    = rng.uniform(0, dst.cols  - roicols);
+        dsty    = rng.uniform(0, dst.rows  - roirows);
+#else
+        roicols = mat1.cols;
+        roirows = mat1.rows;
+        src1x = 0;
+        src1y = 0;
+        dstx = 0;
+        dsty = 0;
+#endif
+        maskx   = rng.uniform(0, mask.cols - roicols); // mask and mat2 offsets are drawn in both configurations
+        masky   = rng.uniform(0, mask.rows - roirows);
+        src2x   = rng.uniform(0, mat2.cols - roicols);
+        src2y   = rng.uniform(0, mat2.rows - roirows);
+        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
+        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
+        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
+        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
+
+        gdst_whole = dst;
+        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+        gdst1_whole = dst1;
+        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
+
+        gmat1 = mat1_roi;
+        gmat2 = mat2_roi;
+        gmask = mask_roi; //end
+    }
+
+};
+
+#define VARNAME(A) string(#A); // NOTE(review): the trailing ';' is part of the expansion; the macro is not used in this file
+
+
+void PrePrint() // Debug helper, currently a no-op: its only content is the commented-out dump below.
+{
+		//for(int i = 0; i < MHEIGHT; i++)
+		//{
+		//	printf("(%d) ", i);
+		//	for(int k = 0; k < MWIDTH; k++)
+		//	{
+		//		printf("%d ", mat1_roi.data[i * MHEIGHT + k]);
+		//	}
+		//	printf("\n");
+		//}
+}
+
+void PostPrint() // Debug helper, currently a no-op: the result dumps below are all commented out.
+{
+		//dst_roi.convertTo(dst_roi,CV_32S);
+		//cpu_dst.convertTo(cpu_dst,CV_32S);
+		//dst_roi -= cpu_dst;
+		//cpu_dst -= dst_roi;
+		//for(int i = 0; i < MHEIGHT / 2; i++)
+		//{
+		//	printf("(%d) ", i);
+		//	for(int k = 0; k < MWIDTH / 2; k++)
+		//	{
+		//		if(gmat1.depth() == 0)
+		//		{
+		//			if(gmat1.channels() == 1)
+		//			{
+		//				printf("%d ", dst_roi.data[i * MHEIGHT / 2 + k]);
+		//			}
+		//			else
+		//			{
+		//				printf("%d ", ((unsigned*)dst_roi.data)[i * MHEIGHT / 2 + k]);
+		//			}
+		//		}
+		//		else if(gmat1.depth() == 5)
+		//		{
+		//			printf("%.6f ", ((float*)dst_roi.data)[i * MHEIGHT / 2 + k]);
+		//		}
+		//	}
+		//	printf("\n");
+		//}
+		//for(int i = 0; i < MHEIGHT / 2; i++)
+		//{
+		//	printf("(%d) ", i);
+		//	for(int k = 0; k < MWIDTH / 2; k++)
+		//	{
+		//		if(gmat1.depth() == 0)
+		//		{
+		//			if(gmat1.channels() == 1)
+		//			{
+		//				printf("%d ", cpu_dst.data[i * MHEIGHT / 2 + k]);
+		//			}
+		//			else
+		//			{
+		//				printf("%d ", ((unsigned*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
+		//			}
+		//		}
+		//		else if(gmat1.depth() == 5)
+		//		{
+		//			printf("%.6f ", ((float*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
+		//		}
+		//	}
+		//	printf("\n");
+		//}
+}
+
+////////////////////////////////PyrDown/////////////////////////////////////////////////
+//struct PyrDown : ArithmTestBase {};
+
+TEST_P(PyrDown, Mat)
+{
+    // Runs cv::pyrDown and cv::ocl::pyrDown on the same ROI and compares the two results.
+    for(int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        cv::pyrDown(mat1_roi, dst_roi);
+        cv::ocl::pyrDown(gmat1, gdst);
+
+        cv::Mat cpu_dst;
+        gdst.download(cpu_dst);
+        char s[1024];
+        sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
+        EXPECT_MAT_NEAR(dst_roi, cpu_dst, dst_roi.depth() == CV_32F ? 1e-5f : 1.0f, s);
+    }
+    // Release only after the last pass: Cleanup() frees the matrices created in SetUp, which random_roi() reads on every iteration.
+    Cleanup();
+}
+
+
+
+
+//********test****************
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
+                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), // matrix types under test
+                            Values(false))); // reserved bool parameter; the fixture's SetUp reads only GET_PARAM(0)
+
+
+#endif // HAVE_OPENCL
--- a/modules/ocl/test/test_pyrup.cpp
+++ b/modules/ocl/test/test_pyrup.cpp
@@ -0,0 +1,91 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Chunpeng chunpeng@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencv2/core/core.hpp"
+
+#ifdef HAVE_OPENCL
+
+
+PARAM_TEST_CASE(PyrUp,cv::Size,int)
+{
+	cv::Size size; // parameter 0: size of the random source matrix
+	int type; // parameter 1: matrix type of the source
+	std::vector<cv::ocl::Info> oclinfo;
+
+	virtual void SetUp()
+	{
+		int devnums = cv::ocl::getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
+		CV_Assert(devnums > 0);
+		size = GET_PARAM(0);
+		type = GET_PARAM(1);
+	}
+};
+
+TEST_P(PyrUp,Accuracy)
+{
+	// Compares cv::ocl::pyrUp with cv::pyrUp on a random matrix of the parameterised size and type.
+	cv::Mat src = randomMat(size,type);
+
+	// Host reference.
+	cv::Mat expected;
+	cv::pyrUp(src,expected);
+
+	// Device result.
+	cv::ocl::oclMat d_dst;
+	cv::ocl::oclMat d_src(src);
+	cv::ocl::pyrUp(d_src,d_dst);
+	char msg[100]={0};
+	EXPECT_MAT_NEAR(expected, d_dst, (src.depth() == CV_32F ? 1e-4f : 1.0),msg);
+}
+
+#if 1
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
+    testing::Values(cv::Size(32, 32)), // single source size
+    testing::Values(MatType(CV_8UC1),MatType(CV_16UC1),MatType(CV_32FC1),MatType(CV_8UC4), // source types
+	MatType(CV_16UC4),MatType(CV_32FC4))));
+#endif
+
+#endif // HAVE_OPENCL
--- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp
@@ -43,6 +43,7 @@
 #ifndef __OPENCV_STITCHING_SEAM_FINDERS_HPP__
 #define __OPENCV_STITCHING_SEAM_FINDERS_HPP__

+#include <set>
 #include "opencv2/core/core.hpp"
 #include "opencv2/opencv_modules.hpp"

@@ -92,6 +93,114 @@ private:
 };


+// Seam finder that resolves the overlap of every image pair by splitting the
+// union of the two masks into connected components and estimating seams with
+// dynamic programming.
+class CV_EXPORTS DpSeamFinder : public SeamFinder
+{
+public:
+    // Cost used when a seam is estimated (see computeCosts()):
+    //   COLOR      - color difference between the two images,
+    //   COLOR_GRAD - the same color difference divided by a gradient-based term.
+    enum CostFunction { COLOR, COLOR_GRAD };
+
+    DpSeamFinder(CostFunction costFunc = COLOR);
+
+    CostFunction costFunction() const { return costFunc_; }
+    void setCostFunction(CostFunction val) { costFunc_ = val; }
+
+    // Processes every pair of images from src and updates masks in place.
+    virtual void find(const std::vector<Mat> &src, const std::vector<Point> &corners,
+                      std::vector<Mat> &masks);
+
+private:
+    // Bit flags stored per connected component in states_.
+    enum ComponentState
+    {
+        FIRST = 1, SECOND = 2, INTERS = 4,
+        INTERS_FIRST = INTERS | FIRST,
+        INTERS_SECOND = INTERS | SECOND
+    };
+
+    // Orders pairs of image indices by the squared distance between the
+    // centers of the two images of each pair (smaller distance first).
+    class ImagePairLess
+    {
+    public:
+        // Only pointers to the first elements are kept, so both vectors must be
+        // non-empty and must outlive the comparator.
+        ImagePairLess(const std::vector<Mat> &images, const std::vector<Point> &corners)
+            : src_(&images[0]), corners_(&corners[0]) {}
+
+        bool operator() (const std::pair<int, int> &l, const std::pair<int, int> &r) const
+        {
+            return centerDist2(l) < centerDist2(r);
+        }
+
+    private:
+        // Center of image idx: its corner plus half of its size.
+        Point center(int idx) const
+        {
+            return corners_[idx] + Point(src_[idx].cols / 2, src_[idx].rows / 2);
+        }
+
+        // Squared distance between the centers of the two images of a pair.
+        int centerDist2(const std::pair<int, int> &p) const
+        {
+            Point diff = center(p.first) - center(p.second);
+            return diff.dot(diff);
+        }
+
+        const Mat *src_;
+        const Point *corners_;
+    };
+
+    // Predicate that is true when two points are closer than minDist
+    // (compared on squared distances).
+    class ClosePoints
+    {
+    public:
+        ClosePoints(int minDist) : minDist_(minDist) {}
+
+        bool operator() (const Point &p1, const Point &p2) const
+        {
+            const int dx = p1.x - p2.x;
+            const int dy = p1.y - p2.y;
+            return dx * dx + dy * dy < minDist_ * minDist_;
+        }
+
+    private:
+        int minDist_;
+    };
+
+    // Handles a single image pair; returns early when the images do not overlap.
+    void process(const Mat &image1, const Mat &image2, Point tl1, Point tl2,
+                 Mat &mask1, Mat &mask2);
+
+    // Labels connected components of the union and collects their data.
+    void findComponents();
+
+    // Fills edges_ with pairs of adjacent components.
+    void findEdges();
+
+    // Resolves conflicts between adjacent components and updates both masks.
+    void resolveConflicts(const Mat &image1, const Mat &image2,
+                          Point tl1, Point tl2, Mat &mask1, Mat &mask2);
+
+    // Computes Sobel derivatives of both images (COLOR_GRAD cost only).
+    void computeGradients(const Mat &image1, const Mat &image2);
+
+    bool hasOnlyOneNeighbor(int c);
+
+    // True when contourMask has a non-zero pixel near (y, x).
+    bool closeToContour(int y, int x, const Mat_<uchar> &contourMask);
+
+    // Finds the two end points of a seam between components c1 and c2.
+    bool getSeamTips(int c1, int c2, Point &p1, Point &p2);
+
+    void computeCosts(const Mat &image1, const Mat &image2, Point tl1, Point tl2,
+                      int c, Mat_<float> &costV, Mat_<float> &costH);
+
+    // Estimates a seam from p1 to p2 inside component c; false if p2 is unreachable.
+    bool estimateSeam(
+            const Mat &image1, const Mat &image2, Point tl1, Point tl2, int c,
+            Point p1, Point p2, std::vector<Point> &seam, bool &isHorizontal);
+
+    void updateLabelsUsingSeam(int c1, int c2, const std::vector<Point> &seam,
+                               bool isHorizontalSeam);
+
+    CostFunction costFunc_;
+
+    // processing images pair data (all in coordinates of the images' union)
+    Point unionTl_, unionBr_;
+    Size unionSize_;
+    Mat_<uchar> mask1_, mask2_;
+    Mat_<uchar> contour1mask_, contour2mask_;
+    Mat_<float> gradx1_, grady1_;
+    Mat_<float> gradx2_, grady2_;
+
+    // components data (component i has label i+1 in labels_, 0 means no component)
+    int ncomps_;
+    Mat_<int> labels_;
+    std::vector<ComponentState> states_;
+    std::vector<Point> tls_, brs_;
+    std::vector<std::vector<Point> > contours_;
+    std::set<std::pair<int, int> > edges_;
+};
+
+
 class CV_EXPORTS GraphCutSeamFinderBase
 {
 public:
--- a/modules/stitching/src/seam_finders.cpp
+++ b/modules/stitching/src/seam_finders.cpp
@@ -41,6 +41,9 @@
 //M*/

 #include "precomp.hpp"
+#include <map>
+
+using namespace std;

 namespace cv {
 namespace detail {
@@ -152,6 +155,866 @@ void VoronoiSeamFinder::findInPair(size_t first, size_t second, Rect roi)
 }


+// Remembers the cost function that is used when seams are estimated.
+DpSeamFinder::DpSeamFinder(CostFunction costFunc)
+    : costFunc_(costFunc)
+{
+}
+
+
+// Runs process() on every pair of images. Pairs are visited in order of
+// decreasing squared distance between the image centers (ascending sort with
+// ImagePairLess followed by a reversal). The masks are modified in place.
+void DpSeamFinder::find(const vector<Mat> &src, const vector<Point> &corners, vector<Mat> &masks)
+{
+    LOGLN("Finding seams...");
+    int64 startTicks = getTickCount();
+
+    if (src.empty())
+        return;
+
+    // enumerate all index pairs (first, second) with first < second
+    const size_t numImages = src.size();
+    vector<pair<int, int> > pairs;
+
+    for (size_t first = 0; first+1 < numImages; ++first)
+    {
+        for (size_t second = first+1; second < numImages; ++second)
+            pairs.push_back(make_pair(first, second));
+    }
+
+    sort(pairs.begin(), pairs.end(), ImagePairLess(src, corners));
+    reverse(pairs.begin(), pairs.end());
+
+    for (vector<pair<int, int> >::const_iterator itr = pairs.begin(); itr != pairs.end(); ++itr)
+    {
+        const int a = itr->first;
+        const int b = itr->second;
+        process(src[a], src[b], corners[a], corners[b], masks[a], masks[b]);
+    }
+
+    LOGLN("Finding seams, time: " << ((getTickCount() - startTicks) / getTickFrequency()) << " sec");
+}
+
+
+// Prepares the per-pair working data (union rectangle, masks and contour masks
+// in union coordinates) and then resolves the conflicts between the two images.
+// Nothing is done when the image rectangles do not intersect.
+void DpSeamFinder::process(
+        const Mat &image1, const Mat &image2, Point tl1, Point tl2,
+        Mat &mask1, Mat &mask2)
+{
+    CV_Assert(image1.size() == mask1.size());
+    CV_Assert(image2.size() == mask2.size());
+
+    // bottom-right corners (exclusive) of both images
+    const Point br1(tl1.x + image1.cols, tl1.y + image1.rows);
+    const Point br2(tl2.x + image2.cols, tl2.y + image2.rows);
+
+    const Point intersectTl(std::max(tl1.x, tl2.x), std::max(tl1.y, tl2.y));
+    const Point intersectBr(std::min(br1.x, br2.x), std::min(br1.y, br2.y));
+
+    if (intersectTl.x >= intersectBr.x || intersectTl.y >= intersectBr.y)
+        return; // there are no conflicts
+
+    unionTl_ = Point(std::min(tl1.x, tl2.x), std::min(tl1.y, tl2.y));
+    unionBr_ = Point(std::max(br1.x, br2.x), std::max(br1.y, br2.y));
+    unionSize_ = Size(unionBr_.x - unionTl_.x, unionBr_.y - unionTl_.y);
+
+    // place both masks into union-sized buffers
+
+    mask1_ = Mat::zeros(unionSize_, CV_8U);
+    mask2_ = Mat::zeros(unionSize_, CV_8U);
+
+    Mat roi1 = mask1_(Rect(tl1.x - unionTl_.x, tl1.y - unionTl_.y, mask1.cols, mask1.rows));
+    mask1.copyTo(roi1);
+
+    Mat roi2 = mask2_(Rect(tl2.x - unionTl_.x, tl2.y - unionTl_.y, mask2.cols, mask2.rows));
+    mask2.copyTo(roi2);
+
+    // find both images contour masks: a mask pixel is a contour pixel when it
+    // lies on the union border or has an unset 4-neighbor
+
+    contour1mask_ = Mat::zeros(unionSize_, CV_8U);
+    contour2mask_ = Mat::zeros(unionSize_, CV_8U);
+
+    const int width = unionSize_.width;
+    const int height = unionSize_.height;
+
+    for (int y = 0; y < height; ++y)
+    {
+        const bool onTop = y == 0;
+        const bool onBottom = y == height-1;
+
+        for (int x = 0; x < width; ++x)
+        {
+            const bool onLeft = x == 0;
+            const bool onRight = x == width-1;
+
+            if (mask1_(y, x) &&
+                (onLeft || !mask1_(y, x-1) || onRight || !mask1_(y, x+1) ||
+                 onTop || !mask1_(y-1, x) || onBottom || !mask1_(y+1, x)))
+            {
+                contour1mask_(y, x) = 255;
+            }
+
+            if (mask2_(y, x) &&
+                (onLeft || !mask2_(y, x-1) || onRight || !mask2_(y, x+1) ||
+                 onTop || !mask2_(y-1, x) || onBottom || !mask2_(y+1, x)))
+            {
+                contour2mask_(y, x) = 255;
+            }
+        }
+    }
+
+    findComponents();
+
+    findEdges();
+
+    resolveConflicts(image1, image2, tl1, tl2, mask1, mask2);
+}
+
+
+// Labels all connected components of the union and gathers, for each of them,
+// its state, bounding box (tls_/brs_, bottom-right exclusive) and contour points.
+void DpSeamFinder::findComponents()
+{
+    // temporary labels for pixels that are not assigned to a component yet
+    const int bothMarker = numeric_limits<int>::max();
+    const int firstMarker = numeric_limits<int>::max()-1;
+    const int secondMarker = numeric_limits<int>::max()-2;
+
+    ncomps_ = 0;
+    labels_.create(unionSize_);
+    states_.clear();
+    tls_.clear();
+    brs_.clear();
+    contours_.clear();
+
+    const int width = unionSize_.width;
+    const int height = unionSize_.height;
+
+    // first pass: mark every pixel by the mask(s) it belongs to
+
+    for (int y = 0; y < height; ++y)
+    {
+        for (int x = 0; x < width; ++x)
+        {
+            const bool inFirst = mask1_(y, x) != 0;
+            const bool inSecond = mask2_(y, x) != 0;
+
+            int marker = 0;
+            if (inFirst && inSecond)
+                marker = bothMarker;
+            else if (inFirst)
+                marker = firstMarker;
+            else if (inSecond)
+                marker = secondMarker;
+
+            labels_(y, x) = marker;
+        }
+    }
+
+    // second pass: flood fill every still marked region with a new label and
+    // accumulate the bounding box and the contour of each component
+
+    for (int y = 0; y < height; ++y)
+    {
+        for (int x = 0; x < width; ++x)
+        {
+            const int marker = labels_(y, x);
+
+            if (marker >= secondMarker)
+            {
+                if (marker == bothMarker)
+                    states_.push_back(INTERS);
+                else if (marker == firstMarker)
+                    states_.push_back(FIRST);
+                else if (marker == secondMarker)
+                    states_.push_back(SECOND);
+
+                floodFill(labels_, Point(x, y), ++ncomps_);
+                tls_.push_back(Point(x, y));
+                brs_.push_back(Point(x+1, y+1));
+                contours_.push_back(vector<Point>());
+            }
+
+            // read the label again, the flood fill may have just changed it
+            const int l = labels_(y, x);
+            if (!l)
+                continue;
+
+            const int ci = l-1;
+            Point &tl = tls_[ci];
+            Point &br = brs_[ci];
+
+            tl.x = std::min(tl.x, x);
+            tl.y = std::min(tl.y, y);
+            br.x = std::max(br.x, x+1);
+            br.y = std::max(br.y, y+1);
+
+            if (x == 0 || labels_(y, x-1) != l || x == width-1 || labels_(y, x+1) != l ||
+                y == 0 || labels_(y-1, x) != l || y == height-1 || labels_(y+1, x) != l)
+            {
+                contours_[ci].push_back(Point(x, y));
+            }
+        }
+    }
+}
+
+
+// Rebuilds edges_: the ordered pair (a, b) is an edge when at least one contour
+// pixel of component a or b has a 4-neighbor that belongs to the other one.
+void DpSeamFinder::findEdges()
+{
+    typedef pair<int, int> Edge;
+    typedef map<Edge, int> EdgeWeights;
+
+    EdgeWeights wedges; // weighted edges
+
+    // start with a zero weight for every ordered pair of distinct components
+
+    for (int ci = 0; ci < ncomps_-1; ++ci)
+    {
+        for (int cj = ci+1; cj < ncomps_; ++cj)
+        {
+            wedges[Edge(ci, cj)] = 0;
+            wedges[Edge(cj, ci)] = 0;
+        }
+    }
+
+    // count contacts between contour pixels and pixels of other components
+
+    static const int dx[] = {-1, 0, +1, 0};
+    static const int dy[] = {0, -1, 0, +1};
+
+    for (int ci = 0; ci < ncomps_; ++ci)
+    {
+        const int l = ci + 1;
+        const vector<Point> &contour = contours_[ci];
+
+        for (size_t i = 0; i < contour.size(); ++i)
+        {
+            for (int k = 0; k < 4; ++k)
+            {
+                const int nx = contour[i].x + dx[k];
+                const int ny = contour[i].y + dy[k];
+
+                if (nx < 0 || nx >= unionSize_.width || ny < 0 || ny >= unionSize_.height)
+                    continue;
+
+                const int nl = labels_(ny, nx);
+                if (!nl || nl == l)
+                    continue;
+
+                wedges[Edge(ci, nl-1)]++;
+                wedges[Edge(nl-1, ci)]++;
+            }
+        }
+    }
+
+    // keep only the pairs that actually touch
+
+    edges_.clear();
+
+    for (EdgeWeights::const_iterator itr = wedges.begin(); itr != wedges.end(); ++itr)
+    {
+        if (itr->second > 0)
+            edges_.insert(itr->first);
+    }
+}
+
+
+// Repeatedly picks an edge (c1, c2) that is in conflict, resolves it by either
+// merging c1 into c2 or by cutting c1 along an estimated seam, refreshes the
+// bounding boxes and contours of both components and removes the edge.
+// When no conflicting edge is left, mask1 and mask2 are updated from the
+// final labels and component states.
+//
+// image1, image2 - the two images (used for the seam costs)
+// tl1, tl2       - top-left corners of the images
+// mask1, mask2   - image masks, modified in place
+void DpSeamFinder::resolveConflicts(const Mat &image1, const Mat &image2,
+                                    Point tl1, Point tl2, Mat &mask1, Mat &mask2)
+{
+    // gradients are needed by computeCosts() only for the COLOR_GRAD cost
+    if (costFunc_ == COLOR_GRAD)
+        computeGradients(image1, image2);
+
+    // resolve conflicts between components
+
+    bool hasConflict = true;
+    while (hasConflict)
+    {
+        // c1 and c2 are assigned only when a conflicting edge is found below
+        int c1, c2;
+        hasConflict = false;
+
+        for (set<pair<int, int> >::iterator itr = edges_.begin(); itr != edges_.end(); ++itr)
+        {
+            c1 = itr->first;
+            c2 = itr->second;
+
+            // conflict: c1 carries the INTERS bit and its remaining bits
+            // differ from the state of c2
+            if ((states_[c1] & INTERS) && (states_[c1] & (~INTERS)) != states_[c2])
+            {
+                hasConflict = true;
+                break;
+            }
+        }
+
+        if (hasConflict)
+        {
+            int l1 = c1+1, l2 = c2+1;
+
+            if (hasOnlyOneNeighbor(c1))
+            {
+                // if the first components has only one adjacent component
+
+                // relabel every pixel of c1 (inside its bounding box) as c2
+                for (int y = tls_[c1].y; y < brs_[c1].y; ++y)
+                    for (int x = tls_[c1].x; x < brs_[c1].x; ++x)
+                        if (labels_(y, x) == l1)
+                            labels_(y, x) = l2;
+
+                states_[c1] = states_[c2] == FIRST ? SECOND : FIRST;
+            }
+            else
+            {
+                // if the first component has more than one adjacent component
+
+                // try to cut c1 along a seam; labels stay untouched when no
+                // seam tips or no seam can be found
+                Point p1, p2;
+                if (getSeamTips(c1, c2, p1, p2))
+                {
+                    vector<Point> seam;
+                    bool isHorizontalSeam;
+
+                    if (estimateSeam(image1, image2, tl1, tl2, c1, p1, p2, seam, isHorizontalSeam))
+                        updateLabelsUsingSeam(c1, c2, seam, isHorizontalSeam);
+                }
+
+                states_[c1] = states_[c2] == FIRST ? INTERS_SECOND : INTERS_FIRST;
+            }
+
+            const int c[] = {c1, c2};
+            const int l[] = {l1, l2};
+
+            for (int i = 0; i < 2; ++i)
+            {
+                // update information about the (i+1)-th component
+
+                // only the previous bounding box of the component is rescanned
+                int x0 = tls_[c[i]].x, x1 = brs_[c[i]].x;
+                int y0 = tls_[c[i]].y, y1 = brs_[c[i]].y;
+
+                tls_[c[i]] = Point(numeric_limits<int>::max(), numeric_limits<int>::max());
+                brs_[c[i]] = Point(numeric_limits<int>::min(), numeric_limits<int>::min());
+                contours_[c[i]].clear();
+
+                for (int y = y0; y < y1; ++y)
+                {
+                    for (int x = x0; x < x1; ++x)
+                    {
+                        if (labels_(y, x) == l[i])
+                        {
+                            tls_[c[i]].x = std::min(tls_[c[i]].x, x);
+                            tls_[c[i]].y = std::min(tls_[c[i]].y, y);
+                            brs_[c[i]].x = std::max(brs_[c[i]].x, x+1);
+                            brs_[c[i]].y = std::max(brs_[c[i]].y, y+1);
+
+                            // contour pixel: on the union border or next to a
+                            // pixel with a different label
+                            if ((x == 0 || labels_(y, x-1) != l[i]) || (x == unionSize_.width-1 || labels_(y, x+1) != l[i]) ||
+                                (y == 0 || labels_(y-1, x) != l[i]) || (y == unionSize_.height-1 || labels_(y+1, x) != l[i]))
+                            {
+                                contours_[c[i]].push_back(Point(x, y));
+                            }
+                        }
+                    }
+                }
+            }
+
+            // remove edges
+
+            edges_.erase(make_pair(c1, c2));
+            edges_.erase(make_pair(c2, c1));
+        }
+    }
+
+    // update masks
+
+    // (dx, dy) are subtracted from image coordinates to get union coordinates
+    int dx1 = unionTl_.x - tl1.x, dy1 = unionTl_.y - tl1.y;
+    int dx2 = unionTl_.x - tl2.x, dy2 = unionTl_.y - tl2.y;
+
+    // clear mask2 where the component has the FIRST bit and mask1 is set
+    for (int y = 0; y < mask2.rows; ++y)
+    {
+        for (int x = 0; x < mask2.cols; ++x)
+        {
+             int l = labels_(y - dy2, x - dx2);
+             if (l > 0 && (states_[l-1] & FIRST) && mask1.at<uchar>(y - dy2 + dy1, x - dx2 + dx1))
+                mask2.at<uchar>(y, x) = 0;
+        }
+    }
+
+    // clear mask1 where the component has the SECOND bit and mask2 is set
+    for (int y = 0; y < mask1.rows; ++y)
+    {
+        for (int x = 0; x < mask1.cols; ++x)
+        {
+             int l = labels_(y - dy1, x - dx1);
+             if (l > 0 && (states_[l-1] & SECOND) && mask2.at<uchar>(y - dy1 + dy2, x - dx1 + dx2))
+                mask1.at<uchar>(y, x) = 0;
+        }
+    }
+}
+
+
+// Stores the 32-bit float Sobel x/y derivatives of the grayscale versions of
+// both images in gradx1_/grady1_ and gradx2_/grady2_.
+// Must be called only when the cost function is COLOR_GRAD.
+void DpSeamFinder::computeGradients(const Mat &image1, const Mat &image2)
+{
+    CV_Assert(costFunction() == COLOR_GRAD);
+
+    Mat gray1, gray2;
+
+    // first image
+    cvtColor(image1, gray1, CV_BGR2GRAY);
+    Sobel(gray1, gradx1_, CV_32F, 1, 0);
+    Sobel(gray1, grady1_, CV_32F, 0, 1);
+
+    // second image
+    cvtColor(image2, gray2, CV_BGR2GRAY);
+    Sobel(gray2, gradx2_, CV_32F, 1, 0);
+    Sobel(gray2, grady2_, CV_32F, 0, 1);
+}
+
+
+// Returns true when edges_ contains exactly one edge whose first element is c,
+// i.e. the component has exactly one adjacent component.
+// A component without any edge yields false.
+bool DpSeamFinder::hasOnlyOneNeighbor(int c)
+{
+    // Edges are ordered lexicographically, so all edges (c, *) form one
+    // contiguous range. The member lookups are logarithmic, whereas the generic
+    // algorithms have to walk the set's bidirectional iterators linearly.
+    set<pair<int, int> >::iterator begin, end;
+    begin = edges_.lower_bound(make_pair(c, numeric_limits<int>::min()));
+    end = edges_.upper_bound(make_pair(c, numeric_limits<int>::max()));
+
+    // Guard the empty range: incrementing begin when it already equals end
+    // (possibly edges_.end()) would be undefined behaviour.
+    return begin != end && ++begin == end;
+}
+
+
+// Returns true when contourMask has a non-zero pixel inside the square window
+// of radius 2 centered at (y, x); the window is clipped to the union size.
+bool DpSeamFinder::closeToContour(int y, int x, const Mat_<uchar> &contourMask)
+{
+    const int rad = 2;
+
+    // clip the window once instead of testing every offset
+    const int yBegin = std::max(y - rad, 0);
+    const int yEnd = std::min(y + rad, unionSize_.height - 1);
+    const int xBegin = std::max(x - rad, 0);
+    const int xEnd = std::min(x + rad, unionSize_.width - 1);
+
+    for (int yy = yBegin; yy <= yEnd; ++yy)
+    {
+        for (int xx = xBegin; xx <= xEnd; ++xx)
+        {
+            if (contourMask(yy, xx))
+                return true;
+        }
+    }
+
+    return false;
+}
+
+
+// Finds the two end points (tips) of the seam that should separate component
+// c1 (which must have the INTERS bit set) from component c2.
+//
+// Candidate ("special") points are contour points of c1 that are close to the
+// contours of both image masks and 4-adjacent to c2. The candidates are
+// clustered, the two clusters with the most distant centers are selected and
+// the candidate closest to each center becomes a tip.
+//
+// c1, c2 - component indices
+// p1, p2 - output tips (union coordinates), written only on success
+// Returns false when fewer than two candidates or fewer than two clusters exist.
+bool DpSeamFinder::getSeamTips(int c1, int c2, Point &p1, Point &p2)
+{
+    CV_Assert(states_[c1] & INTERS);
+
+    // find special points
+
+    vector<Point> specialPoints;
+    int l2 = c2+1;
+
+    for (size_t i = 0; i < contours_[c1].size(); ++i)
+    {
+        int x = contours_[c1][i].x;
+        int y = contours_[c1][i].y;
+
+        if (closeToContour(y, x, contour1mask_) &&
+            closeToContour(y, x, contour2mask_) &&
+            ((x > 0 && labels_(y, x-1) == l2) ||
+             (y > 0 && labels_(y-1, x) == l2) ||
+             (x < unionSize_.width-1 && labels_(y, x+1) == l2) ||
+             (y < unionSize_.height-1 && labels_(y+1, x) == l2)))
+        {
+            specialPoints.push_back(Point(x, y));
+        }
+    }
+
+    if (specialPoints.size() < 2)
+        return false;
+
+    // find clusters
+
+    vector<int> labels;
+    cv::partition(specialPoints, labels, ClosePoints(10));
+
+    int nlabels = *max_element(labels.begin(), labels.end()) + 1;
+    if (nlabels < 2)
+        return false;
+
+    // per cluster: sum of the coordinates and the list of its points
+    vector<Point> sum(nlabels);
+    vector<vector<Point> > points(nlabels);
+
+    for (size_t i = 0; i < specialPoints.size(); ++i)
+    {
+        sum[labels[i]] += specialPoints[i];
+        points[labels[i]].push_back(specialPoints[i]);
+    }
+
+    // select two most distant clusters
+
+    int idx[2] = {-1,-1};
+    double maxDist = -numeric_limits<double>::max();
+
+    for (int i = 0; i < nlabels-1; ++i)
+    {
+        for (int j = i+1; j < nlabels; ++j)
+        {
+            double size1 = points[i].size(), size2 = points[j].size();
+            double cx1 = cvRound(sum[i].x / size1), cy1 = cvRound(sum[i].y / size1);
+            // both coordinates of the second center are averaged over the
+            // second cluster's own size
+            double cx2 = cvRound(sum[j].x / size2), cy2 = cvRound(sum[j].y / size2);
+
+            double dist = (cx1 - cx2) * (cx1 - cx2) + (cy1 - cy2) * (cy1 - cy2);
+            if (dist > maxDist)
+            {
+                maxDist = dist;
+                idx[0] = i;
+                idx[1] = j;
+            }
+        }
+    }
+
+    // select two points closest to the clusters' centers
+
+    Point p[2];
+
+    for (int i = 0; i < 2; ++i)
+    {
+        double size = points[idx[i]].size();
+        double cx = cvRound(sum[idx[i]].x / size);
+        double cy = cvRound(sum[idx[i]].y / size);
+
+        int closest = -1;
+        double minDist = numeric_limits<double>::max();
+
+        for (size_t j = 0; j < points[idx[i]].size(); ++j)
+        {
+            double dist = (points[idx[i]][j].x - cx) * (points[idx[i]][j].x - cx) +
+                          (points[idx[i]][j].y - cy) * (points[idx[i]][j].y - cy);
+            if (dist < minDist)
+            {
+                minDist = dist;
+                closest = static_cast<int>(j);
+            }
+        }
+
+        p[i] = points[idx[i]][closest];
+    }
+
+    p1 = p[0];
+    p2 = p[1];
+    return true;
+}
+
+
+namespace
+{
+
+// Sum of the squared differences of the three values stored for pixel
+// (y1, x1) of image1 and pixel (y2, x2) of image2. T is the element type of
+// the rows; three elements per pixel are assumed by the indexing.
+template <typename T>
+float diffL2Square(const Mat &image1, int y1, int x1, const Mat &image2, int y2, int x2)
+{
+    const T *px1 = image1.ptr<T>(y1) + 3*x1;
+    const T *px2 = image2.ptr<T>(y2) + 3*x2;
+
+    return static_cast<float>(sqr(px1[0] - px2[0]) +
+                              sqr(px1[1] - px2[1]) +
+                              sqr(px1[2] - px2[2]));
+}
+
+} // namespace
+
+
+// Computes the seam costs inside the bounding box of component c (which must
+// have the INTERS bit set).
+//
+// costV is roi.height x (roi.width+1); entry (y, x) is a real cost when the
+// pixel and its left neighbor both belong to the component.
+// costH is (roi.height+1) x roi.width; entry (y, x) is a real cost when the
+// pixel and its upper neighbor both belong to the component.
+// Every other entry gets a constant "bad region" cost.
+//
+// With COLOR the cost is the mean of the two cross differences between the
+// images; with COLOR_GRAD that value is divided by one plus the sum of the
+// absolute gradients of both images at the two pixels.
+//
+// Raises CV_StsBadArg unless both images are CV_32FC3 or both are CV_8UC3.
+void DpSeamFinder::computeCosts(const Mat &image1, const Mat &image2, Point tl1, Point tl2,
+                                int c, Mat_<float> &costV, Mat_<float> &costH)
+{
+    CV_Assert(states_[c] & INTERS);
+
+    // compute costs
+
+    float (*diff)(const Mat&, int, int, const Mat&, int, int) = 0;
+    if (image1.type() == CV_32FC3 && image2.type() == CV_32FC3)
+        diff = diffL2Square<float>;
+    else if (image1.type() == CV_8UC3 && image2.type() == CV_8UC3)
+        diff = diffL2Square<uchar>;
+    else
+        CV_Error(CV_StsBadArg, "both images must have CV_32FC3 or CV_8UC3 type");
+
+    int l = c+1;
+    Rect roi(tls_[c], brs_[c]);
+
+    // offsets that convert union coordinates to coordinates of each image
+    int dx1 = unionTl_.x - tl1.x, dy1 = unionTl_.y - tl1.y;
+    int dx2 = unionTl_.x - tl2.x, dy2 = unionTl_.y - tl2.y;
+
+    const float badRegionCost = normL2(Point3f(255.f, 255.f, 255.f),
+                                       Point3f(0.f, 0.f, 0.f));
+
+    costV.create(roi.height, roi.width+1);
+
+    for (int y = roi.y; y < roi.br().y; ++y)
+    {
+        // the loop covers one extra column to the right of the bounding box
+        for (int x = roi.x; x < roi.br().x+1; ++x)
+        {
+            // The extra column equals labels_.cols when the component touches
+            // the right border of the union, so the range is checked before
+            // labels_ is read.
+            if (x > 0 && x < labels_.cols &&
+                labels_(y, x) == l && labels_(y, x-1) == l)
+            {
+                float costColor = (diff(image1, y + dy1, x + dx1 - 1, image2, y + dy2, x + dx2) +
+                                   diff(image1, y + dy1, x + dx1, image2, y + dy2, x + dx2 - 1)) / 2;
+                if (costFunc_ == COLOR)
+                    costV(y - roi.y, x - roi.x) = costColor;
+                else if (costFunc_ == COLOR_GRAD)
+                {
+                    float costGrad = std::abs(gradx1_(y + dy1, x + dx1)) + std::abs(gradx1_(y + dy1, x + dx1 - 1)) +
+                                     std::abs(gradx2_(y + dy2, x + dx2)) + std::abs(gradx2_(y + dy2, x + dx2 - 1)) + 1.f;
+                    costV(y - roi.y, x - roi.x) = costColor / costGrad;
+                }
+            }
+            else
+                costV(y - roi.y, x - roi.x) = badRegionCost;
+        }
+    }
+
+    costH.create(roi.height+1, roi.width);
+
+    // the loop covers one extra row below the bounding box
+    for (int y = roi.y; y < roi.br().y+1; ++y)
+    {
+        for (int x = roi.x; x < roi.br().x; ++x)
+        {
+            // Same guard as above for the extra row, which equals labels_.rows
+            // when the component touches the bottom border of the union.
+            if (y > 0 && y < labels_.rows &&
+                labels_(y, x) == l && labels_(y-1, x) == l)
+            {
+                float costColor = (diff(image1, y + dy1 - 1, x + dx1, image2, y + dy2, x + dx2) +
+                                   diff(image1, y + dy1, x + dx1, image2, y + dy2 - 1, x + dx2)) / 2;
+                if (costFunc_ == COLOR)
+                    costH(y - roi.y, x - roi.x) = costColor;
+                else if (costFunc_ == COLOR_GRAD)
+                {
+                    float costGrad = std::abs(grady1_(y + dy1, x + dx1)) + std::abs(grady1_(y + dy1 - 1, x + dx1)) +
+                                     std::abs(grady2_(y + dy2, x + dx2)) + std::abs(grady2_(y + dy2 - 1, x + dx2)) + 1.f;
+                    costH(y - roi.y, x - roi.x) = costColor / costGrad;
+                }
+            }
+            else
+                costH(y - roi.y, x - roi.x) = badRegionCost;
+        }
+    }
+}
+
+
+// Estimates a minimum-cost seam between p1 and p2 inside component c (which
+// must have the INTERS bit set) with dynamic programming.
+//
+// image1, image2, tl1, tl2 - forwarded to computeCosts()
+// c            - component index
+// p1, p2       - seam tips in union coordinates
+// seam         - output: seam points in union coordinates, from p1 to p2
+// isHorizontal - output: true when the tips differ more in x than in y; the
+//                seam then advances one column per step, otherwise one row
+// Returns false when the far tip cannot be reached from the near one.
+bool DpSeamFinder::estimateSeam(
+        const Mat &image1, const Mat &image2, Point tl1, Point tl2, int c,
+        Point p1, Point p2, vector<Point> &seam, bool &isHorizontal)
+{
+    CV_Assert(states_[c] & INTERS);
+
+    Mat_<float> costV, costH;
+    computeCosts(image1, image2, tl1, tl2, c, costV, costH);
+
+    // from here on src/dst are in coordinates of the component's bounding box
+    Rect roi(tls_[c], brs_[c]);
+    Point src = p1 - roi.tl();
+    Point dst = p2 - roi.tl();
+    int l = c+1;
+
+    // estimate seam direction
+
+    // src is made the tip with the smaller coordinate along the main direction;
+    // swapped remembers whether p1 and p2 changed roles
+    bool swapped = false;
+    isHorizontal = std::abs(dst.x - src.x) > std::abs(dst.y - src.y);
+
+    if (isHorizontal)
+    {
+        if (src.x > dst.x)
+        {
+            std::swap(src, dst);
+            swapped = true;
+        }
+    }
+    else if (src.y > dst.y)
+    {
+        swapped = true;
+        std::swap(src, dst);
+    }
+
+    // find optimal control
+
+    // control(y, x): which predecessor was chosen (1, 2 or 3, see below)
+    // reachable(y, x): non-zero when the cell can be reached from src
+    // cost(y, x): accumulated cost of the best path from src
+    Mat_<uchar> control = Mat::zeros(roi.size(), CV_8U);
+    Mat_<uchar> reachable = Mat::zeros(roi.size(), CV_8U);
+    Mat_<float> cost = Mat::zeros(roi.size(), CV_32F);
+
+    reachable(src) = 1;
+    cost(src) = 0.f;
+
+    int nsteps;
+    pair<float, int> steps[3];
+
+    if (isHorizontal)
+    {
+        for (int x = src.x+1; x <= dst.x; ++x)
+        {
+            for (int y = 0; y < roi.height; ++y)
+            {
+                // seam follows along upper side of pixels
+
+                nsteps = 0;
+
+                // candidates: 1 = same row, 2 = row above, 3 = row below,
+                // all taken from the previous column
+                if (labels_(y + roi.y, x + roi.x) == l)
+                {
+                    if (reachable(y, x-1))
+                        steps[nsteps++] = make_pair(cost(y, x-1) + costH(y, x-1), 1);
+                    if (y > 0 && reachable(y-1, x-1))
+                        steps[nsteps++] = make_pair(cost(y-1, x-1) + costH(y-1, x-1) + costV(y-1, x), 2);
+                    if (y < roi.height-1 && reachable(y+1, x-1))
+                        steps[nsteps++] = make_pair(cost(y+1, x-1) + costH(y+1, x-1) + costV(y, x), 3);
+                }
+
+                if (nsteps)
+                {
+                    // pairs compare by cost first, then by control value
+                    pair<float, int> opt = *min_element(steps, steps + nsteps);
+                    cost(y, x) = opt.first;
+                    control(y, x) = opt.second;
+                    reachable(y, x) = 255;
+                }
+            }
+        }
+    }
+    else
+    {
+        for (int y = src.y+1; y <= dst.y; ++y)
+        {
+            for (int x = 0; x < roi.width; ++x)
+            {
+                // seam follows along left side of pixels
+
+                nsteps = 0;
+
+                // candidates: 1 = same column, 2 = column to the left,
+                // 3 = column to the right, all taken from the previous row
+                if (labels_(y + roi.y, x + roi.x) == l)
+                {
+                    if (reachable(y-1, x))
+                        steps[nsteps++] = make_pair(cost(y-1, x) + costV(y-1, x), 1);
+                    if (x > 0 && reachable(y-1, x-1))
+                        steps[nsteps++] = make_pair(cost(y-1, x-1) + costV(y-1, x-1) + costH(y, x-1), 2);
+                    if (x < roi.width-1 && reachable(y-1, x+1))
+                        steps[nsteps++] = make_pair(cost(y-1, x+1) + costV(y-1, x+1) + costH(y, x), 3);
+                }
+
+                if (nsteps)
+                {
+                    pair<float, int> opt = *min_element(steps, steps + nsteps);
+                    cost(y, x) = opt.first;
+                    control(y, x) = opt.second;
+                    reachable(y, x) = 255;
+                }
+            }
+        }
+    }
+
+    if (!reachable(dst))
+        return false;
+
+    // restore seam
+
+    // walk back from dst to src following the stored control values
+    Point p = dst;
+    seam.clear();
+    seam.push_back(p + roi.tl());
+
+    if (isHorizontal)
+    {
+        for (; p.x != src.x; seam.push_back(p + roi.tl()))
+        {
+            if (control(p) == 2) p.y--;
+            else if (control(p) == 3) p.y++;
+            p.x--;
+        }
+    }
+    else
+    {
+        for (; p.y != src.y; seam.push_back(p + roi.tl()))
+        {
+            if (control(p) == 2) p.x--;
+            else if (control(p) == 3) p.x++;
+            p.y--;
+        }
+    }
+
+    // the seam was collected from dst to src; without a swap that is p2 -> p1,
+    // so it is reversed to run from p1 to p2
+    if (!swapped)
+        reverse(seam.begin(), seam.end());
+
+    CV_Assert(seam.front() == p1);
+    CV_Assert(seam.back() == p2);
+    return true;
+}
+
+
+// Cuts component c1 along the seam and relabels some of the resulting parts
+// as component c2.
+//
+// A local mask of the bounding box of c1 is built in which contour and seam
+// pixels act as barriers (value 255). The remaining pixels of c1 are split
+// into sub-components by flood fill; contour and seam pixels are then attached
+// to a neighboring sub-component (or reset to 0). A sub-component is relabeled
+// as c2 when more than 5% of the contour of c1 belongs to it and touches c2,
+// and less than 10% of the contour belongs to it and touches anything other
+// than c1 and c2.
+//
+// c1, c2           - component indices
+// seam             - seam points in union coordinates
+// isHorizontalSeam - direction reported by estimateSeam()
+void DpSeamFinder::updateLabelsUsingSeam(int c1, int c2, const vector<Point> &seam, bool isHorizontalSeam)
+{
+    Mat_<int> mask = Mat::zeros(brs_[c1].y - tls_[c1].y, brs_[c1].x - tls_[c1].x, CV_32S);
+
+    for (size_t i = 0; i < contours_[c1].size(); ++i)
+        mask(contours_[c1][i] - tls_[c1]) = 255;
+
+    for (size_t i = 0; i < seam.size(); ++i)
+        mask(seam[i] - tls_[c1]) = 255;
+
+    // find connected components after seam carving
+
+    int l1 = c1+1, l2 = c2+1;
+
+    int ncomps = 0;
+
+    for (int y = 0; y < mask.rows; ++y)
+        for (int x = 0; x < mask.cols; ++x)
+            if (!mask(y, x) && labels_(y + tls_[c1].y, x + tls_[c1].x) == l1)
+                floodFill(mask, Point(x, y), ++ncomps);
+
+    // attach every contour pixel to a sub-component found among its
+    // 8 neighbors, or reset it to 0 when there is none
+
+    for (size_t i = 0; i < contours_[c1].size(); ++i)
+    {
+        int x = contours_[c1][i].x - tls_[c1].x;
+        int y = contours_[c1][i].y - tls_[c1].y;
+
+        bool ok = false;
+        static const int dx[] = {-1, +1, 0, 0, -1, +1, -1, +1};
+        static const int dy[] = {0, 0, -1, +1, -1, -1, +1, +1};
+
+        for (int j = 0; j < 8; ++j)
+        {
+            int c = x + dx[j];
+            int r = y + dy[j];
+
+            if (c >= 0 && c < mask.cols && r >= 0 && r < mask.rows &&
+                mask(r, c) && mask(r, c) != 255)
+            {
+                ok = true;
+                mask(y, x) = mask(r, c);
+            }
+        }
+
+        if (!ok)
+            mask(y, x) = 0;
+    }
+
+    // attach every seam pixel to the sub-component below it (horizontal seam)
+    // or to the right of it (vertical seam), or reset it to 0
+
+    if (isHorizontalSeam)
+    {
+        for (size_t i = 0; i < seam.size(); ++i)
+        {
+            int x = seam[i].x - tls_[c1].x;
+            int y = seam[i].y - tls_[c1].y;
+
+            if (y < mask.rows-1 && mask(y+1, x) && mask(y+1, x) != 255)
+                mask(y, x) = mask(y+1, x);
+            else
+                mask(y, x) = 0;
+        }
+    }
+    else
+    {
+        for (size_t i = 0; i < seam.size(); ++i)
+        {
+            int x = seam[i].x - tls_[c1].x;
+            int y = seam[i].y - tls_[c1].y;
+
+            if (x < mask.cols-1 && mask(y, x+1) && mask(y, x+1) != 255)
+                mask(y, x) = mask(y, x+1);
+            else
+                mask(y, x) = 0;
+        }
+    }
+
+    // find new components connected with the second component and
+    // with other components except the ones we are working with
+
+    map<int, int> connect2;
+    map<int, int> connectOther;
+
+    for (int i = 1; i <= ncomps; ++i)
+    {
+        connect2.insert(make_pair(i, 0));
+        connectOther.insert(make_pair(i, 0));
+    }
+
+    for (size_t i = 0; i < contours_[c1].size(); ++i)
+    {
+        int x = contours_[c1][i].x;
+        int y = contours_[c1][i].y;
+
+        if ((x > 0 && labels_(y, x-1) == l2) ||
+            (y > 0 && labels_(y-1, x) == l2) ||
+            (x < unionSize_.width-1 && labels_(y, x+1) == l2) ||
+            (y < unionSize_.height-1 && labels_(y+1, x) == l2))
+        {
+            connect2[mask(y - tls_[c1].y, x - tls_[c1].x)]++;
+        }
+
+        if ((x > 0 && labels_(y, x-1) != l1 && labels_(y, x-1) != l2) ||
+            (y > 0 && labels_(y-1, x) != l1 && labels_(y-1, x) != l2) ||
+            (x < unionSize_.width-1 && labels_(y, x+1) != l1 && labels_(y, x+1) != l2) ||
+            (y < unionSize_.height-1 && labels_(y+1, x) != l1 && labels_(y+1, x) != l2))
+        {
+            connectOther[mask(y - tls_[c1].y, x - tls_[c1].x)]++;
+        }
+    }
+
+    vector<int> isAdjComp(ncomps + 1, 0);
+    const double len = static_cast<double>(contours_[c1].size());
+
+    for (map<int, int>::iterator itr = connect2.begin(); itr != connect2.end(); ++itr)
+    {
+        // connect2 may hold key 0 (a contour pixel that was reset to 0) for
+        // which connectOther has no entry, so the lookup result is checked;
+        // a missing entry counts as zero contacts.
+        map<int, int>::const_iterator other = connectOther.find(itr->first);
+        const int otherCount = other != connectOther.end() ? other->second : 0;
+
+        isAdjComp[itr->first] = itr->second / len > 0.05 && otherCount / len < 0.1;
+    }
+
+    // update labels
+
+    for (int y = 0; y < mask.rows; ++y)
+        for (int x = 0; x < mask.cols; ++x)
+            if (mask(y, x) && isAdjComp[mask(y, x)])
+                labels_(y + tls_[c1].y, x + tls_[c1].x) = l2;
+}
+
+
 class GraphCutSeamFinder::Impl : public PairwiseSeamFinder
 {
 public:
--- a/modules/ts/misc/run.py
+++ b/modules/ts/misc/run.py
@@ -56,6 +56,7 @@ parse_patterns = (
  {'name': "tests_dir",                'default': None,       'pattern': re.compile("^EXECUTABLE_OUTPUT_PATH:PATH=(.+)$")},
  {'name': "build_type",               'default': "Release",  'pattern': re.compile("^CMAKE_BUILD_TYPE:STRING=(.*)$")},
  {'name': "svnversion_path",          'default': None,       'pattern': re.compile("^SVNVERSION_PATH:FILEPATH=(.*)$")},
+  {'name': "git_executable",           'default': None,       'pattern': re.compile("^GIT_EXECUTABLE:FILEPATH=(.*)$")},
  {'name': "cxx_flags",                'default': "",         'pattern': re.compile("^CMAKE_CXX_FLAGS:STRING=(.*)$")},
  {'name': "cxx_flags_debug",          'default': "",         'pattern': re.compile("^CMAKE_CXX_FLAGS_DEBUG:STRING=(.*)$")},
  {'name': "cxx_flags_release",        'default': "",         'pattern': re.compile("^CMAKE_CXX_FLAGS_RELEASE:STRING=(.*)$")},
@@ -303,13 +304,15 @@ class RunInfo(object):
        # detect target arch
        if self.targetos == "android":
            if "armeabi-v7a" in self.android_abi:
-                self.targetarch = "ARMv7a"
+                self.targetarch = "armv7a"
            elif "armeabi-v6" in self.android_abi:
-                self.targetarch = "ARMv6"
+                self.targetarch = "armv6"
            elif "armeabi" in self.android_abi:
-                self.targetarch = "ARMv5te"
+                self.targetarch = "armv5te"
            elif "x86" in self.android_abi:
                self.targetarch = "x86"
+            elif "mips" in self.android_abi:
+                self.targetarch = "mips"
            else:
                self.targetarch = "ARM"
        elif self.is_x64 and hostmachine in ["AMD64", "x86_64"]:
@@ -327,20 +330,38 @@ class RunInfo(object):

        self.hardware = None

-        self.getSvnVersion(self.cmake_home, "cmake_home_svn")
+        self.cmake_home_vcver = self.getVCVersion(self.cmake_home)
        if self.opencv_home == self.cmake_home:
-            self.opencv_home_svn = self.cmake_home_svn
+            self.opencv_home_vcver = self.cmake_home_vcver
        else:
-            self.getSvnVersion(self.opencv_home, "opencv_home_svn")
+            self.opencv_home_vcver = self.getVCVersion(self.opencv_home)

        self.tests = self.getAvailableTestApps()

-    def getSvnVersion(self, path, name):
+    def getVCVersion(self, root_path):  # version-control revision of the tree at root_path, or None
+        if os.path.isdir(os.path.join(root_path, ".svn")):  # NOTE(review): no None guard here, unlike getSvnVersion/getGitHash - confirm callers never pass None
+            return self.getSvnVersion(root_path)
+        elif os.path.isdir(os.path.join(root_path, ".git")):  # .git is consulted only when no .svn directory exists
+            return self.getGitHash(root_path)
+        return None  # root_path has neither a .svn nor a .git directory directly inside it
+
+    def getGitHash(self, path):  # output of "git rev-parse --short HEAD" run in path, or None on any failure
+        if not path or not self.git_executable:  # nothing to query, or no git executable configured
+            return None
+        try:
+            output = Popen([self.git_executable, "rev-parse", "--short", "HEAD"], stdout=PIPE, stderr=PIPE, cwd = path).communicate()  # (stdout, stderr) pair; git runs with path as its working directory
+            if not output[1]:  # empty stderr is the success criterion; the exit code is not inspected
+                return output[0].strip()  # drop surrounding whitespace, including the trailing newline
+            else:
+                return None
+        except OSError:  # OSError raised while launching or communicating with git
+            return None
+
+    def getSvnVersion(self, path):
        if not path:
-            setattr(self, name, None)
-            return
-        if not self.svnversion_path and hostos == 'nt':
-            self.tryGetSvnVersionWithTortoise(path, name)
+            val = None
+        elif not self.svnversion_path and hostos == 'nt':
+            val = self.tryGetSvnVersionWithTortoise(path)
        else:
            svnversion = self.svnversion_path
            if not svnversion:
@@ -348,13 +369,16 @@ class RunInfo(object):
            try:
                output = Popen([svnversion, "-n", path], stdout=PIPE, stderr=PIPE).communicate()
                if not output[1]:
-                    setattr(self, name, output[0])
+                    val = output[0]
                else:
-                    setattr(self, name, None)
+                    val = None
            except OSError:
-                setattr(self, name, None)
+                val = None
+        if val:
+            val = val.replace(" ", "_")
+        return val

-    def tryGetSvnVersionWithTortoise(self, path, name):
+    def tryGetSvnVersionWithTortoise(self, path):
        try:
            wcrev = "SubWCRev.exe"
            dir = tempfile.mkdtemp()
@@ -371,9 +395,9 @@ class RunInfo(object):
                tmpfile = open(tmpfilename2, "r")
                version = tmpfile.read()
                tmpfile.close()
-            setattr(self, name, version)
+            return version
        except:
-            setattr(self, name, None)
+            return None
        finally:
            if dir:
                shutil.rmtree(dir)
@@ -406,13 +430,13 @@ class RunInfo(object):
        if app.startswith(self.nameprefix):
            app = app[len(self.nameprefix):]

-        if self.cmake_home_svn:
-            if self.cmake_home_svn == self.opencv_home_svn:
-                rev = self.cmake_home_svn
-            elif self.opencv_home_svn:
-                rev = self.cmake_home_svn + "-" + self.opencv_home_svn
+        if self.cmake_home_vcver:
+            if self.cmake_home_vcver == self.opencv_home_vcver:
+                rev = self.cmake_home_vcver
+            elif self.opencv_home_vcver:
+                rev = self.cmake_home_vcver + "-" + self.opencv_home_vcver
            else:
-                rev = self.cmake_home_svn
+                rev = self.cmake_home_vcver
        else:
            rev = None
        if rev:
@@ -484,7 +508,6 @@ class RunInfo(object):
                    else:
                        prev_option = prev_option + " " + opt
                options.append(tmpfile[1])
-                print options
                output = Popen(options, stdout=PIPE, stderr=PIPE).communicate()
                compiler_output = output[1]
                os.remove(tmpfile[1])
@@ -506,7 +529,7 @@ class RunInfo(object):
                hw = "CUDA_"
            else:
                hw = ""
-            tstamp = timestamp.strftime("%Y-%m-%d--%H-%M-%S")
+            tstamp = timestamp.strftime("%Y%m%d-%H%M%S")
            return "%s_%s_%s_%s%s%s.xml" % (app, self.targetos, self.targetarch, hw, rev, tstamp)

    def getTest(self, name):
--- a/modules/video/src/bgfg_gmg.cpp
+++ b/modules/video/src/bgfg_gmg.cpp
@@ -440,8 +440,7 @@ bool BackgroundSubtractorGMG::HistogramFeatureGMG::operator ==(HistogramFeatureG
    std::vector<size_t>::iterator color_a;
    std::vector<size_t>::iterator color_b;
    std::vector<size_t>::iterator color_a_end = this->color.end();
-    std::vector<size_t>::iterator color_b_end = rhs.color.end();
-    for (color_a = color.begin(),color_b =rhs.color.begin();color_a!=color_a_end;++color_a,++color_b)
+    for (color_a = color.begin(), color_b = rhs.color.begin(); color_a != color_a_end; ++color_a, ++color_b)
    {
        if (*color_a != *color_b)
        {