Merge branch '2.4'

2013-04-05 19:52:42 +04:00
parent 3785439917 74e5ff2ec7
commit 67073daf19
154 changed files with 11355 additions and 16795 deletions
--- a/modules/calib3d/test/test_solvepnp_ransac.cpp
+++ b/modules/calib3d/test/test_solvepnp_ransac.cpp
@@ -239,7 +239,7 @@ protected:
    }
 };

-TEST(Calib3d_SolvePnPRansac, accuracy) { CV_solvePnPRansac_Test test; test.safe_run(); }
+TEST(DISABLED_Calib3d_SolvePnPRansac, accuracy) { CV_solvePnPRansac_Test test; test.safe_run(); }
 TEST(Calib3d_SolvePnP, accuracy) { CV_solvePnP_Test test; test.safe_run(); }


--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -460,14 +460,29 @@ void CV_StereoMatchingTest::run(int)
            continue;
        }
        int dispScaleFactor = datasetsParams[datasetName].dispScaleFactor;
-        Mat tmp; trueLeftDisp.convertTo( tmp, CV_32FC1, 1.f/dispScaleFactor ); trueLeftDisp = tmp; tmp.release();
+        Mat tmp;
+
+        trueLeftDisp.convertTo( tmp, CV_32FC1, 1.f/dispScaleFactor );
+        trueLeftDisp = tmp;
+        tmp.release();
+
        if( !trueRightDisp.empty() )
-            trueRightDisp.convertTo( tmp, CV_32FC1, 1.f/dispScaleFactor ); trueRightDisp = tmp; tmp.release();
+        {
+            trueRightDisp.convertTo( tmp, CV_32FC1, 1.f/dispScaleFactor );
+            trueRightDisp = tmp;
+            tmp.release();
+        }

        Mat leftDisp, rightDisp;
        int ignBorder = max(runStereoMatchingAlgorithm(leftImg, rightImg, leftDisp, rightDisp, ci), EVAL_IGNORE_BORDER);
-        leftDisp.convertTo( tmp, CV_32FC1 ); leftDisp = tmp; tmp.release();
-        rightDisp.convertTo( tmp, CV_32FC1 ); rightDisp = tmp; tmp.release();
+
+        leftDisp.convertTo( tmp, CV_32FC1 );
+        leftDisp = tmp;
+        tmp.release();
+
+        rightDisp.convertTo( tmp, CV_32FC1 );
+        rightDisp = tmp;
+        tmp.release();

        int tempCode = processStereoMatchingResults( resFS, ci, isWrite,
                   leftImg, rightImg, trueLeftDisp, trueRightDisp, leftDisp, rightDisp, QualityEvalParams(ignBorder));
@@ -531,7 +546,8 @@ int CV_StereoMatchingTest::processStereoMatchingResults( FileStorage& fs, int ca
    // rightDisp is not used in current test virsion
    int code = cvtest::TS::OK;
    assert( fs.isOpened() );
-    assert( trueLeftDisp.type() == CV_32FC1 && trueRightDisp.type() == CV_32FC1 );
+    assert( trueLeftDisp.type() == CV_32FC1 );
+    assert( trueRightDisp.empty() || trueRightDisp.type() == CV_32FC1 );
    assert( leftDisp.type() == CV_32FC1 && rightDisp.type() == CV_32FC1 );

    // get masks for unknown ground truth disparity values
--- a/modules/contrib/doc/facerec/facerec_tutorial.rst
+++ b/modules/contrib/doc/facerec/facerec_tutorial.rst
@@ -7,7 +7,7 @@ Face Recognition with OpenCV
 Introduction
 ============

-`OpenCV (Open Source Computer Vision) <http://opencv.willowgarage.com>`_ is a popular computer vision library started by `Intel <http://www.intel.com>`_ in 1999. The cross-platform library sets its focus on real-time image processing and includes patent-free implementations of the latest computer vision algorithms. In 2008 `Willow Garage <http://www.willowgarage.com>`_ took over support and OpenCV 2.3.1 now comes with a programming interface to C, C++, `Python <http://www.python.org>`_ and `Android <http://www.android.com>`_. OpenCV is released under a BSD license so it is used in academic projects and commercial products alike.
+`OpenCV (Open Source Computer Vision) <http://opencv.org>`_ is a popular computer vision library started by `Intel <http://www.intel.com>`_ in 1999. The cross-platform library sets its focus on real-time image processing and includes patent-free implementations of the latest computer vision algorithms. In 2008 `Willow Garage <http://www.willowgarage.com>`_ took over support and OpenCV 2.3.1 now comes with a programming interface to C, C++, `Python <http://www.python.org>`_ and `Android <http://www.android.com>`_. OpenCV is released under a BSD license so it is used in academic projects and commercial products alike.

 OpenCV 2.4 now comes with the very new :ocv:class:`FaceRecognizer` class for face recognition, so you can start experimenting with face recognition right away. This document is the guide I've wished for, when I was working myself into face recognition. It shows you how to perform face recognition with :ocv:class:`FaceRecognizer` in OpenCV (with full source code listings) and gives you an introduction into the algorithms behind. I'll also show how to create the visualizations you can find in many publications, because a lot of people asked for.

--- a/modules/contrib/doc/facerec/src/CMakeLists.txt
+++ b/modules/contrib/doc/facerec/src/CMakeLists.txt
@@ -6,7 +6,7 @@ project(facerec_cpp_samples)
 #SET(OpenCV_DIR /path/to/your/opencv/installation)

 # packages
-find_package(OpenCV REQUIRED) # http://opencv.willowgarage.com
+find_package(OpenCV REQUIRED) # http://opencv.org

 # probably you should loop through the sample files here
 add_executable(facerec_demo facerec_demo.cpp)
--- a/modules/contrib/include/opencv2/contrib/contrib.hpp
+++ b/modules/contrib/include/opencv2/contrib/contrib.hpp
@@ -45,4 +45,4 @@
 #error this is a compatibility header which should not be used inside the OpenCV library
 #endif

-#include "opencv2/contrib.hpp"
+#include "opencv2/contrib.hpp"
--- a/modules/contrib/src/ba.cpp
+++ b/modules/contrib/src/ba.cpp
@@ -1106,7 +1106,7 @@ void LevMarqSparse::bundleAdjust( std::vector<Point3d>& points, //positions of p
    Mat rot_vec = levmarP.rowRange(i*num_cam_param, i*num_cam_param+3);
    Rodrigues( rot_vec, R[i] );
    //translation
-    T[i] = levmarP.rowRange(i*num_cam_param + 3, i*num_cam_param+6);
+    levmarP.rowRange(i*num_cam_param + 3, i*num_cam_param+6).copyTo(T[i]);

    //intrinsic camera matrix
    double* intr_data = (double*)cameraMatrix[i].data;
--- a/modules/contrib/src/fuzzymeanshifttracker.cpp
+++ b/modules/contrib/src/fuzzymeanshifttracker.cpp
@@ -380,6 +380,7 @@ void CvFuzzyMeanShiftTracker::SearchWindow::initDepthValues(IplImage *maskImage,
                {
                    if (*depthData)
                    {
+                        d = *depthData;
                        m1 += d;
                        if (d < mind)
                            mind = d;
--- a/modules/core/doc/intro.rst
+++ b/modules/core/doc/intro.rst
@@ -4,7 +4,7 @@ Introduction

 .. highlight:: cpp

-OpenCV (Open Source Computer Vision Library: http://opencv.willowgarage.com/wiki/) is an open-source BSD-licensed library that includes several hundreds of computer vision algorithms. The document describes the so-called OpenCV 2.x API, which is essentially a C++ API, as opposite to the C-based OpenCV 1.x API. The latter is described in opencv1x.pdf.
+OpenCV (Open Source Computer Vision Library: http://opencv.org) is an open-source BSD-licensed library that includes several hundreds of computer vision algorithms. The document describes the so-called OpenCV 2.x API, which is essentially a C++ API, as opposite to the C-based OpenCV 1.x API. The latter is described in opencv1x.pdf.

 OpenCV has a modular structure, which means that the package includes several shared or static libraries. The following modules are available:

--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -45,4 +45,4 @@
 #error this is a compatibility header which should not be used inside the OpenCV library
 #endif

-#include "opencv2/core.hpp"
+#include "opencv2/core.hpp"
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -360,6 +360,8 @@ CV_INLINE int cvRound( double value )
        fistp t;
    }
    return t;
+#elif defined _MSC_VER && defined _M_ARM && defined HAVE_TEGRA_OPTIMIZATION
+    TEGRA_ROUND(value);
 #elif defined HAVE_LRINT || defined CV_ICC || defined __GNUC__
 #  ifdef HAVE_TEGRA_OPTIMIZATION
    TEGRA_ROUND(value);
@@ -367,8 +369,12 @@ CV_INLINE int cvRound( double value )
    return (int)lrint(value);
 #  endif
 #else
-    // while this is not IEEE754-compliant rounding, it's usually a good enough approximation
-    return (int)(value + (value >= 0 ? 0.5 : -0.5));
+    double intpart, fractpart;
+    fractpart = modf(value, &intpart);
+    if ((fabs(fractpart) != 0.5) || ((((int)intpart) % 2) != 0))
+        return (int)(value + (value >= 0 ? 0.5 : -0.5));
+    else
+        return (int)intpart;
 #endif
 }

--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -1704,6 +1704,7 @@ public:
    SparseMatConstIterator_();
    //! the full constructor setting the iterator to the first sparse matrix element
    SparseMatConstIterator_(const SparseMat_<_Tp>* _m);
+    SparseMatConstIterator_(const SparseMat* _m);
    //! the copy constructor
    SparseMatConstIterator_(const SparseMatConstIterator_& it);

@@ -1740,6 +1741,7 @@ public:
    SparseMatIterator_();
    //! the full constructor setting the iterator to the first sparse matrix element
    SparseMatIterator_(SparseMat_<_Tp>* _m);
+    SparseMatIterator_(SparseMat* _m);
    //! the copy constructor
    SparseMatIterator_(const SparseMatIterator_& it);

--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -2587,6 +2587,13 @@ SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMat_<_Tp>* _m)
    : SparseMatConstIterator(_m)
 {}

+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMat* _m)
+    : SparseMatConstIterator(_m)
+{
+    CV_Assert( _m->type() == DataType<_Tp>::type );
+}
+
 template<typename _Tp> inline
 SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMatConstIterator_<_Tp>& it)
    : SparseMatConstIterator(it)
@@ -2634,6 +2641,11 @@ SparseMatIterator_<_Tp>::SparseMatIterator_(SparseMat_<_Tp>* _m)
    : SparseMatConstIterator_<_Tp>(_m)
 {}

+template<typename _Tp> inline
+SparseMatIterator_<_Tp>::SparseMatIterator_(SparseMat* _m)
+    : SparseMatConstIterator_<_Tp>(_m)
+{}
+
 template<typename _Tp> inline
 SparseMatIterator_<_Tp>::SparseMatIterator_(const SparseMatIterator_<_Tp>& it)
    : SparseMatConstIterator_<_Tp>(it)
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -577,10 +577,10 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
                    continue;

                p *= 2;
-                double beta = a - b, gamma = hypot((double)p, beta), delta;
+                double beta = a - b, gamma = hypot((double)p, beta);
                if( beta < 0 )
                {
-                    delta = (gamma - beta)*0.5;
+                    double delta = (gamma - beta)*0.5;
                    s = (_Tp)std::sqrt(delta/gamma);
                    c = (_Tp)(p/(gamma*s*2));
                }
@@ -588,36 +588,18 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
                {
                    c = (_Tp)std::sqrt((gamma + beta)/(gamma*2));
                    s = (_Tp)(p/(gamma*c*2));
-                    delta = p*p*0.5/(gamma + beta);
                }

-                W[i] += delta;
-                W[j] -= delta;
-
-                if( iter % 2 != 0 && W[i] > 0 && W[j] > 0 )
+                a = b = 0;
+                for( k = 0; k < m; k++ )
                {
-                    k = vblas.givens(Ai, Aj, m, c, s);
+                    _Tp t0 = c*Ai[k] + s*Aj[k];
+                    _Tp t1 = -s*Ai[k] + c*Aj[k];
+                    Ai[k] = t0; Aj[k] = t1;

-                    for( ; k < m; k++ )
-                    {
-                        _Tp t0 = c*Ai[k] + s*Aj[k];
-                        _Tp t1 = -s*Ai[k] + c*Aj[k];
-                        Ai[k] = t0; Aj[k] = t1;
-                    }
-                }
-                else
-                {
-                    a = b = 0;
-                    for( k = 0; k < m; k++ )
-                    {
-                        _Tp t0 = c*Ai[k] + s*Aj[k];
-                        _Tp t1 = -s*Ai[k] + c*Aj[k];
-                        Ai[k] = t0; Aj[k] = t1;
-
-                        a += (double)t0*t0; b += (double)t1*t1;
-                    }
-                    W[i] = a; W[j] = b;
+                    a += (double)t0*t0; b += (double)t1*t1;
                }
+                W[i] = a; W[j] = b;

                changed = true;

--- a/modules/core/src/matop.cpp
+++ b/modules/core/src/matop.cpp
@@ -324,7 +324,7 @@ void MatOp::augAssignXor(const MatExpr& expr, Mat& m) const
 {
    Mat temp;
    expr.op->assign(expr, temp);
-    m /= temp;
+    m ^= temp;
 }


--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -183,7 +183,7 @@ static void finalizeHdr(Mat& m)
 void Mat::create(int d, const int* _sizes, int _type)
 {
    int i;
-    CV_Assert(0 <= d && _sizes && d <= CV_MAX_DIM && _sizes);
+    CV_Assert(0 <= d && d <= CV_MAX_DIM && _sizes);
    _type = CV_MAT_TYPE(_type);

    if( data && (d == dims || (d == 1 && dims <= 2)) && _type == type() )
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -1551,3 +1551,16 @@ TEST(Core_Add, AddToColumnWhen4Rows)

    ASSERT_EQ(0, countNonZero(m1 - m2));
 }
+
+TEST(Core_round, CvRound)
+{
+    ASSERT_EQ(2, cvRound(2.0));
+    ASSERT_EQ(2, cvRound(2.1));
+    ASSERT_EQ(-2, cvRound(-2.1));
+    ASSERT_EQ(3, cvRound(2.8));
+    ASSERT_EQ(-3, cvRound(-2.8));
+    ASSERT_EQ(2, cvRound(2.5));
+    ASSERT_EQ(4, cvRound(3.5));
+    ASSERT_EQ(-2, cvRound(-2.5));
+    ASSERT_EQ(-4, cvRound(-3.5));
+}
--- a/modules/features2d/doc/feature_detection_and_description.rst
+++ b/modules/features2d/doc/feature_detection_and_description.rst
@@ -48,7 +48,7 @@ Maximally stable extremal region extractor. ::
    };

 The class encapsulates all the parameters of the MSER extraction algorithm (see
-http://en.wikipedia.org/wiki/Maximally_stable_extremal_regions). Also see http://opencv.willowgarage.com/wiki/documentation/cpp/features2d/MSER for useful comments and parameters description.
+http://en.wikipedia.org/wiki/Maximally_stable_extremal_regions). Also see http://code.opencv.org/projects/opencv/wiki/MSER for useful comments and parameters description.


 ORB
--- a/modules/features2d/src/keypoint.cpp
+++ b/modules/features2d/src/keypoint.cpp
@@ -69,7 +69,7 @@ struct KeypointResponseGreater
 void KeyPointsFilter::retainBest(std::vector<KeyPoint>& keypoints, int n_points)
 {
    //this is only necessary if the keypoints size is greater than the number of desired points.
-    if( n_points > 0 && keypoints.size() > (size_t)n_points )
+    if( n_points >= 0 && keypoints.size() > (size_t)n_points )
    {
        if (n_points==0)
        {
--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -421,7 +421,6 @@ struct Hamming
    ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
    {
        ResultType result = 0;
-#ifdef __GNUC__
 #ifdef __ARM_NEON__
        {
            uint32x4_t bits = vmovq_n_u32(0);
@@ -438,7 +437,7 @@ struct Hamming
            result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
            result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
        }
-#else
+#elif __GNUC__
        {
            //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
            typedef unsigned long long pop_t;
@@ -458,8 +457,8 @@ struct Hamming
                result += __builtin_popcountll(a_final ^ b_final);
            }
        }
-#endif //NEON
-#else
+#else // NO NEON and NOT GNUC
+        typedef unsigned long long pop_t;
        HammingLUT lut;
        result = lut(reinterpret_cast<const unsigned char*> (a),
                     reinterpret_cast<const unsigned char*> (b), size * sizeof(pop_t));
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -29,8 +29,6 @@ if(HAVE_CUDA)
  source_group("Src\\NVidia" FILES ${ncv_files})
  ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS})
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter /wd4211 /wd4201 /wd4100 /wd4505 /wd4408)
-  string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")

  if(MSVC)
    if(NOT ENABLE_NOISY_WARNINGS)
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
@@ -1007,7 +1007,7 @@ PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG,

 #if defined(HAVE_NVCUVID) && BUILD_WITH_VIDEO_INPUT_SUPPORT

-PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
+PERF_TEST_P(Video, DISABLED_Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
 {
    declare.time(20);

@@ -1044,7 +1044,7 @@ PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video

 #if defined(HAVE_NVCUVID) && defined(WIN32)

-PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
+PERF_TEST_P(Video, DISABLED_Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
 {
    declare.time(30);

--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -1793,10 +1793,10 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)

 namespace arithm
 {
-    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+    void cmpMatEq_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void cmpMatNe_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void cmpMatLt_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void cmpMatLe_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);

    template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1820,7 +1820,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
        {cmpMatEq<double>        , cmpMatNe<double>        , cmpMatLt<double>        , cmpMatLe<double>        }
    };

-    typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+    typedef void (*func_v4_t)(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    static const func_v4_t funcs_v4[] =
    {
        cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
--- a/modules/gpu/src/tvl1flow.cpp
+++ b/modules/gpu/src/tvl1flow.cpp
@@ -129,6 +129,17 @@ void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat& I0, const GpuM
            gpu::multiply(u1s[s], Scalar::all(0.5), u1s[s]);
            gpu::multiply(u2s[s], Scalar::all(0.5), u2s[s]);
        }
+        else
+        {
+            u1s[s].create(I0s[s].size(), CV_32FC1);
+            u2s[s].create(I0s[s].size(), CV_32FC1);
+        }
+    }
+
+    if (!useInitialFlow)
+    {
+        u1s[nscales-1].setTo(Scalar::all(0));
+        u2s[nscales-1].setTo(Scalar::all(0));
    }

    // pyramidal structure for computing the optical flow
@@ -173,18 +184,9 @@ void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const Gpu

    CV_DbgAssert( I1.size() == I0.size() );
    CV_DbgAssert( I1.type() == I0.type() );
-    CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
+    CV_DbgAssert( u1.size() == I0.size() );
    CV_DbgAssert( u2.size() == u1.size() );

-    if (u1.empty())
-    {
-        u1.create(I0.size(), CV_32FC1);
-        u1.setTo(Scalar::all(0));
-
-        u2.create(I0.size(), CV_32FC1);
-        u2.setTo(Scalar::all(0));
-    }
-
    GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
    GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
    centeredGradient(I1, I1x, I1y);
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -95,7 +95,7 @@ if(HAVE_QT)
  if(${_have_flag})
    set_source_files_properties(${_RCC_OUTFILES} PROPERTIES COMPILE_FLAGS -Wno-missing-declarations)
  endif()
-elseif(WIN32)
+elseif(HAVE_WIN32UI)
  list(APPEND highgui_srcs src/window_w32.cpp)
 elseif(HAVE_GTK)
  list(APPEND highgui_srcs src/window_gtk.cpp)
@@ -111,9 +111,21 @@ elseif(APPLE)
  endif()
 endif()

-if(WIN32)
-  list(APPEND highgui_srcs src/cap_vfw.cpp src/cap_cmu.cpp src/cap_dshow.cpp)
-endif(WIN32)
+if(WIN32 AND NOT ARM)
+  list(APPEND highgui_srcs src/cap_cmu.cpp)
+endif()
+
+if (WIN32 AND HAVE_DSHOW)
+  list(APPEND highgui_srcs src/cap_dshow.cpp)
+endif()
+
+if (WIN32 AND HAVE_MSMF)
+  list(APPEND highgui_srcs src/cap_msmf.cpp)
+endif()
+
+if (WIN32 AND HAVE_VFW)
+  list(APPEND highgui_srcs src/cap_vfw.cpp)
+endif()

 if(HAVE_XINE)
  list(APPEND highgui_srcs src/cap_xine.cpp)
--- a/modules/highgui/include/opencv2/highgui/highgui_c.h
+++ b/modules/highgui/include/opencv2/highgui/highgui_c.h
@@ -298,6 +298,7 @@ enum
    CV_CAP_UNICAP   =600,   // Unicap drivers

    CV_CAP_DSHOW    =700,   // DirectShow (via videoInput)
+    CV_CAP_MSMF     =1400,  // Microsoft Media Foundation (via videoInput)

    CV_CAP_PVAPI    =800,   // PvAPI, Prosilica GigE SDK

--- a/modules/highgui/perf/perf_precomp.hpp
+++ b/modules/highgui/perf/perf_precomp.hpp
@@ -20,9 +20,9 @@
    defined(HAVE_GSTREAMER)    || \
    defined(HAVE_QUICKTIME)    || \
    defined(HAVE_AVFOUNDATION) || \
-    /*defined(HAVE_OPENNI)     || too specialized */ \
    defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
+    defined(HAVE_VFW)
+    /* defined(HAVE_OPENNI) is deliberately not checked: too specialized */

 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
 #else
@@ -34,7 +34,7 @@
    defined(HAVE_QUICKTIME)    || \
    defined(HAVE_AVFOUNDATION) || \
    defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
+    defined(HAVE_VFW)
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 1
 #else
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 0
--- a/modules/highgui/src/cap.cpp
+++ b/modules/highgui/src/cap.cpp
@@ -114,7 +114,7 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
 {
    int  domains[] =
    {
-#ifdef HAVE_VIDEOINPUT
+#ifdef HAVE_DSHOW
        CV_CAP_DSHOW,
 #endif
 #if 1
@@ -168,7 +168,8 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
    // try every possibly installed camera API
    for (int i = 0; domains[i] >= 0; i++)
    {
-#if defined(HAVE_VIDEOINPUT)   || \
+#if defined(HAVE_DSHOW)        || \
+    defined(HAVE_MSMF)         || \
    defined(HAVE_TYZX)         || \
    defined(HAVE_VFW)          || \
    defined(HAVE_LIBV4L)       || \
@@ -195,11 +196,18 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)

        switch (domains[i])
        {
-#ifdef HAVE_VIDEOINPUT
+#ifdef HAVE_MSMF
+        case CV_CAP_MSMF:
+             capture = cvCreateCameraCapture_MSMF (index);
+             if (capture)
+                 return capture;
+            break;
+#endif
+#ifdef HAVE_DSHOW
        case CV_CAP_DSHOW:
-            capture = cvCreateCameraCapture_DShow (index);
-            if (capture)
-                return capture;
+             capture = cvCreateCameraCapture_DShow (index);
+             if (capture)
+                 return capture;
            break;
 #endif

--- a/modules/highgui/src/cap_dshow.cpp
+++ b/modules/highgui/src/cap_dshow.cpp
@@ -41,7 +41,7 @@

 #include "precomp.hpp"

-#if (defined WIN32 || defined _WIN32) && defined HAVE_VIDEOINPUT
+#if (defined WIN32 || defined _WIN32) && defined HAVE_DSHOW

 /*
   DirectShow-based Video Capturing module is based on
@@ -3098,6 +3098,7 @@ HRESULT videoInput::routeCrossbar(ICaptureGraphBuilder2 **ppBuild, IBaseFilter *
    return hr;
 }

+
 /********************* Capturing video from camera via DirectShow *********************/

 class CvCaptureCAM_DShow : public CvCapture
--- a/modules/highgui/src/cap_ffmpeg.cpp
+++ b/modules/highgui/src/cap_ffmpeg.cpp
@@ -209,7 +209,7 @@ CvCapture* cvCreateFileCapture_FFMPEG_proxy(const char * filename)
    if( result->open( filename ))
        return result;
    delete result;
-#if defined WIN32 || defined _WIN32
+#ifdef HAVE_VFW
    return cvCreateFileCapture_VFW(filename);
 #else
    return 0;
@@ -263,9 +263,9 @@ CvVideoWriter* cvCreateVideoWriter_FFMPEG_proxy( const char* filename, int fourc
    if( result->open( filename, fourcc, fps, frameSize, isColor != 0 ))
        return result;
    delete result;
-#if defined WIN32 || defined _WIN32
-    return cvCreateVideoWriter_VFW(filename, fourcc, fps, frameSize, isColor);
-#else
+#ifdef HAVE_VFW
+     return cvCreateVideoWriter_VFW(filename, fourcc, fps, frameSize, isColor);
+ #else
    return 0;
 #endif
 }
--- a/modules/highgui/src/cap_ffmpeg_impl.hpp
+++ b/modules/highgui/src/cap_ffmpeg_impl.hpp
@@ -153,6 +153,14 @@ extern "C" {
 #define AVERROR_EOF (-MKTAG( 'E','O','F',' '))
 #endif

+#if LIBAVCODEC_BUILD >= CALC_FFMPEG_VERSION(54,25,0)
+#  define CV_CODEC_ID AVCodecID
+#  define CV_CODEC(name) AV_##name
+#else
+#  define CV_CODEC_ID CodecID
+#  define CV_CODEC(name) name
+#endif
+
 static int get_number_of_cpus(void)
 {
 #if LIBAVFORMAT_BUILD < CALC_FFMPEG_VERSION(52, 111, 0)
@@ -1026,7 +1034,7 @@ static const char * icvFFMPEGErrStr(int err)

 /* function internal to FFMPEG (libavformat/riff.c) to lookup codec id by fourcc tag*/
 extern "C" {
-    enum CodecID codec_get_bmp_id(unsigned int tag);
+    enum CV_CODEC_ID codec_get_bmp_id(unsigned int tag);
 }

 void CvVideoWriter_FFMPEG::init()
@@ -1078,7 +1086,7 @@ static AVFrame * icv_alloc_picture_FFMPEG(int pix_fmt, int width, int height, bo

 /* add a video output stream to the container */
 static AVStream *icv_add_video_stream_FFMPEG(AVFormatContext *oc,
-                                             CodecID codec_id,
+                                             CV_CODEC_ID codec_id,
                                             int w, int h, int bitrate,
                                             double fps, int pixel_format)
 {
@@ -1110,7 +1118,7 @@ static AVStream *icv_add_video_stream_FFMPEG(AVFormatContext *oc,
    c->codec_id = oc->oformat->video_codec;
 #endif

-    if(codec_id != CODEC_ID_NONE){
+    if(codec_id != CV_CODEC(CODEC_ID_NONE)){
        c->codec_id = codec_id;
    }

@@ -1179,10 +1187,10 @@ static AVStream *icv_add_video_stream_FFMPEG(AVFormatContext *oc,
    c->gop_size = 12; /* emit one intra frame every twelve frames at most */
    c->pix_fmt = (PixelFormat) pixel_format;

-    if (c->codec_id == CODEC_ID_MPEG2VIDEO) {
+    if (c->codec_id == CV_CODEC(CODEC_ID_MPEG2VIDEO)) {
        c->max_b_frames = 2;
    }
-    if (c->codec_id == CODEC_ID_MPEG1VIDEO || c->codec_id == CODEC_ID_MSMPEG4V3){
+    if (c->codec_id == CV_CODEC(CODEC_ID_MPEG1VIDEO) || c->codec_id == CV_CODEC(CODEC_ID_MSMPEG4V3)){
        /* needed to avoid using macroblocks in which some coeffs overflow
           this doesnt happen with normal video, it just happens here as the
           motion of the chroma plane doesnt match the luma plane */
@@ -1290,7 +1298,7 @@ bool CvVideoWriter_FFMPEG::writeFrame( const unsigned char* data, int step, int

 #if LIBAVFORMAT_BUILD < 5231
    // It is not needed in the latest versions of the ffmpeg
-    if( c->codec_id == CODEC_ID_RAWVIDEO && origin != 1 )
+    if( c->codec_id == CV_CODEC(CODEC_ID_RAWVIDEO) && origin != 1 )
    {
        if( !temp_image.data )
        {
@@ -1477,7 +1485,7 @@ void CvVideoWriter_FFMPEG::close()
 bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
                                 double fps, int width, int height, bool is_color )
 {
-    CodecID codec_id = CODEC_ID_NONE;
+    CV_CODEC_ID codec_id = CV_CODEC(CODEC_ID_NONE);
    int err, codec_pix_fmt;
    double bitrate_scale = 1;

@@ -1518,11 +1526,11 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,

    /* Lookup codec_id for given fourcc */
 #if LIBAVCODEC_VERSION_INT<((51<<16)+(49<<8)+0)
-    if( (codec_id = codec_get_bmp_id( fourcc )) == CODEC_ID_NONE )
+    if( (codec_id = codec_get_bmp_id( fourcc )) == CV_CODEC(CODEC_ID_NONE) )
        return false;
 #else
    const struct AVCodecTag * tags[] = { codec_bmp_tags, NULL};
-    if( (codec_id = av_codec_get_id(tags, fourcc)) == CODEC_ID_NONE )
+    if( (codec_id = av_codec_get_id(tags, fourcc)) == CV_CODEC(CODEC_ID_NONE) )
        return false;
 #endif

@@ -1544,20 +1552,20 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
    // set a few optimal pixel formats for lossless codecs of interest..
    switch (codec_id) {
 #if LIBAVCODEC_VERSION_INT>((50<<16)+(1<<8)+0)
-    case CODEC_ID_JPEGLS:
+    case CV_CODEC(CODEC_ID_JPEGLS):
        // BGR24 or GRAY8 depending on is_color...
        codec_pix_fmt = input_pix_fmt;
        break;
 #endif
-    case CODEC_ID_HUFFYUV:
+    case CV_CODEC(CODEC_ID_HUFFYUV):
        codec_pix_fmt = PIX_FMT_YUV422P;
        break;
-    case CODEC_ID_MJPEG:
-    case CODEC_ID_LJPEG:
+    case CV_CODEC(CODEC_ID_MJPEG):
+    case CV_CODEC(CODEC_ID_LJPEG):
        codec_pix_fmt = PIX_FMT_YUVJ420P;
        bitrate_scale = 3;
        break;
-    case CODEC_ID_RAWVIDEO:
+    case CV_CODEC(CODEC_ID_RAWVIDEO):
        codec_pix_fmt = input_pix_fmt == PIX_FMT_GRAY8 ||
                        input_pix_fmt == PIX_FMT_GRAY16LE ||
                        input_pix_fmt == PIX_FMT_GRAY16BE ? input_pix_fmt : PIX_FMT_YUV420P;
@@ -1788,7 +1796,7 @@ struct OutputMediaStream_FFMPEG
    void write(unsigned char* data, int size, int keyFrame);

    // add a video output stream to the container
-    static AVStream* addVideoStream(AVFormatContext *oc, CodecID codec_id, int w, int h, int bitrate, double fps, PixelFormat pixel_format);
+    static AVStream* addVideoStream(AVFormatContext *oc, CV_CODEC_ID codec_id, int w, int h, int bitrate, double fps, PixelFormat pixel_format);

    AVOutputFormat* fmt_;
    AVFormatContext* oc_;
@@ -1835,7 +1843,7 @@ void OutputMediaStream_FFMPEG::close()
    }
 }

-AVStream* OutputMediaStream_FFMPEG::addVideoStream(AVFormatContext *oc, CodecID codec_id, int w, int h, int bitrate, double fps, PixelFormat pixel_format)
+AVStream* OutputMediaStream_FFMPEG::addVideoStream(AVFormatContext *oc, CV_CODEC_ID codec_id, int w, int h, int bitrate, double fps, PixelFormat pixel_format)
 {
    #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(53, 10, 0)
        AVStream* st = avformat_new_stream(oc, 0);
@@ -1915,10 +1923,10 @@ AVStream* OutputMediaStream_FFMPEG::addVideoStream(AVFormatContext *oc, CodecID
    c->gop_size = 12; // emit one intra frame every twelve frames at most
    c->pix_fmt = pixel_format;

-    if (c->codec_id == CODEC_ID_MPEG2VIDEO)
+    if (c->codec_id == CV_CODEC(CODEC_ID_MPEG2VIDEO))
        c->max_b_frames = 2;

-    if (c->codec_id == CODEC_ID_MPEG1VIDEO || c->codec_id == CODEC_ID_MSMPEG4V3)
+    if (c->codec_id == CV_CODEC(CODEC_ID_MPEG1VIDEO) || c->codec_id == CV_CODEC(CODEC_ID_MSMPEG4V3))
    {
        // needed to avoid using macroblocks in which some coeffs overflow
        // this doesnt happen with normal video, it just happens here as the
@@ -1955,7 +1963,7 @@ bool OutputMediaStream_FFMPEG::open(const char* fileName, int width, int height,
    if (!fmt_)
        return false;

-    CodecID codec_id = CODEC_ID_H264;
+    CV_CODEC_ID codec_id = CV_CODEC(CODEC_ID_H264);

    // alloc memory for context
    #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(53, 2, 0)
@@ -2156,23 +2164,23 @@ bool InputMediaStream_FFMPEG::open(const char* fileName, int* codec, int* chroma

            switch (enc->codec_id)
            {
-            case CODEC_ID_MPEG1VIDEO:
+            case CV_CODEC(CODEC_ID_MPEG1VIDEO):
                *codec = ::VideoCodec_MPEG1;
                break;

-            case CODEC_ID_MPEG2VIDEO:
+            case CV_CODEC(CODEC_ID_MPEG2VIDEO):
                *codec = ::VideoCodec_MPEG2;
                break;

-            case CODEC_ID_MPEG4:
+            case CV_CODEC(CODEC_ID_MPEG4):
                *codec = ::VideoCodec_MPEG4;
                break;

-            case CODEC_ID_VC1:
+            case CV_CODEC(CODEC_ID_VC1):
                *codec = ::VideoCodec_VC1;
                break;

-            case CODEC_ID_H264:
+            case CV_CODEC(CODEC_ID_H264):
                *codec = ::VideoCodec_H264;
                break;

--- a/modules/highgui/src/cap_libv4l.cpp
+++ b/modules/highgui/src/cap_libv4l.cpp
@@ -1714,6 +1714,7 @@ static void icvCloseCAM_V4L( CvCaptureCAM_V4L* capture ){
 #endif

     free(capture->deviceName);
+     capture->deviceName = NULL;
     //v4l2_free_ranges(capture);
     //cvFree((void **)capture);
   }
--- a/modules/highgui/src/cap_msmf.cpp
+++ b/modules/highgui/src/cap_msmf.cpp
--- a/modules/highgui/src/cap_vfw.cpp
+++ b/modules/highgui/src/cap_vfw.cpp
@@ -406,7 +406,7 @@ bool CvCaptureCAM_VFW::open( int wIndex )
        fourcc = (DWORD)-1;

        memset( &caps, 0, sizeof(caps));
-        capDriverGetCaps( hWndC, &caps, sizeof(&caps));
+        capDriverGetCaps( hWndC, &caps, sizeof(caps));
        ::MoveWindow( hWndC, 0, 0, 320, 240, TRUE );
        capSetUserData( hWndC, (size_t)this );
        capSetCallbackOnFrame( hWndC, frameCallback );
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -103,14 +103,6 @@ struct CvVideoWriter
    virtual bool writeFrame(const IplImage*) { return false; }
 };

-#if defined WIN32 || defined _WIN32
-#define HAVE_VFW 1
-
-/* uncomment to enable CMUCamera1394 fireware camera module */
-//#define HAVE_CMU1394 1
-#endif
-
-
 CvCapture * cvCreateCameraCapture_V4L( int index );
 CvCapture * cvCreateCameraCapture_DC1394( int index );
 CvCapture * cvCreateCameraCapture_DC1394_2( int index );
@@ -126,6 +118,7 @@ CvVideoWriter* cvCreateVideoWriter_Win32( const char* filename, int fourcc,
 CvVideoWriter* cvCreateVideoWriter_VFW( const char* filename, int fourcc,
                                        double fps, CvSize frameSize, int is_color );
 CvCapture* cvCreateCameraCapture_DShow( int index );
+CvCapture* cvCreateCameraCapture_MSMF( int index );
 CvCapture* cvCreateCameraCapture_OpenNI( int index );
 CvCapture* cvCreateFileCapture_OpenNI( const char* filename );
 CvCapture* cvCreateCameraCapture_Android( int index );
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -57,7 +57,7 @@ CV_IMPL void cvSetWindowProperty(const char* name, int prop_id, double prop_valu

        #if defined (HAVE_QT)
            cvSetModeWindow_QT(name,prop_value);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
            cvSetModeWindow_W32(name,prop_value);
        #elif defined (HAVE_GTK)
            cvSetModeWindow_GTK(name,prop_value);
@@ -96,7 +96,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)

        #if defined (HAVE_QT)
            return cvGetModeWindow_QT(name);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
            return cvGetModeWindow_W32(name);
        #elif defined (HAVE_GTK)
            return cvGetModeWindow_GTK(name);
@@ -113,7 +113,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)

        #if defined (HAVE_QT)
            return cvGetPropWindow_QT(name);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
            return cvGetPropWindowAutoSize_W32(name);
        #elif defined (HAVE_GTK)
            return cvGetPropWindowAutoSize_GTK(name);
@@ -126,7 +126,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)

        #if defined (HAVE_QT)
            return cvGetRatioWindow_QT(name);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
            return cvGetRatioWindow_W32(name);
        #elif defined (HAVE_GTK)
            return cvGetRatioWindow_GTK(name);
@@ -139,7 +139,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)

        #if defined (HAVE_QT)
            return cvGetOpenGlProp_QT(name);
-        #elif defined WIN32 || defined _WIN32
+        #elif defined(HAVE_WIN32UI)
            return cvGetOpenGlProp_W32(name);
        #elif defined (HAVE_GTK)
            return cvGetOpenGlProp_GTK(name);
@@ -440,11 +440,11 @@ int cv::createButton(const String&, ButtonCallback, void*, int , bool )

 #endif

-#if   defined WIN32 || defined _WIN32         // see window_w32.cpp
+#if   defined(HAVE_WIN32UI)   // see window_w32.cpp
 #elif defined (HAVE_GTK)      // see window_gtk.cpp
-#elif defined (HAVE_COCOA)   // see window_carbon.cpp
+#elif defined (HAVE_COCOA)    // see window_carbon.cpp
 #elif defined (HAVE_CARBON)
-#elif defined (HAVE_QT) //YV see window_QT.cpp
+#elif defined (HAVE_QT)       //YV see window_QT.cpp

 #else

--- a/modules/highgui/test/test_ffmpeg.cpp
+++ b/modules/highgui/test/test_ffmpeg.cpp
@@ -176,7 +176,7 @@ TEST(Highgui_Video, ffmpeg_image) { CV_FFmpegReadImageTest test; test.safe_run()

 #endif

-#if defined(HAVE_FFMPEG) || defined(WIN32) || defined(_WIN32)
+#if defined(HAVE_FFMPEG)

 //////////////////////////////// Parallel VideoWriters and VideoCaptures ////////////////////////////////////

--- a/modules/highgui/test/test_gui.cpp
+++ b/modules/highgui/test/test_gui.cpp
@@ -43,7 +43,7 @@
 #include "test_precomp.hpp"
 #include "opencv2/highgui.hpp"

-#if defined HAVE_GTK  || defined HAVE_QT || defined WIN32 || defined _WIN32 || defined HAVE_CARBON || defined HAVE_COCOA
+#if defined HAVE_GTK || defined HAVE_QT || defined HAVE_WIN32UI || defined HAVE_CARBON || defined HAVE_COCOA

 using namespace cv;
 using namespace std;
--- a/modules/highgui/test/test_precomp.hpp
+++ b/modules/highgui/test/test_precomp.hpp
@@ -16,7 +16,7 @@

 #include "opencv2/core/private.hpp"

-#if defined(HAVE_VIDEOINPUT)   || \
+#if defined(HAVE_DSHOW)        || \
    defined(HAVE_TYZX)         || \
    defined(HAVE_VFW)          || \
    defined(HAVE_LIBV4L)       || \
@@ -32,7 +32,7 @@
    defined(HAVE_OPENNI)       || \
    defined(HAVE_XIMEA)        || \
    defined(HAVE_AVFOUNDATION) || \
-    defined(HAVE_GIGE_API) || \
+    defined(HAVE_GIGE_API)     || \
    (0)
    //defined(HAVE_ANDROID_NATIVE_CAMERA) ||   - enable after #1193
 #  define BUILD_WITH_CAMERA_SUPPORT 1
@@ -45,9 +45,7 @@
    defined(HAVE_QUICKTIME)    || \
    defined(HAVE_AVFOUNDATION) || \
    /*defined(HAVE_OPENNI)     || too specialized */ \
-    defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
-
+    defined(HAVE_FFMPEG)
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
 #else
 #  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
@@ -57,8 +55,7 @@
    defined(HAVE_GSTREAMER)    || \
    defined(HAVE_QUICKTIME)    || \
    defined(HAVE_AVFOUNDATION) || \
-    defined(HAVE_FFMPEG)       || \
-    defined(WIN32) /* assume that we have ffmpeg */
+    defined(HAVE_FFMPEG)
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 1
 #else
 #  define BUILD_WITH_VIDEO_OUTPUT_SUPPORT 0
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1931,7 +1931,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,


 #ifdef HAVE_TEGRA_OPTIMIZATION
-    if (tegra::resize(src, dst, inv_scale_x, inv_scale_y, interpolation))
+    if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
        return;
 #endif

@@ -3858,7 +3858,7 @@ cv2DRotationMatrix( CvPoint2D32f center, double angle,
                    double scale, CvMat* matrix )
 {
    cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
-    CV_Assert( M.size() == M.size() );
+    CV_Assert( M.size() == M0.size() );
    M.convertTo(M0, M0.type());
    return matrix;
 }
@@ -3871,7 +3871,7 @@ cvGetPerspectiveTransform( const CvPoint2D32f* src,
 {
    cv::Mat M0 = cv::cvarrToMat(matrix),
        M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
-    CV_Assert( M.size() == M.size() );
+    CV_Assert( M.size() == M0.size() );
    M.convertTo(M0, M0.type());
    return matrix;
 }
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -283,7 +283,14 @@ if(BUILD_FAT_JAVA_LIB)
  if(__extradeps)
    list(REMOVE_ITEM __deps ${__extradeps})
  endif()
-  target_link_libraries(${the_module} -Wl,-whole-archive ${__deps} -Wl,-no-whole-archive ${__extradeps} ${OPENCV_LINKER_LIBS})
+  if(APPLE)
+    foreach(_dep ${__deps})
+      target_link_libraries(${the_module} -Wl,-force_load "${_dep}")
+    endforeach()
+  else()
+    target_link_libraries(${the_module} -Wl,-whole-archive ${__deps} -Wl,-no-whole-archive)
+  endif()
+  target_link_libraries(${the_module} ${__extradeps} ${OPENCV_LINKER_LIBS})
 else()
  target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
 endif()
--- a/modules/java/android_test/CMakeLists.txt
+++ b/modules/java/android_test/CMakeLists.txt
@@ -14,7 +14,7 @@ ocv_list_filterout(opencv_test_java_files ".svn")

 # copy sources out from the build tree
 set(opencv_test_java_file_deps "")
-foreach(f ${opencv_test_java_files} ${ANDROID_MANIFEST_FILE})
+foreach(f ${opencv_test_java_files} ${ANDROID_MANIFEST_FILE} ".classpath" ".project")
  add_custom_command(
      OUTPUT "${opencv_test_java_bin_dir}/${f}"
      COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${f}" "${opencv_test_java_bin_dir}/${f}"
--- a/modules/java/android_test/src/org/opencv/test/features2d/BruteForceHammingDescriptorMatcherTest.java
+++ b/modules/java/android_test/src/org/opencv/test/features2d/BruteForceHammingDescriptorMatcherTest.java
@@ -1,5 +1,6 @@
 package org.opencv.test.features2d;

+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

@@ -204,7 +205,17 @@ public class BruteForceHammingDescriptorMatcherTest extends OpenCVTestCase {
    }

    public void testRadiusMatchMatListOfListOfDMatchFloat() {
-        fail("Not yet implemented");
+        Mat train = getTrainDescriptors();
+        Mat query = getQueryDescriptors();
+        ArrayList<MatOfDMatch> matches = new ArrayList<MatOfDMatch>();
+
+        matcher.radiusMatch(query, train, matches, 50.f);
+
+        assertEquals(matches.size(), 4);
+        assertTrue(matches.get(0).empty());
+        assertMatEqual(matches.get(1), new MatOfDMatch(truth[1]), EPS);
+        assertMatEqual(matches.get(2), new MatOfDMatch(truth[2]), EPS);
+        assertTrue(matches.get(3).empty());
    }

    public void testRadiusMatchMatListOfListOfDMatchFloatListOfMat() {
--- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java
+++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
@@ -54,6 +54,9 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
    public CameraBridgeViewBase(Context context, int cameraId) {
        super(context);
        mCameraIndex = cameraId;
+        getHolder().addCallback(this);
+        mMaxWidth = MAX_UNSPECIFIED;
+        mMaxHeight = MAX_UNSPECIFIED;
    }

    public CameraBridgeViewBase(Context context, AttributeSet attrs) {
--- a/modules/java/generator/src/java/android+JavaCameraView.java
+++ b/modules/java/generator/src/java/android+JavaCameraView.java
@@ -60,7 +60,6 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb

    public JavaCameraView(Context context, AttributeSet attrs) {
        super(context, attrs);
-        Log.d(TAG, "Java camera view ctor");
    }

    protected boolean initializeCamera(int width, int height) {
@@ -237,10 +236,8 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
    }

    public void onPreviewFrame(byte[] frame, Camera arg1) {
-        Log.i(TAG, "Preview Frame received. Need to create MAT and deliver it to clients");
-        Log.i(TAG, "Frame size  is " + frame.length);
-        synchronized (this)
-        {
+        Log.d(TAG, "Preview Frame received. Frame size: " + frame.length);
+        synchronized (this) {
            mFrameChain[1 - mChainIdx].put(0, 0, frame);
            this.notify();
        }
@@ -248,8 +245,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
            mCamera.addCallbackBuffer(mBuffer);
    }

-    private class JavaCameraFrame implements CvCameraViewFrame
-    {
+    private class JavaCameraFrame implements CvCameraViewFrame {
        public Mat gray() {
            return mYuvFrameData.submat(0, mHeight, 0, mWidth);
        }
--- a/modules/java/generator/src/java/android+OpenCVLoader.java
+++ b/modules/java/generator/src/java/android+OpenCVLoader.java
@@ -22,6 +22,12 @@ public class OpenCVLoader
     */
    public static final String OPENCV_VERSION_2_4_4 = "2.4.4";

+    /**
+     * OpenCV Library version 2.4.5.
+     */
+    public static final String OPENCV_VERSION_2_4_5 = "2.4.5";
+
+
    /**
     * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
     * @return Returns true is initialization of OpenCV was successful.
--- a/modules/java/generator/src/java/core+MatOfDMatch.java
+++ b/modules/java/generator/src/java/core+MatOfDMatch.java
@@ -16,8 +16,8 @@ public class MatOfDMatch extends Mat {

    protected MatOfDMatch(long addr) {
        super(addr);
-        if(checkVector(_channels, _depth) < 0 )
-            throw new IllegalArgumentException("Incomatible Mat");
+        if( !empty() && checkVector(_channels, _depth) < 0 )
+            throw new IllegalArgumentException("Incomatible Mat: " + toString());
        //FIXME: do we need release() here?
    }

@@ -27,8 +27,8 @@ public class MatOfDMatch extends Mat {

    public MatOfDMatch(Mat m) {
        super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
-            throw new IllegalArgumentException("Incomatible Mat");
+        if( !empty() && checkVector(_channels, _depth) < 0 )
+            throw new IllegalArgumentException("Incomatible Mat: " + toString());
        //FIXME: do we need release() here?
    }

--- a/modules/legacy/src/blobtrackingauto.cpp
+++ b/modules/legacy/src/blobtrackingauto.cpp
@@ -429,10 +429,11 @@ void CvBlobTrackerAuto1::Process(IplImage* pImg, IplImage* pMask)
            for(i=0; i<NewBlobList.GetBlobNum(); ++i)
            {
                CvBlob* pBN = NewBlobList.GetBlob(i);
-                pBN->ID = m_NextBlobID;

                if(pBN && pBN->w >= CV_BLOB_MINW && pBN->h >= CV_BLOB_MINH)
                {
+                    pBN->ID = m_NextBlobID;
+
                    CvBlob* pB = m_pBT->AddBlob(pBN, pImg, pmask );
                    if(pB)
                    {
--- a/modules/legacy/src/calibfilter.cpp
+++ b/modules/legacy/src/calibfilter.cpp
@@ -235,7 +235,7 @@ void CvCalibFilter::SetCameraCount( int count )
            cvReleaseMat( &rectMap[i][1] );
        }

-        memset( latestCounts, 0, sizeof(latestPoints) );
+        memset( latestCounts, 0, sizeof(latestCounts) );
        maxPoints = 0;
        cameraCount = count;
    }
--- a/modules/legacy/src/epilines.cpp
+++ b/modules/legacy/src/epilines.cpp
@@ -2115,7 +2115,7 @@ CV_IMPL IplImage* icvCreateIsometricImage( IplImage* src, IplImage* dst,
    if( !dst || dst->depth != desired_depth ||
        dst->nChannels != desired_num_channels ||
        dst_size.width != src_size.width ||
-        dst_size.height != dst_size.height )
+        dst_size.height != src_size.height )
    {
        cvReleaseImage( &dst );
        dst = cvCreateImage( src_size, desired_depth, desired_num_channels );
--- a/modules/nonfree/test/test_gpu.cpp
+++ b/modules/nonfree/test/test_gpu.cpp
@@ -58,9 +58,8 @@ namespace
    IMPLEMENT_PARAM_CLASS(SURF_Upright, bool)
 }

-PARAM_TEST_CASE(SURF, cv::gpu::DeviceInfo, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SURF_Extended, SURF_Upright)
+PARAM_TEST_CASE(SURF, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SURF_Extended, SURF_Upright)
 {
-    cv::gpu::DeviceInfo devInfo;
    double hessianThreshold;
    int nOctaves;
    int nOctaveLayers;
@@ -69,14 +68,11 @@ PARAM_TEST_CASE(SURF, cv::gpu::DeviceInfo, SURF_HessianThreshold, SURF_Octaves,

    virtual void SetUp()
    {
-        devInfo = GET_PARAM(0);
-        hessianThreshold = GET_PARAM(1);
-        nOctaves = GET_PARAM(2);
-        nOctaveLayers = GET_PARAM(3);
-        extended = GET_PARAM(4);
-        upright = GET_PARAM(5);
-
-        cv::gpu::setDevice(devInfo.deviceID());
+        hessianThreshold = GET_PARAM(0);
+        nOctaves = GET_PARAM(1);
+        nOctaveLayers = GET_PARAM(2);
+        extended = GET_PARAM(3);
+        upright = GET_PARAM(4);
    }
 };

@@ -93,39 +89,24 @@ GPU_TEST_P(SURF, Detector)
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

-    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
-    {
-        try
-        {
-            std::vector<cv::KeyPoint> keypoints;
-            surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(CV_StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        std::vector<cv::KeyPoint> keypoints;
-        surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
+    std::vector<cv::KeyPoint> keypoints;
+    surf(loadMat(image), cv::gpu::GpuMat(), keypoints);

-        cv::SURF surf_gold;
-        surf_gold.hessianThreshold = hessianThreshold;
-        surf_gold.nOctaves = nOctaves;
-        surf_gold.nOctaveLayers = nOctaveLayers;
-        surf_gold.extended = extended;
-        surf_gold.upright = upright;
+    cv::SURF surf_gold;
+    surf_gold.hessianThreshold = hessianThreshold;
+    surf_gold.nOctaves = nOctaves;
+    surf_gold.nOctaveLayers = nOctaveLayers;
+    surf_gold.extended = extended;
+    surf_gold.upright = upright;

-        std::vector<cv::KeyPoint> keypoints_gold;
-        surf_gold(image, cv::noArray(), keypoints_gold);
+    std::vector<cv::KeyPoint> keypoints_gold;
+    surf_gold(image, cv::noArray(), keypoints_gold);

-        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
-        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
-        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
+    ASSERT_EQ(keypoints_gold.size(), keypoints.size());
+    int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
+    double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();

-        EXPECT_GT(matchedRatio, 0.95);
-    }
+    EXPECT_GT(matchedRatio, 0.95);
 }

 GPU_TEST_P(SURF, Detector_Masked)
@@ -144,39 +125,24 @@ GPU_TEST_P(SURF, Detector_Masked)
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

-    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
-    {
-        try
-        {
-            std::vector<cv::KeyPoint> keypoints;
-            surf(loadMat(image), loadMat(mask), keypoints);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(CV_StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        std::vector<cv::KeyPoint> keypoints;
-        surf(loadMat(image), loadMat(mask), keypoints);
+    std::vector<cv::KeyPoint> keypoints;
+    surf(loadMat(image), loadMat(mask), keypoints);

-        cv::SURF surf_gold;
-        surf_gold.hessianThreshold = hessianThreshold;
-        surf_gold.nOctaves = nOctaves;
-        surf_gold.nOctaveLayers = nOctaveLayers;
-        surf_gold.extended = extended;
-        surf_gold.upright = upright;
+    cv::SURF surf_gold;
+    surf_gold.hessianThreshold = hessianThreshold;
+    surf_gold.nOctaves = nOctaves;
+    surf_gold.nOctaveLayers = nOctaveLayers;
+    surf_gold.extended = extended;
+    surf_gold.upright = upright;

-        std::vector<cv::KeyPoint> keypoints_gold;
-        surf_gold(image, mask, keypoints_gold);
+    std::vector<cv::KeyPoint> keypoints_gold;
+    surf_gold(image, mask, keypoints_gold);

-        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
-        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
-        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
+    ASSERT_EQ(keypoints_gold.size(), keypoints.size());
+    int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
+    double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();

-        EXPECT_GT(matchedRatio, 0.95);
-    }
+    EXPECT_GT(matchedRatio, 0.95);
 }

 GPU_TEST_P(SURF, Descriptor)
@@ -199,43 +165,26 @@ GPU_TEST_P(SURF, Descriptor)
    surf_gold.extended = extended;
    surf_gold.upright = upright;

-    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
-    {
-        try
-        {
-            std::vector<cv::KeyPoint> keypoints;
-            cv::gpu::GpuMat descriptors;
-            surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(CV_StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        std::vector<cv::KeyPoint> keypoints;
-        surf_gold(image, cv::noArray(), keypoints);
+    std::vector<cv::KeyPoint> keypoints;
+    surf_gold(image, cv::noArray(), keypoints);

-        cv::gpu::GpuMat descriptors;
-        surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);
+    cv::gpu::GpuMat descriptors;
+    surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);

-        cv::Mat descriptors_gold;
-        surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);
+    cv::Mat descriptors_gold;
+    surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);

-        cv::BFMatcher matcher(cv::NORM_L2);
-        std::vector<cv::DMatch> matches;
-        matcher.match(descriptors_gold, cv::Mat(descriptors), matches);
+    cv::BFMatcher matcher(cv::NORM_L2);
+    std::vector<cv::DMatch> matches;
+    matcher.match(descriptors_gold, cv::Mat(descriptors), matches);

-        int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
-        double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();
+    int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
+    double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();

-        EXPECT_GT(matchedRatio, 0.6);
-    }
+    EXPECT_GT(matchedRatio, 0.6);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
-    ALL_DEVICES,
    testing::Values(SURF_HessianThreshold(100.0), SURF_HessianThreshold(500.0), SURF_HessianThreshold(1000.0)),
    testing::Values(SURF_Octaves(3), SURF_Octaves(4)),
    testing::Values(SURF_OctaveLayers(2), SURF_OctaveLayers(3)),
@@ -245,17 +194,15 @@ INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
 //////////////////////////////////////////////////////
 // VIBE

-PARAM_TEST_CASE(VIBE, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
+PARAM_TEST_CASE(VIBE, cv::Size, MatType, UseRoi)
 {
 };

 GPU_TEST_P(VIBE, Accuracy)
 {
-    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-    const cv::Size size = GET_PARAM(1);
-    const int type = GET_PARAM(2);
-    const bool useRoi = GET_PARAM(3);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const bool useRoi = GET_PARAM(2);

    const cv::Mat fullfg(size, CV_8UC1, cv::Scalar::all(255));

@@ -278,7 +225,6 @@ GPU_TEST_P(VIBE, Accuracy)
 }

 INSTANTIATE_TEST_CASE_P(GPU_Video, VIBE, testing::Combine(
-    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4)),
    WHOLE_SUBMAT));
--- a/modules/nonfree/test/test_main.cpp
+++ b/modules/nonfree/test/test_main.cpp
@@ -1,73 +1,3 @@
 #include "test_precomp.hpp"

-#if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA)
-
-using namespace cv;
-using namespace cv::gpu;
-using namespace cvtest;
-using namespace testing;
-
-int main(int argc, char **argv)
-{
-    try
-    {
-         const char*  keys =
-                "{ h | help ?            | false | Print help}"
-                "{ i | info              | false | Print information about system and exit }"
-                "{ d | device            | -1   | Device on which tests will be executed (-1 means all devices) }"
-                ;
-
-        CommandLineParser cmd(argc, (const char**)argv, keys);
-
-        if (cmd.get<bool>("help"))
-        {
-            cmd.printMessage();
-            return 0;
-    }
-
-        printCudaInfo();
-
-        if (cmd.get<bool>("info"))
-    {
-            return 0;
-    }
-
-        int device = cmd.get<int>("device");
-        if (device < 0)
-    {
-            DeviceManager::instance().loadAll();
-
-            std::cout << "Run tests on all supported devices \n" << std::endl;
-    }
-        else
-    {
-            DeviceManager::instance().load(device);
-
-            DeviceInfo info(device);
-            std::cout << "Run tests on device " << device << " [" << info.name() << "] \n" << std::endl;
-}
-
-        TS::ptr()->init("cv");
-        InitGoogleTest(&argc, argv);
-
-    return RUN_ALL_TESTS();
-}
-    catch (const std::exception& e)
-    {
-        std::cerr << e.what() << std::endl;
-        return -1;
-    }
-    catch (...)
-{
-        std::cerr << "Unknown error" << std::endl;
-        return -1;
-    }
-
-    return 0;
-}
-
-#else // HAVE_CUDA
-
 CV_TEST_MAIN("cv")
-
-#endif // HAVE_CUDA
--- a/modules/nonfree/test/test_precomp.hpp
+++ b/modules/nonfree/test/test_precomp.hpp
@@ -15,14 +15,16 @@
 #include "opencv2/highgui.hpp"
 #include "opencv2/nonfree.hpp"

+#include "opencv2/ts/gpu_test.hpp"
+
 #include "opencv2/opencv_modules.hpp"
+
 #ifdef HAVE_OPENCV_OCL
 #  include "opencv2/nonfree/ocl.hpp"
 #endif

-#if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA)
-    #include "opencv2/ts/gpu_test.hpp"
-    #include "opencv2/nonfree/gpu.hpp"
+#ifdef HAVE_OPENCV_GPU
+#  include "opencv2/nonfree/gpu.hpp"
 #endif

 #endif
--- a/modules/nonfree/test/test_surf.ocl.cpp
+++ b/modules/nonfree/test/test_surf.ocl.cpp
@@ -109,17 +109,6 @@ static int getMatchedPointsCount(const std::vector<cv::KeyPoint>& keypoints1, co
    return validCount;
 }

-#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-    namespace { class name { \
-    public: \
-        name ( type arg = type ()) : val_(arg) {} \
-        operator type () const {return val_;} \
-    private: \
-        type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) {*os << #name <<  "=" << testing::PrintToString(static_cast< type >(param));}}
-
 IMPLEMENT_PARAM_CLASS(HessianThreshold, double)
 IMPLEMENT_PARAM_CLASS(Octaves, int)
 IMPLEMENT_PARAM_CLASS(OctaveLayers, int)
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -3,5 +3,5 @@ if(NOT HAVE_OPENCL)
 endif()

 set(the_description "OpenCL-accelerated Computer Vision")
-ocv_define_module(ocl opencv_core opencv_imgproc opencv_objdetect opencv_video)
+ocv_define_module(ocl opencv_core opencv_imgproc opencv_objdetect opencv_video opencv_features2d)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -50,7 +50,6 @@
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
-//#include "opencv2/features2d.hpp"

 namespace cv
 {
@@ -125,6 +124,9 @@ namespace cv

        CV_EXPORTS void* getoclCommandQueue();

+        // Explicitly calls clFinish on the global command queue.
+        CV_EXPORTS void finish();
+
        //this function enable ocl module to use customized cl_context and cl_command_queue
        //getDevice also need to be called before this function
        CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0);
@@ -1714,6 +1716,36 @@ namespace cv
        private:
            oclMat minSSD, leBuf, riBuf;
        };
+        class CV_EXPORTS StereoBeliefPropagation
+        {
+        public:
+            enum { DEFAULT_NDISP  = 64 };
+            enum { DEFAULT_ITERS  = 5  };
+            enum { DEFAULT_LEVELS = 5  };
+            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels);
+            explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
+                                             int iters  = DEFAULT_ITERS,
+                                             int levels = DEFAULT_LEVELS,
+                                             int msg_type = CV_16S);
+            StereoBeliefPropagation(int ndisp, int iters, int levels,
+                                    float max_data_term, float data_weight,
+                                    float max_disc_term, float disc_single_jump,
+                                    int msg_type = CV_32F);
+            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
+            void operator()(const oclMat &data, oclMat &disparity);
+            int ndisp;
+            int iters;
+            int levels;
+            float max_data_term;
+            float data_weight;
+            float max_disc_term;
+            float disc_single_jump;
+            int msg_type;
+        private:
+            oclMat u, d, l, r, u2, d2, l2, r2;
+            std::vector<oclMat> datas;
+            oclMat out;
+        };
    }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200
--- a/modules/ocl/perf/interpolation.hpp
+++ b/modules/ocl/perf/interpolation.hpp
@@ -1,120 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
-#define __OPENCV_TEST_INTERPOLATION_HPP__
-
-template <typename T> T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-{
-    if (border_type == cv::BORDER_CONSTANT)
-        return (y >= 0 && y < src.rows && x >= 0 && x < src.cols) ? src.at<T>(y, x * src.channels() + c) : cv::saturate_cast<T>(borderVal.val[c]);
-
-    return src.at<T>(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c);
-}
-
-template <typename T> struct NearestInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal);
-    }
-};
-
-template <typename T> struct LinearInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        x -= 0.5f;
-        y -= 0.5f;
-
-        int x1 = cvFloor(x);
-        int y1 = cvFloor(y);
-        int x2 = x1 + 1;
-        int y2 = y1 + 1;
-
-        float res = 0;
-
-        res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y));
-        res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y));
-        res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1));
-        res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1));
-
-        return cv::saturate_cast<T>(res);
-    }
-};
-
-template <typename T> struct CubicInterpolator
-{
-    static float getValue(float p[4], float x)
-    {
-        return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
-    }
-
-    static float getValue(float p[4][4], float x, float y)
-    {
-        float arr[4];
-
-        arr[0] = getValue(p[0], x);
-        arr[1] = getValue(p[1], x);
-        arr[2] = getValue(p[2], x);
-        arr[3] = getValue(p[3], x);
-
-        return getValue(arr, y);
-    }
-
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        int ix = cvRound(x);
-        int iy = cvRound(y);
-
-        float vals[4][4] =
-        {
-            {readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 2, ix, c, border_type, borderVal), readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 1, ix, c, border_type, borderVal), readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy    , ix - 2, c, border_type, borderVal), readVal<T>(src, iy    , ix - 1, c, border_type, borderVal), readVal<T>(src, iy    , ix, c, border_type, borderVal), readVal<T>(src, iy    , ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy + 1, ix, c, border_type, borderVal), readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)},
-        };
-
-        return cv::saturate_cast<T>(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0));
-    }
-};
-
-#endif // __OPENCV_TEST_INTERPOLATION_HPP__
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other oclMaterials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -41,99 +42,118 @@

 #include "precomp.hpp"

-#ifdef HAVE_OPENCL
-
-using namespace std;
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-
-void print_info()
+int main(int argc, const char *argv[]) // Perf-harness entry point: lists OpenCL devices, parses options, selects a device, configures TestSystem and runs it.
 {
-    printf("\n");
-#if defined _WIN32
-#   if defined _WIN64
-    puts("OS: Windows 64");
-#   else
-    puts("OS: Windows 32");
-#   endif
-#elif defined linux
-#   if defined _LP64
-    puts("OS: Linux 64");
-#   else
-    puts("OS: Linux 32");
-#   endif
-#elif defined __APPLE__
-#   if defined _LP64
-    puts("OS: Apple 64");
-#   else
-    puts("OS: Apple 32");
-#   endif
-#endif
+    vector<ocl::Info> oclinfo;
+    int num_devices = getDevice(oclinfo);
+
+    if (num_devices < 1)
+    {
+        cerr << "no device found\n";
+        return -1;
+    }
+
+    int devidx = 0; // flat device index counted across all oclinfo entries
+
+    for (size_t i = 0; i < oclinfo.size(); i++)
+    {
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
+        {
+            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
+        }
+    }
+
+    redirectError(cvErrorCallback);

-}
-std::string workdir;
-int main(int argc, char **argv)
-{
-    TS::ptr()->init("ocl");
-    InitGoogleTest(&argc, argv);
    const char *keys =
-        "{ h | false              | print help message }"
-		"{ w | ../../../samples/c/| set working directory i.e. -w=C:\\}"
-        "{ t | gpu                | set device type:i.e. -t=cpu or gpu}"
-        "{ p | 0                  | set platform id i.e. -p=0}"
-        "{ d | 0                  | set device id i.e. -d=0}";
+        "{ h help    | false | print help message }"
+        "{ f filter  |       | filter for test }"
+        "{ w workdir |       | set working directory }"
+        "{ l list    | false | show all tests }"
+        "{ d device  | 0     | device id }"
+        "{ i iters   | 10    | iteration count }"
+        "{ m warmup  | 1     | gpu warm up iteration count}"
+        "{ t xtop    | 1.1	  | xfactor top boundary}"
+        "{ b xbottom | 0.9	  | xfactor bottom boundary}"
+        "{ v verify  | false | only run gpu once to verify if problems occur}";

    CommandLineParser cmd(argc, argv, keys);
-    if (cmd.get<string>("h")=="true")
+
+    if (cmd.has("help"))
    {
-        cout << "Avaible options besides goole test option:" << endl;
+        cout << "Avaible options:" << endl;
        cmd.printMessage();
        return 0;
    }
-    workdir = cmd.get<string>("w");
-    string type = cmd.get<string>("t");
-    unsigned int pid = cmd.get<unsigned int>("p");
-    int device = cmd.get<int>("d");
-    print_info();
-    // int flag = CVCL_DEVICE_TYPE_GPU;

-    // if(type == "cpu")
-    // {
-    //     flag = CVCL_DEVICE_TYPE_CPU;
-    // }
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = getDevice(oclinfo);
-    if(devnums <= device || device < 0)
+    int device = cmd.get<int>("device"); // compared against the flat index printed above
+
+    if (device < 0 || device >= num_devices)
    {
-        std::cout << "device invalid\n";
+        cerr << "Invalid device ID" << endl;
        return -1;
    }

-    if(pid >= oclinfo.size())
+    if (cmd.get<bool>("verify"))
    {
-        std::cout << "platform invalid\n";
-        return -1;
+        TestSystem::instance().setNumIters(1);
+        TestSystem::instance().setGPUWarmupIters(0);
+        TestSystem::instance().setCPUIters(0);
    }

-    if(pid != 0 || device != 0)
+    devidx = 0; // restart the flat count to locate the requested device
+
+    for (size_t i = 0; i < oclinfo.size(); i++)
    {
-        setDevice(oclinfo[pid], device);
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
+        {
+            if (device == devidx)
+            {
+                ocl::setDevice(oclinfo[i], (int)j);
+                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
+                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
+                goto END_DEV; // leave both loops once the device is selected
+            }
+        }
    }

-    cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
-    setBinpath(CLBINPATH);
-    return RUN_ALL_TESTS();
-}
+END_DEV:

-#else // DON'T HAVE_OPENCL
+    string filter = cmd.get<string>("filter");
+    string workdir = cmd.get<string>("workdir");
+    bool list = cmd.has("list");
+    int iters = cmd.get<bool>("verify") ? 1 : cmd.get<int>("iters"); // fix: verify mode keeps its single run instead of being overwritten by the "iters" option below
+    int wu_iters = cmd.get<bool>("verify") ? 0 : cmd.get<int>("warmup"); // fix: verify mode keeps zero warm-up iterations for the same reason
+    double x_top = cmd.get<double>("xtop");
+    double x_bottom = cmd.get<double>("xbottom");
+
+    TestSystem::instance().setTopThreshold(x_top);
+    TestSystem::instance().setBottomThreshold(x_bottom);
+
+    if (!filter.empty())
+    {
+        TestSystem::instance().setTestFilter(filter);
+    }
+
+    if (!workdir.empty())
+    {
+        if (workdir[workdir.size() - 1] != '/' && workdir[workdir.size() - 1] != '\\') // ensure a trailing path separator
+        {
+            workdir += '/';
+        }
+
+        TestSystem::instance().setWorkingDir(workdir);
+    }
+
+    if (list)
+    {
+        TestSystem::instance().setListMode(true);
+    }
+
+    TestSystem::instance().setNumIters(iters); // applied last, so iters/wu_iters must already honour verify mode
+    TestSystem::instance().setGPUWarmupIters(wu_iters);
+
+    TestSystem::instance().run();

-int main()
-{
-    printf("OpenCV was built without OpenCL support\n");
    return 0;
-}
-
-
-#endif // HAVE_OPENCL
+}
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
--- a/modules/ocl/perf/perf_blend.cpp
+++ b/modules/ocl/perf/perf_blend.cpp
@@ -44,79 +44,77 @@
 //M*/

 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(Blend, MatType, int)
+///////////// blend ////////////////////////
+template <typename T> // T is the element type used to read img1/img2 and to write result_gold
+void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold) // CPU reference: per-element weighted blend of img1 and img2 written into result_gold
 {
-    int type;
-    int channels;
-    std::vector<cv::ocl::Info> oclinfo;
+    result_gold.create(img1.size(), img1.type()); // output takes img1's size and type

-    virtual void SetUp()
+    int cn = img1.channels();
+
+    for (int y = 0; y < img1.rows; ++y)
    {
+        const float *weights1_row = weights1.ptr<float>(y); // weight rows are read as float
+        const float *weights2_row = weights2.ptr<float>(y);
+        const T *img1_row = img1.ptr<T>(y);
+        const T *img2_row = img2.ptr<T>(y);
+        T *result_gold_row = result_gold.ptr<T>(y);

-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-};
-
-TEST_P(Blend, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat img1_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
-    cv::Mat img2_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
-    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
-    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
-    cv::ocl::oclMat gimg1(size, CV_MAKETYPE(type, channels)), gimg2(size, CV_MAKETYPE(type, channels)), gweights1(size, CV_32F), gweights2(size, CV_32F);
-    cv::ocl::oclMat gdst(size, CV_MAKETYPE(type, channels));
-
-
-    double totalgputick_all = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++) //LOOP_TIMES=100
-    {
-        t1 = (double)cvGetTickCount();
-        cv::ocl::oclMat gimg1 = cv::ocl::oclMat(img1_host);
-        cv::ocl::oclMat gimg2 = cv::ocl::oclMat(img2_host);
-        cv::ocl::oclMat gweights1 = cv::ocl::oclMat(weights1);
-        cv::ocl::oclMat gweights2 = cv::ocl::oclMat(weights1);
-
-        t2 = (double)cvGetTickCount();
-        cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, gdst);
-        t2 = (double)cvGetTickCount() - t2;
-
-        cv::Mat m;
-        gdst.download(m);
-        t1 = (double)cvGetTickCount() - t1;
-
-        if (j == 0)
+        for (int x = 0; x < img1.cols * cn; ++x) // x walks interleaved channel elements, not pixels
        {
-            continue;
+            float w1 = weights1_row[x / cn]; // x / cn is the pixel column: one weight is shared by all channels of a pixel
+            float w2 = weights2_row[x / cn];
+            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f)); // 1e-5f keeps the divisor non-zero when both weights are 0; static_cast, so no rounding or saturation
        }
-
-        totalgputick_all = t1 + totalgputick_all;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-    };
-
-    cout << "average gpu total  runtime is  " << totalgputick_all / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-    cout << "average gpu runtime without data transfering  is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
+    }
 }
+TEST(blend) // Perf case: runs blendLinearGold (CPU) and ocl::blendLinear over a range of square sizes and two 8-bit types.
+{
+    Mat src1, src2, weights1, weights2, dst;
+    ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;

-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-#endif
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"}; // labels parallel to all_type, used only in the subtest name
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple) // Min_Size/Max_Size/Multiple are defined outside this file
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " and CV_32FC1";
+
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(weights1, size, size, CV_32FC1, 0, 1);
+            gen(weights2, size, size, CV_32FC1, 0, 1);
+
+            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst); // extra call outside the CPU_ON/CPU_OFF bracket; uchar matches both 8-bit entries of all_type
+
+            CPU_ON; // NOTE(review): CPU_*/WARMUP_*/GPU_*/GPU_FULL_* are harness macros defined elsewhere; presumably they bracket the measured regions - confirm
+            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
+            CPU_OFF;
+
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            d_weights1.upload(weights1);
+            d_weights2.upload(weights2);
+
+            WARMUP_ON;
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON; // this bracket contains only the blendLinear call; uploads happened above
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // this bracket also contains the uploads and the download
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            d_weights1.upload(weights1);
+            d_weights2.upload(weights2);
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+    }
+}
--- a/modules/ocl/perf/perf_brute_force_matcher.cpp
+++ b/modules/ocl/perf/perf_brute_force_matcher.cpp
@@ -0,0 +1,150 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+//////////////////// BruteForceMatch /////////////////
+TEST(BruteForceMatcher) // Perf case: BFMatcher (CPU) vs ocl::BruteForceMatcher_OCL_base for match, knnMatch and radiusMatch.
+{
+    Mat trainIdx_cpu; // NOTE(review): these four *_cpu Mats are never used in this function
+    Mat distance_cpu;
+    Mat allDist_cpu;
+    Mat nMatches_cpu;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple) // Min_Size/Max_Size/Multiple are defined outside this file
+    {
+        // Init CPU matcher
+        int desc_len = 64;
+
+        BFMatcher matcher(NORM_L2);
+
+        Mat query;
+        gen(query, size, desc_len, CV_32F, 0, 1); // same arguments for query and train
+
+        Mat train;
+        gen(train, size, desc_len, CV_32F, 0, 1);
+        // Output
+        vector< vector<DMatch> > matches(2); // matches[0] receives the plain match() results below
+        // Init GPU matcher
+        ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
+
+        ocl::oclMat d_query(query);
+        ocl::oclMat d_train(train);
+
+        ocl::oclMat d_trainIdx, d_distance, d_allDist, d_nMatches;
+
+        SUBTEST << size << "; match";
+
+        matcher.match(query, train, matches[0]); // extra call outside the CPU_ON/CPU_OFF bracket
+
+        CPU_ON; // NOTE(review): CPU_*/WARMUP_*/GPU_*/GPU_FULL_* are harness macros defined elsewhere; presumably they bracket the measured regions - confirm
+        matcher.match(query, train, matches[0]);
+        CPU_OFF;
+
+        WARMUP_ON;
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+        WARMUP_OFF;
+
+        GPU_ON; // results stay in oclMat outputs inside this bracket
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON; // this bracket re-uploads the inputs and uses the overload that fills the host-side matches vector
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.match(d_query, d_train, matches[0]);
+        GPU_FULL_OFF;
+
+        SUBTEST << size << "; knnMatch";
+
+        matcher.knnMatch(query, train, matches, 2); // k = 2 in every knn call of this subtest
+
+        CPU_ON;
+        matcher.knnMatch(query, train, matches, 2);
+        CPU_OFF;
+
+        WARMUP_ON;
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.knnMatch(d_query, d_train, matches, 2);
+        GPU_FULL_OFF;
+
+        SUBTEST << size << "; radiusMatch";
+
+        float max_distance = 2.0f;
+
+        matcher.radiusMatch(query, train, matches, max_distance);
+
+        CPU_ON;
+        matcher.radiusMatch(query, train, matches, max_distance);
+        CPU_OFF;
+
+        d_trainIdx.release(); // NOTE(review): dropped before radiusMatchSingle; presumably so the buffer left by knnMatchSingle is not reused - confirm
+
+        WARMUP_ON;
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.radiusMatch(d_query, d_train, matches, max_distance);
+        GPU_FULL_OFF;
+    }
+}
--- a/modules/ocl/perf/perf_canny.cpp
+++ b/modules/ocl/perf/perf_canny.cpp
@@ -42,112 +42,42 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;

-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-    { \
-    public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-    private: \
-    type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-////////////////////////////////////////////////////////
-// Canny1
-extern std::string workdir;
-IMPLEMENT_PARAM_CLASS(AppertureSize, int);
-IMPLEMENT_PARAM_CLASS(L2gradient, bool);
-
-PARAM_TEST_CASE(Canny1, AppertureSize, L2gradient)
+///////////// Canny ////////////////////////
+TEST(Canny) // Perf case: Canny on the grayscale aloeL.jpg image, CPU vs ocl::Canny with thresholds 50/100.
 {
-    int apperture_size;
-    bool useL2gradient;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat img = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE); // abspath is defined elsewhere; presumably resolves against the working directory - confirm

-    virtual void SetUp()
+    if (img.empty())
    {
-        apperture_size = GET_PARAM(0);
-        useL2gradient = GET_PARAM(1);
-
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
-
-TEST_P(Canny1, Performance)
-{
-    cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    double low_thresh = 100.0;
-    double high_thresh = 150.0;
-
-    cv::Mat edges_gold;
-    cv::ocl::oclMat edges;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        edges.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
+        throw runtime_error("can't open aloeL.jpg"); // a missing image aborts the test by exception
    }

-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+    SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";

+    Mat edges(img.size(), CV_8UC1);

-}
+    CPU_ON; // NOTE(review): CPU_*/WARMUP_*/GPU_*/GPU_FULL_* are harness macros defined elsewhere; presumably they bracket the measured regions - confirm
+    Canny(img, edges, 50.0, 100.0);
+    CPU_OFF;

-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny1, testing::Combine(
-                            testing::Values(AppertureSize(3), AppertureSize(5)),
-                            testing::Values(L2gradient(false), L2gradient(true))));
+    ocl::oclMat d_img(img);
+    ocl::oclMat d_edges;
+    ocl::CannyBuf d_buf; // the same CannyBuf object is passed to every ocl::Canny call below

+    WARMUP_ON;
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    WARMUP_OFF;

+    GPU_ON; // this bracket contains only the ocl::Canny call
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+     ; // empty statement, no effect
+    GPU_OFF;

-#endif  //Have opencl
+    GPU_FULL_ON; // this bracket also contains the upload and the download
+    d_img.upload(img);
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    d_edges.download(edges);
+    GPU_FULL_OFF;
+}
--- a/modules/ocl/perf/perf_color.cpp
+++ b/modules/ocl/perf/perf_color.cpp
@@ -0,0 +1,91 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+///////////// cvtColor////////////////////////
+TEST(cvtColor) // Perf case: CV_RGBA2GRAY conversion of CV_8UC4 images, cvtColor (CPU) vs ocl::cvtColor.
+{
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC4};
+    std::string type_name[] = {"CV_8UC4"}; // label parallel to all_type, used only in the subtest name
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple) // Min_Size/Max_Size/Multiple are defined outside this file
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            gen(src, size, size, all_type[j], 0, 256);
+            SUBTEST << size << "x" << size << "; " << type_name[j] << " ; CV_RGBA2GRAY";
+
+            cvtColor(src, dst, CV_RGBA2GRAY, 4); // extra call outside the CPU_ON/CPU_OFF bracket
+
+            CPU_ON; // NOTE(review): CPU_*/WARMUP_*/GPU_*/GPU_FULL_* are harness macros defined elsewhere; presumably they bracket the measured regions - confirm
+            cvtColor(src, dst, CV_RGBA2GRAY, 4);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+            WARMUP_OFF;
+
+            GPU_ON; // this bracket contains only the ocl::cvtColor call
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // this bracket also contains the upload and the download
+            d_src.upload(src);
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+
+
+    }
+
+
+}
--- a/modules/ocl/perf/perf_columnsum.cpp
+++ b/modules/ocl/perf/perf_columnsum.cpp
@@ -15,8 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//	   Fangfang Bai fangfang@multicorewareinc.com
-//
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -31,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -43,78 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>

-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-///////////////////////////////////////////////////////////////////////////////
-/// ColumnSum
-
-#ifdef HAVE_OPENCL
-
-////////////////////////////////////////////////////////////////////////
-// ColumnSum
-
-PARAM_TEST_CASE(ColumnSum)
+///////////// columnSum////////////////////////
+TEST(columnSum)
 {
-    cv::Mat src;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;

-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
+        SUBTEST << size << 'x' << size << "; CV_32FC1";
+
+        gen(src, size, size, CV_32FC1, 0, 256);
+
+        // CPU reference: running sum down each column, written to dst only. src is
+        // uploaded to the device below, so it must not be modified here.
+        CPU_ON;
+        dst.create(src.size(), src.type());
+        // Row 0 has nothing above it; copy it so the recurrence has a defined base.
+        for (int j = 0; j < src.cols; ++j)
+            dst.at<float>(0, j) = src.at<float>(0, j);
+
+        for (int i = 1; i < src.rows; ++i)
+            for (int j = 0; j < src.cols; ++j)
+                dst.at<float>(i, j) = dst.at<float>(i - 1, j) + src.at<float>(i, j);
+
+        CPU_OFF;
+
+        d_src.upload(src);
+        WARMUP_ON;
+        ocl::columnSum(d_src, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::columnSum(d_src, d_dst);
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::columnSum(d_src, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
    }
-};
-
-TEST_F(ColumnSum, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat src = randomMat(size, CV_32FC1);
-    cv::ocl::oclMat d_dst;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat d_src(src);
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::columnSum(d_src, d_dst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        d_dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-
-}
-
-
-
-#endif
+}
--- a/modules/ocl/perf/perf_fft.cpp
+++ b/modules/ocl/perf/perf_fft.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Fangfangbai, fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,85 +42,48 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-using namespace std;
-#ifdef HAVE_CLAMDFFT
-////////////////////////////////////////////////////////////////////////////
-// Dft
-PARAM_TEST_CASE(Dft, cv::Size, bool)
+
+///////////// dft ////////////////////////
+TEST(dft)
 {
-    cv::Size dft_size;
-    bool	 dft_rows;
-    vector<cv::ocl::Info> info;
-    virtual void SetUp()
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_32FC1, CV_32FC2};
+    std::string type_name[] = {"CV_32FC1", "CV_32FC2"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        dft_size = GET_PARAM(0);
-        dft_rows = GET_PARAM(1);
-        cv::ocl::getDevice(info);
-    }
-};
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; complex-to-complex";

-TEST_P(Dft, C2C)
-{
-    cv::Mat a = randomMat(dft_size, CV_32FC2, 0.0, 10.0);
-    int flags = 0;
-    flags |= dft_rows ? cv::DFT_ROWS : 0;
+            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(1));

-    cv::ocl::oclMat d_b;
+            dft(src, dst);

-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
+            CPU_ON;
+            dft(src, dst);
+            CPU_OFF;

-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
+            d_src.upload(src);

-        t1 = (double)cvGetTickCount();//gpu start1
+            WARMUP_ON;
+            ocl::dft(d_src, d_dst, Size(size, size));
+            WARMUP_OFF;

-        cv::ocl::oclMat ga = cv::ocl::oclMat(a); //upload
+            GPU_ON;
+            ocl::dft(d_src, d_dst, Size(size, size));
+             ;
+            GPU_OFF;

-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::dft(ga, d_b, a.size(), flags);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        d_b.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::dft(d_src, d_dst, Size(size, size));
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }

    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-
-
-
-TEST_P(Dft, R2CthenC2R)
-{
-    cv::Mat a = randomMat(dft_size, CV_32FC1, 0.0, 10.0);
-
-    int flags = 0;
-    //flags |= dft_rows ? cv::DFT_ROWS : 0; // not supported yet
-
-    cv::ocl::oclMat d_b, d_c;
-
-    cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags);
-    cv::ocl::dft(d_b, d_c, a.size(), flags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT);
-
-    EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
-}
-
-//INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine(
-//						testing::Values(cv::Size(1280, 1024), cv::Size(1920, 1080),cv::Size(1800, 1500)),
-//						testing::Values(false, true)));
-
-#endif // HAVE_CLAMDFFT
+}
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/ocl/perf/perf_gemm.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -41,73 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
-
 #include "precomp.hpp"
-using namespace std;
-#ifdef HAVE_CLAMDBLAS
-////////////////////////////////////////////////////////////////////////////
-// GEMM
-PARAM_TEST_CASE(Gemm, int, cv::Size, int)
+
+///////////// gemm ////////////////////////
+TEST(gemm)
 {
-    int      type;
-    cv::Size mat_size;
-    int		 flags;
-    vector<cv::ocl::Info> info;
-    virtual void SetUp()
+    Mat src1, src2, src3, dst;
+    ocl::oclMat d_src1, d_src2, d_src3, d_dst;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        type     = GET_PARAM(0);
-        mat_size = GET_PARAM(1);
-        flags    = GET_PARAM(2);
+        SUBTEST << size << 'x' << size;

-        cv::ocl::getDevice(info);
+        gen(src1, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src2, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src3, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        // One gemm call outside the CPU_ON/CPU_OFF section, then the same call inside it.
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+
+        CPU_ON;
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+        CPU_OFF;
+        // Upload the three operands once; the WARMUP and GPU sections below reuse them.
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        d_src3.upload(src3);
+
+        WARMUP_ON;
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+         ;
+        GPU_OFF;
+        // The GPU_FULL section repeats the uploads and adds the download of the result.
+        GPU_FULL_ON;
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        d_src3.upload(src3);
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
    }
-};
-
-TEST_P(Gemm, Performance)
-{
-    cv::Mat a = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat b = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat c = randomMat(mat_size, type, 0.0, 10.0);
-    cv::ocl::oclMat ocl_dst;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ga = cv::ocl::oclMat(a);//upload
-        cv::ocl::oclMat gb = cv::ocl::oclMat(b);//upload
-        cv::ocl::oclMat gc = cv::ocl::oclMat(c);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::gemm(ga, gb, 1.0, gc, 1.0, ocl_dst, flags);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        ocl_dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-
-
-INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
-                            testing::Values(CV_32FC1, CV_32FC2/* , CV_64FC1, CV_64FC2*/),
-                            testing::Values(cv::Size(512, 512), cv::Size(1024, 1024)),
-                            testing::Values(0, (int)cv::GEMM_1_T, (int)cv::GEMM_2_T, (int)(cv::GEMM_1_T + cv::GEMM_2_T))));
-#endif
+}
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/ocl/perf/perf_haar.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,132 +42,97 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"

-#ifdef HAVE_OPENCL
+///////////// Haar ////////////////////////
+namespace cv
+{
+namespace ocl
+{

-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv;
-extern std::string workdir;
 struct getRect
 {
-    Rect operator ()(const CvAvgComp &e) const
+    Rect operator()(const CvAvgComp &e) const
    {
        return e.rect;
    }
 };

-PARAM_TEST_CASE(HaarTestBase, int, int)
+class CascadeClassifier_GPU : public OclCascadeClassifier
 {
-    //std::vector<cv::ocl::Info> oclinfo;
-    cv::ocl::OclCascadeClassifier cascade, nestedCascade;
-    cv::CascadeClassifier cpucascade, cpunestedCascade;
-    //    Mat img;
-
-    double scale;
-    int index;
-
-    virtual void SetUp()
+public:
+    void detectMultiScale(oclMat &image,
+                          CV_OUT std::vector<cv::Rect>& faces,
+                          double scaleFactor = 1.1,
+                          int minNeighbors = 3, int flags = 0,
+                          Size minSize = Size(),
+                          Size maxSize = Size())
    {
-        scale = 1.0;
-        index = 0;
-        string cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml";
-
-        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
-        {
-            cout << "ERROR: Could not load classifier cascade" << endl;
-            return;
-        }
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath("E:\\");
+        (void)maxSize; // unused: not passed to oclHaarDetectObjects below
+        MemStorage storage(cvCreateMemStorage(0));
+        // Detect into a CvSeq, then copy each element's rect into faces.
+        CvSeq *objs = oclHaarDetectObjects(image, storage, scaleFactor, minNeighbors, flags, minSize);
+        vector<CvAvgComp> vecAvgComp;
+        Seq<CvAvgComp>(objs).copyTo(vecAvgComp);
+        faces.resize(vecAvgComp.size());
+        std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
    }
+
 };

-////////////////////////////////faceDetect/////////////////////////////////////////////////
-
-struct Haar : HaarTestBase {};
-
-TEST_F(Haar, FaceDetect)
-{
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
-
-    if(img.empty())
-    {
-        std::cout << imgName << std::endl;
-        return ;
-    }
-
-    //int i = 0;
-    double t = 0;
-    vector<Rect> faces, oclfaces;
-
-    // const static Scalar colors[] =  { CV_RGB(0, 0, 255),
-    //                                   CV_RGB(0, 128, 255),
-    //                                   CV_RGB(0, 255, 255),
-    //                                   CV_RGB(0, 255, 0),
-    //                                   CV_RGB(255, 128, 0),
-    //                                   CV_RGB(255, 255, 0),
-    //                                   CV_RGB(255, 0, 0),
-    //                                   CV_RGB(255, 0, 255)
-    //                                 } ;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    t = (double)cvGetTickCount();
-    for(int k = 0; k < LOOP_TIMES; k++)
-    {
-        cpucascade.detectMultiScale( smallImg, faces,  1.1,
-                                     3, 0
-                                     | CV_HAAR_SCALE_IMAGE
-                                     , Size(30, 30), Size(0, 0) );
-    }
-    t = (double)cvGetTickCount() - t ;
-    printf( "cpudetection time = %g ms\n", t / (LOOP_TIMES * (double)cvGetTickFrequency() * 1000.) );
-
-    cv::ocl::oclMat image;
-    CvSeq *_objects=NULL;
-    t = (double)cvGetTickCount();
-    for(int k = 0; k < LOOP_TIMES; k++)
-    {
-        image.upload(smallImg);
-        _objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
-                   3, 0
-                   | CV_HAAR_SCALE_IMAGE
-                   , Size(30, 30), Size(0, 0) );
-    }
-    t = (double)cvGetTickCount() - t ;
-    printf( "ocldetection time = %g ms\n", t / (LOOP_TIMES * (double)cvGetTickFrequency() * 1000.) );
-    vector<CvAvgComp> vecAvgComp;
-    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-    oclfaces.resize(vecAvgComp.size());
-    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
-
-    //for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
-    //{
-    //	Mat smallImgROI;
-    //	Point center;
-    //	Scalar color = colors[i%8];
-    //	int radius;
-    //	center.x = cvRound((r->x + r->width*0.5)*scale);
-    //	center.y = cvRound((r->y + r->height*0.5)*scale);
-    //	radius = cvRound((r->width + r->height)*0.25*scale);
-    //	circle( img, center, radius, color, 3, 8, 0 );
-    //}
-    //namedWindow("result");
-    //imshow("result",img);
-    //waitKey(0);
-    //destroyAllWindows();
-
 }
-#endif // HAVE_OPENCL
+}
+// Runs frontal-face Haar detection on basketball1.png with the CPU cascade and with
+// the OpenCL cascade, each wrapped in the matching CPU_*/WARMUP_*/GPU_* harness macros.
+// Throws runtime_error if the image or the cascade file cannot be loaded.
+TEST(Haar)
+{
+    Mat img = imread(abspath("basketball1.png"), CV_LOAD_IMAGE_GRAYSCALE);
+    if (img.empty())
+        throw runtime_error("can't open basketball1.png");
+
+    CascadeClassifier cpuCascade;
+    if (!cpuCascade.load(abspath("haarcascade_frontalface_alt.xml")))
+        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
+
+    // Same detection parameters for every run below.
+    const double detScale = 1.1;
+    const int detNeighbors = 2;
+    const int detFlags = CV_HAAR_SCALE_IMAGE;
+    const Size detMinSize(30, 30);
+
+    vector<Rect> detections;
+
+    SUBTEST << img.cols << "x" << img.rows << "; scale image";
+
+    CPU_ON;
+    cpuCascade.detectMultiScale(img, detections, detScale, detNeighbors, detFlags, detMinSize);
+    CPU_OFF;
+
+    // The OpenCL cascade loads the same cascade file.
+    ocl::CascadeClassifier_GPU oclCascade;
+    if (!oclCascade.load(abspath("haarcascade_frontalface_alt.xml")))
+        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
+
+    ocl::oclMat d_img(img);
+
+    // Start the device runs from an empty result vector.
+    detections.clear();
+
+    WARMUP_ON;
+    oclCascade.detectMultiScale(d_img, detections, detScale, detNeighbors, detFlags, detMinSize);
+    WARMUP_OFF;
+
+    detections.clear();
+
+    // The image is already in d_img for this section.
+    GPU_ON;
+    oclCascade.detectMultiScale(d_img, detections, detScale, detNeighbors, detFlags, detMinSize);
+    GPU_OFF;
+
+    // This section additionally re-uploads the host image before detecting.
+    GPU_FULL_ON;
+    d_img.upload(img);
+    oclCascade.detectMultiScale(d_img, detections, detScale, detNeighbors, detFlags, detMinSize);
+    GPU_FULL_OFF;
+}
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,125 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>

-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-extern std::string workdir;
-
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-    { \
-    public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-    private: \
-    type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-IMPLEMENT_PARAM_CLASS(WinSizw48, bool);
-
-PARAM_TEST_CASE(HOG, WinSizw48, bool)
+///////////// HOG////////////////////////
+TEST(HOG)
 {
-    bool is48;
-    vector<float> detector;
-    virtual void SetUp()
+    Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);
+
+    if (src.empty())
    {
-        is48 = GET_PARAM(0);
-        if(is48)
-        {
-            detector = cv::ocl::HOGDescriptor::getPeopleDetector48x96();
-        }
-        else
-        {
-            detector = cv::ocl::HOGDescriptor::getPeopleDetector64x128();
-        }
-    }
-};
-
-TEST_P(HOG, Performance)
-{
-    cv::Mat img = readImage(workdir + "lena.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    // define HOG related arguments
-    float scale = 1.05f;
-    //int nlevels = 13;
-    int gr_threshold = 8;
-    float hit_threshold = 1.4f;
-    //bool hit_threshold_auto = true;
-
-    int win_width = is48 ? 48 : 64;
-    int win_stride_width = 8;
-    int win_stride_height = 8;
-
-    bool gamma_corr = true;
-
-    Size win_size(win_width, win_width * 2); //(64, 128) or (48, 96)
-    Size win_stride(win_stride_width, win_stride_height);
-
-    cv::ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::ocl::HOGDescriptor::DEFAULT_NLEVELS);
-
-    gpu_hog.setSVMDetector(detector);
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        ocl::oclMat d_src(img);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-
-        vector<Rect> found;
-        gpu_hog.detectMultiScale(d_src, found, hit_threshold, win_stride,
-                                 Size(0, 0), scale, gr_threshold);
-
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        // no download time for HOG
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
+        throw runtime_error("can't open road.png");
    }

-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}

+    cv::HOGDescriptor hog;
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    std::vector<cv::Rect> found_locations;

-INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, HOG, testing::Combine(testing::Values(WinSizw48(false), WinSizw48(true)), testing::Values(false)));
+    SUBTEST << src.cols << 'x' << src.rows << "; road.png"; // label with the loaded image's actual size

-#endif  //Have opencl
+    hog.detectMultiScale(src, found_locations);
+    // The call above is outside the CPU_ON/CPU_OFF section; the one below is inside it.
+    CPU_ON;
+    hog.detectMultiScale(src, found_locations);
+    CPU_OFF;
+    // OpenCL descriptor with its own default people detector; src is uploaded once here.
+    cv::ocl::HOGDescriptor ocl_hog;
+    ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
+    ocl::oclMat d_src;
+    d_src.upload(src);
+
+    WARMUP_ON;
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    WARMUP_OFF;
+
+    GPU_ON;
+    ocl_hog.detectMultiScale(d_src, found_locations);
+     ;
+    GPU_OFF;
+    // The GPU_FULL section repeats the upload; results land in found_locations, so no download.
+    GPU_FULL_ON;
+    d_src.upload(src);
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    GPU_FULL_OFF;
+}
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@@ -42,191 +42,105 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;

-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-//////// Utility
-#ifndef DIFFERENT_SIZES
-#else
-#undef DIFFERENT_SIZES
-#endif
-#define DIFFERENT_SIZES testing::Values(cv::Size(256, 256), cv::Size(3000, 3000))
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-{ \
-public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-private: \
-    type val_; \
-}; \
-    inline void PrintTo( name param, std::ostream* os) \
-{ \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-}
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate
-#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
-
-IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
-
-const char *TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
-
-PARAM_TEST_CASE(MatchTemplate, cv::Size, TemplateSize, Channels, TemplateMethod)
+/////////// matchTemplate ////////////////////////
+//void InitMatchTemplate()
+//{
+//	Mat src; gen(src, 500, 500, CV_32F, 0, 1);
+//	Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
+//	ocl::oclMat d_src(src), d_templ(templ), d_dst;
+//	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+//}
+TEST(matchTemplate)
 {
-    cv::Size size;
-    cv::Size templ_size;
-    int cn;
-    int method;
-    //vector<cv::ocl::Info> oclinfo;
+    //InitMatchTemplate();

-    virtual void SetUp()
+    Mat src, templ, dst;
+    int templ_size = 5;
+
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        size = GET_PARAM(0);
-        templ_size = GET_PARAM(1);
-        cn = GET_PARAM(2);
-        method = GET_PARAM(3);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
+        // 32-bit float inputs (1 and 4 channels), matched with CV_TM_CCORR.
+        const int float_types[] = {CV_32FC1, CV_32FC4};
+        const std::string float_type_names[] = {"CV_32FC1", "CV_32FC4"};
+
+        for (size_t t = 0; t < sizeof(float_types) / sizeof(float_types[0]); ++t)
+        {
+            const int mat_type = float_types[t];
+            for (templ_size = 5; templ_size <= 5; templ_size *= 5)
+            {
+                gen(src, size, size, mat_type, 0, 1);
+                SUBTEST << src.cols << 'x' << src.rows << "; " << float_type_names[t] << "; templ " << templ_size << 'x' << templ_size << "; CCORR";
+                gen(templ, templ_size, templ_size, mat_type, 0, 1);
+
+                // One call outside the CPU_ON/CPU_OFF section, then the same call inside it.
+                matchTemplate(src, templ, dst, CV_TM_CCORR);
+                CPU_ON;
+                matchTemplate(src, templ, dst, CV_TM_CCORR);
+                CPU_OFF;
+
+                ocl::oclMat ocl_src(src);
+                ocl::oclMat ocl_templ, ocl_result;
+                ocl_templ.upload(templ);
+
+                WARMUP_ON;
+                ocl::matchTemplate(ocl_src, ocl_templ, ocl_result, CV_TM_CCORR);
+                WARMUP_OFF;
+
+                GPU_ON;
+                ocl::matchTemplate(ocl_src, ocl_templ, ocl_result, CV_TM_CCORR);
+                GPU_OFF;
+
+                // The GPU_FULL section repeats both uploads and adds the download of the result.
+                GPU_FULL_ON;
+                ocl_src.upload(src);
+                ocl_templ.upload(templ);
+                ocl::matchTemplate(ocl_src, ocl_templ, ocl_result, CV_TM_CCORR);
+                ocl_result.download(dst);
+                GPU_FULL_OFF;
+            }
+        }
+
+        int all_type_8U[] = {CV_8UC1};
+        std::string type_name_8U[] = {"CV_8UC1"};
+
+        for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
+        {
+            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
+            {
+                // Generate src before building the label: the label reads src.cols/rows and
+                // would otherwise describe the matrix left over from the previous section.
+                gen(src, size, size, all_type_8U[j], 0, 255);
+
+                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";
+
+                gen(templ, templ_size, templ_size, all_type_8U[j], 0, 255);
+                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
+
+                CPU_ON;
+                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
+                CPU_OFF;
+
+                ocl::oclMat d_src(src);
+                ocl::oclMat d_templ(templ), d_dst;
+
+                WARMUP_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                WARMUP_OFF;
+
+                GPU_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                GPU_OFF;
+
+                GPU_FULL_ON;
+                d_src.upload(src);
+                d_templ.upload(templ);
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
+        }
    }
-};
-struct MatchTemplate8U : MatchTemplate {};
-
-TEST_P(MatchTemplate8U, Performance)
-{
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
-
-
-
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
-        cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-}
-
-
-struct MatchTemplate32F : MatchTemplate {};
-TEST_P(MatchTemplate32F, Performance)
-{
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
-
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
-
-
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
-        cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-
-}
-
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
-                        testing::Combine(
-                            testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-                            testing::Values(Channels(1), Channels(4)/*, Channels(3)*/),
-                            ALL_TEMPLATE_METHODS
-                        )
-                       );
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
-                            testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-                            testing::Values(Channels(1), Channels(4) /*, Channels(3)*/),
-                            testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
-
-#endif //HAVE_OPENCL
+}
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,697 +42,140 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"

-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv::ocl;
-////////////////////////////////converto/////////////////////////////////////////////////
-PARAM_TEST_CASE(ConvertToTestBase, MatType, MatType)
+///////////// ConvertTo////////////////////////
+TEST(ConvertTo)
 {
-    int type;
-    int dst_type;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;

-    //src mat
-    cv::Mat mat;
-    cv::Mat dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        type     = GET_PARAM(0);
-        dst_type = GET_PARAM(1);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " to 32FC1";

-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            gen(src, size, size, all_type[j], 0, 256);
+            //gen(dst, size, size, all_type[j], 0, 256);
+
+            //d_dst.upload(dst);
+
+            src.convertTo(dst, CV_32FC1);
+
+            CPU_ON;
+            src.convertTo(dst, CV_32FC1);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            d_src.convertTo(d_dst, CV_32FC1);
+            WARMUP_OFF;
+
+            GPU_ON;
+            d_src.convertTo(d_dst, CV_32FC1);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_src.convertTo(d_dst, CV_32FC1);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }

-        mat = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
    }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            dstx   = 0;
-            dsty   = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat = mat_roi;
-    }
-};
-
-
-struct ConvertTo : ConvertToTestBase {};
-
-TEST_P(ConvertTo, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.convertTo(dst_roi, dst_type);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.convertTo(gdst, dst_type);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat = mat_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.convertTo(gdst, dst_type);
-    };
-#endif
-
 }
-
-
-///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(CopyToTestBase, MatType, bool)
+///////////// copyTo////////////////////////
+TEST(copyTo)
 {
-    int type;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;

-    cv::Mat mat;
-    cv::Mat mask;
-    cv::Mat dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        type = GET_PARAM(0);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;

-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            gen(src, size, size, all_type[j], 0, 256);
+            //gen(dst, size, size, all_type[j], 0, 256);

-        mat = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+            //d_dst.upload(dst);
+
+            src.copyTo(dst);
+
+            CPU_ON;
+            src.copyTo(dst);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            d_src.copyTo(d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            d_src.copyTo(d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_src.copyTo(d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }

-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
    }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            maskx   = 1;
-            masky   = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            dstx   = 0;
-            dsty   = 0;
-            maskx   = 0;
-            masky   = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat = mat_roi;
-        //gmask = mask_roi;
-    }
-};
-
-struct CopyTo : CopyToTestBase {};
-
-TEST_P(CopyTo, Without_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.copyTo(dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.copyTo(gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat = mat_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.copyTo(gdst);
-    };
-#endif
 }
-
-TEST_P(CopyTo, With_mask)
+///////////// setTo////////////////////////
+TEST(setTo)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, dst;
+    Scalar val(1, 2, 3, 4);
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;

-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.copyTo(dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);

-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            src.setTo(val);

-            gmat = mat_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.copyTo(gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            src.setTo(val);
+            CPU_OFF;

+            d_src.upload(src);
+
+            WARMUP_ON;
+            d_src.setTo(val);
+            WARMUP_OFF;
+
+            GPU_ON;
+            d_src.setTo(val);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_src.setTo(val);
+            GPU_FULL_OFF;
        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat = mat_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.copyTo(gdst, gmask);
-    };
-#endif
-}
-
-///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(SetToTestBase, MatType, bool)
-{
-    int type;
-    cv::Scalar val;
-
-    cv::Mat mat;
-    cv::Mat mask;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat mask_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gmat_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            maskx   = 1;
-            masky   = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            maskx   = 0;
-            masky   = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-
-        //gmat_whole = mat;
-        //gmat = gmat_whole(Rect(srcx,srcy,roicols,roirows));
-
-        //gmask = mask_roi;
-    }
-};
-
-struct SetTo : SetToTestBase {};
-
-TEST_P(SetTo, Without_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.setTo(val);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat_whole = mat;
-            gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.setTo(val);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gmat_whole.download(cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat_whole = mat;
-        gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.setTo(val);
-    };
-#endif
-}
-
-TEST_P(SetTo, With_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.setTo(val, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat_whole = mat;
-            gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.setTo(val, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gmat_whole.download(cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat_whole = mat;
-        gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.setTo(val, gmask);
-    };
-#endif
-}
-PARAM_TEST_CASE(DataTransfer, MatType, bool)
-{
-    int type;
-    cv::Mat mat;
-    cv::ocl::oclMat gmat_whole;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-        mat = randomMat(rng, size, type, 5, 16, false);
-    }
-};
-TEST_P(DataTransfer, perf)
-{
-    double totaluploadtick = 0;
-    double totaldownloadtick = 0;
-    double totaltick = 0;
-    double t0 = 0;
-    double t1 = 0;
-    cv::Mat cpu_dst;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-        t0 = (double)cvGetTickCount();
-        gmat_whole.upload(mat);//upload
-        t0 = (double)cvGetTickCount() - t0;
-
-        t1 = (double)cvGetTickCount();
-        gmat_whole.download(cpu_dst);//download
-        t1 = (double)cvGetTickCount() - t1;
-
-        if(j == 0)
-            continue;
-        totaluploadtick = t0 + totaluploadtick;
-        totaldownloadtick = t1 + totaldownloadtick;
-    }
-    totaltick = totaluploadtick + totaldownloadtick;
-    cout << "average upload time is  " << totaluploadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average download time is  " << totaldownloadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average data transfer time is  " << totaltick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-//**********test************
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)));
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-INSTANTIATE_TEST_CASE_P(MatrixOperation, DataTransfer, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-#endif
+}
--- a/modules/ocl/perf/perf_norm.cpp
+++ b/modules/ocl/perf/perf_norm.cpp
@@ -0,0 +1,84 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+// norm: times NORM_INF of the difference of two CV_8UC1 matrices, CPU norm() vs. ocl::norm().
+TEST(norm)
+{
+    Mat src, buf;              // host-side operands
+    ocl::oclMat d_src, d_buf;  // device-side counterparts, filled via upload() below
+
+    // Square size x size inputs; size is multiplied by Multiple until it exceeds Max_Size.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
+
+        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+
+        norm(src, buf, NORM_INF);  // run once outside the CPU_ON/CPU_OFF bracket (presumably a warm-up)
+
+        CPU_ON;
+        norm(src, buf, NORM_INF);  // two-operand form, so the CPU measures the same operation as ocl::norm below
+        CPU_OFF;
+
+        d_src.upload(src);
+        d_buf.upload(buf);
+
+        WARMUP_ON;
+        ocl::norm(d_src, d_buf, NORM_INF);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::norm(d_src, d_buf, NORM_INF);
+        GPU_OFF;
+
+        GPU_FULL_ON;  // NOTE(review): presumably the "including transfers" measurement, so both operands are uploaded
+        d_src.upload(src);
+        d_buf.upload(buf);
+        ocl::norm(d_src, d_buf, NORM_INF);
+        GPU_FULL_OFF;
+    }
+}
--- a/modules/ocl/perf/perf_pyrdown.cpp
+++ b/modules/ocl/perf/perf_pyrdown.cpp
@@ -1,4 +1,4 @@
-///////////////////////////////////////////////////////////////////////////////////////
+/*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    fangfang bai, fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,96 +42,46 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>

-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(PyrDown, MatType, int)
+///////////// pyrDown //////////////////////
+TEST(pyrDown)
 {
-    int type;
-    int channels;
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-
-
-};
-
-#define VARNAME(A) string(#A);
-
-////////////////////////////////PyrDown/////////////////////////////////////////////////
-TEST_P(PyrDown, Mat)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::RNG &rng = TS::ptr()->get_rng();
-    mat1 = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-
-
-    cv::ocl::oclMat gdst;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat gmat1(mat1);
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::pyrDown(gmat1, gdst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        gdst.download(cpu_dst);
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if (j == 0)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
        {
-            continue;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], 0, 256);
+
+            pyrDown(src, dst);
+
+            CPU_ON;
+            pyrDown(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;
+
+            WARMUP_ON;
+            ocl::pyrDown(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::pyrDown(d_src, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::pyrDown(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
        }
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-}
-
-//********test****************
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-
-
-#endif // HAVE_OPENCL
+}
--- a/modules/ocl/perf/perf_pyrlk.cpp
+++ b/modules/ocl/perf/perf_pyrlk.cpp
@@ -0,0 +1,143 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+///////////// PyrLKOpticalFlow: times sparse pyramidal LK, calcOpticalFlowPyrLK vs. ocl::PyrLKOpticalFlow /////////////
+TEST(PyrLKOpticalFlow)
+{
+    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};  // first frame of each image pair
+    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};  // second frame, paired with images1 by index
+
+    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
+    {
+        Mat frame0 = imread(abspath(images1[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);  // pair 0 is read in color, the other pair in grayscale
+
+        if (frame0.empty())
+        {
+            std::string errstr = "can't open " + images1[i];
+            throw runtime_error(errstr);
+        }
+
+        Mat frame1 = imread(abspath(images2[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
+
+        if (frame1.empty())
+        {
+            std::string errstr = "can't open " + images2[i];
+            throw runtime_error(errstr);
+        }
+
+        Mat gray_frame;  // only filled for the color pair (i == 0)
+
+        if (i == 0)
+        {
+            cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
+        }
+
+        for (int points = Min_Size; points <= Max_Size; points *= Multiple)  // Min_Size/Max_Size/Multiple are defined outside this chunk
+        {
+            if (i == 0)
+                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
+            else
+                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
+            Mat nextPts_cpu;
+            Mat status_cpu;
+
+            vector<Point2f> pts;
+            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);  // corners are detected on the gray version of frame0; at most `points` of them
+
+            vector<Point2f> nextPts;
+            vector<unsigned char> status;
+
+            vector<float> err;
+
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);  // call outside CPU_ON/CPU_OFF; presumably an untimed warm-up
+
+            CPU_ON;  // CPU_ON/CPU_OFF are macros defined outside this chunk; presumably they bracket the CPU-timed region
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+            CPU_OFF;
+
+            ocl::PyrLKOpticalFlow d_pyrLK;
+
+            ocl::oclMat d_frame0(frame0);
+            ocl::oclMat d_frame1(frame1);
+
+            ocl::oclMat d_pts;
+            Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]);  // 1 x N header over pts' storage; NOTE(review): &pts[0] assumes pts is non-empty
+            d_pts.upload(pts_mat);
+
+            ocl::oclMat d_nextPts;
+            ocl::oclMat d_status;
+            ocl::oclMat d_err;
+
+            WARMUP_ON;  // macro defined outside this chunk; presumably marks an untimed first device run
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+            WARMUP_OFF;
+
+            GPU_ON;  // macro defined outside this chunk; this region contains only the sparse() call, no transfers
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+             ;  // empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;  // macro defined outside this chunk; this region also contains the uploads and downloads below
+            d_frame0.upload(frame0);
+            d_frame1.upload(frame1);
+            d_pts.upload(pts_mat);
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+
+            if (!d_nextPts.empty())
+            {
+                d_nextPts.download(nextPts_cpu);
+            }
+
+            if (!d_status.empty())
+            {
+                d_status.download(status_cpu);
+            }
+
+            GPU_FULL_OFF;
+        }
+
+    }
+}
--- a/modules/ocl/perf/perf_pyrup.cpp
+++ b/modules/ocl/perf/perf_pyrup.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    fangfang bai fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,80 +42,46 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;

-
-PARAM_TEST_CASE(PyrUp, MatType, int)
+///////////// pyrUp: times cv::pyrUp vs. ocl::pyrUp on square 8-bit images /////////////
+TEST(pyrUp)
 {
-    int type;
-    int channels;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};  // matrix types benchmarked
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};  // labels for all_type, matched by index

-    virtual void SetUp()
+    for (int size = 500; size <= 2000; size *= 2)  // square sizes 500, 1000, 2000
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
-
-TEST_P(PyrUp, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat src = randomMat(size, CV_MAKETYPE(type, channels));
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat srcMat = cv::ocl::oclMat(src);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::pyrUp(srcMat, dst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        dst.download(cpu_dst); //download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if (j == 0)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            continue;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], 0, 256);  // gen() is defined outside this chunk; presumably fills src with values in [0, 256)
+
+            pyrUp(src, dst);  // call outside CPU_ON/CPU_OFF; presumably an untimed warm-up
+
+            CPU_ON;  // CPU_ON/CPU_OFF are macros defined outside this chunk; presumably they bracket the CPU-timed region
+            pyrUp(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;
+
+            WARMUP_ON;  // macro defined outside this chunk; presumably marks an untimed first device run
+            ocl::pyrUp(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;  // macro defined outside this chunk; this region contains only the ocl::pyrUp call
+            ocl::pyrUp(d_src, d_dst);
+             ;  // empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;  // macro defined outside this chunk; this region also contains the upload and download
+            d_src.upload(src);
+            ocl::pyrUp(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
     }
-
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-
-#endif // HAVE_OPENCL
+}
--- a/modules/ocl/perf/perf_split_merge.cpp
+++ b/modules/ocl/perf/perf_split_merge.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,446 +42,109 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"

-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv::ocl;
-PARAM_TEST_CASE(MergeTestBase, MatType, int)
+///////////// Merge: times cv::merge vs. ocl::merge on four single-channel planes /////////////
+TEST(Merge)
 {
-    int type;
-    int channels;
+    Mat dst;
+    ocl::oclMat d_dst;

-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mat3;
-    cv::Mat mat4;
+    int channels = 4;  // number of single-channel planes merged per run
+    int all_type[] = {CV_8UC1, CV_32FC1};  // plane types benchmarked
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};  // labels for all_type, matched by index

-    //dst mat
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int src3x;
-    int src3y;
-    int src4x;
-    int src4y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mat3_roi;
-    cv::Mat mat4_roi;
-
-    //dst mat with roi
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gmat3;
-    cv::ocl::oclMat gmat4;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)  // Min_Size/Max_Size/Multiple are defined outside this chunk
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+            Size size1 = Size(size, size);
+            std::vector<Mat> src(channels);

-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            for (int i = 0; i < channels; ++i)
+            {
+                src[i] = Mat(size1, all_type[j], cv::Scalar::all(i));  // plane i is filled with the constant i
+            }
+
+            merge(src, dst);  // call outside CPU_ON/CPU_OFF; presumably an untimed warm-up
+
+            CPU_ON;  // CPU_ON/CPU_OFF are macros defined outside this chunk; presumably they bracket the CPU-timed region
+            merge(src, dst);
+            CPU_OFF;
+
+            std::vector<ocl::oclMat> d_src(channels);
+
+            for (int i = 0; i < channels; ++i)
+            {
+                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
+            }
+
+            WARMUP_ON;  // macro defined outside this chunk; presumably marks an untimed first device run
+            ocl::merge(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;  // macro defined outside this chunk; this region contains only the ocl::merge call
+            ocl::merge(d_src, d_dst);
+             ;  // empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;  // macro defined outside this chunk; this region also rebuilds the inputs and downloads the result
+
+            for (int i = 0; i < channels; ++i)
+            {
+                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));  // use the subtest's type (was hard-coded CV_8U, which made the CV_32FC1 run time 8-bit data)
+            }
+
+            ocl::merge(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }

-        mat1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
     }
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1; //start
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            src2x   = 1;
-            src2y   = 1;
-            src3x   = 1;
-            src3y   = 1;
-            src4x   = 1;
-            src4y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x   = 0;
-            src1y   = 0;
-            src2x   = 0;
-            src2y   = 0;
-            src3x   = 0;
-            src3y   = 0;
-            src4x   = 0;
-            src4y   = 0;
-            dstx    = 0;
-            dsty    = 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mat3_roi = mat3(Rect(src3x, src3y, roicols, roirows));
-        mat4_roi = mat4(Rect(src4x, src4y, roicols, roirows));
-
-
-        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
-    }
-
-};
-
-struct Merge : MergeTestBase {};
-
-TEST_P(Merge, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            std::vector<cv::Mat> dev_src;
-            dev_src.push_back(mat1_roi);
-            dev_src.push_back(mat2_roi);
-            dev_src.push_back(mat3_roi);
-            dev_src.push_back(mat4_roi);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::merge(dev_src, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1	]
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmat3 = mat3_roi;
-            gmat4 = mat4_roi;
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            std::vector<cv::ocl::oclMat> dev_gsrc;
-            dev_gsrc.push_back(gmat1);
-            dev_gsrc.push_back(gmat2);
-            dev_gsrc.push_back(gmat3);
-            dev_gsrc.push_back(gmat4);
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::merge(dev_gsrc, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmat3 = mat3_roi;
-        gmat4 = mat4_roi;
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        std::vector<cv::ocl::oclMat> dev_gsrc;
-        dev_gsrc.push_back(gmat1);
-        dev_gsrc.push_back(gmat2);
-        dev_gsrc.push_back(gmat3);
-        dev_gsrc.push_back(gmat4);
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::merge(dev_gsrc, gdst);
-    };
-#endif
 }

-
-PARAM_TEST_CASE(SplitTestBase, MatType, int)
+///////////// Split: times cv::split vs. ocl::split on 4-channel matrices /////////////
+TEST(Split)
 {
-    int type;
-    int channels;
+    //int channels = 4;
+    int all_type[] = {CV_8UC1, CV_32FC1};  // base types; each is widened to 4 channels via CV_MAKE_TYPE below
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};  // labels for all_type, matched by index

-    //src mat
-    cv::Mat mat;
-
-    //dstmat
-    cv::Mat dst1;
-    cv::Mat dst2;
-    cv::Mat dst3;
-    cv::Mat dst4;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dst1x;
-    int dst1y;
-    int dst2x;
-    int dst2y;
-    int dst3x;
-    int dst3y;
-    int dst4x;
-    int dst4y;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-
-    //dst mat with roi
-    cv::Mat dst1_roi;
-    cv::Mat dst2_roi;
-    cv::Mat dst3_roi;
-    cv::Mat dst4_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst1_whole;
-    cv::ocl::oclMat gdst2_whole;
-    cv::ocl::oclMat gdst3_whole;
-    cv::ocl::oclMat gdst4_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst1;
-    cv::ocl::oclMat gdst2;
-    cv::ocl::oclMat gdst3;
-    cv::ocl::oclMat gdst4;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)  // Min_Size/Max_Size/Multiple are defined outside this chunk
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
+            Size size1 = Size(size, size);

-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));  // 4-channel source with constant per-channel values 1..4
+
+            std::vector<cv::Mat> dst;
+
+            split(src, dst);  // call outside CPU_ON/CPU_OFF; presumably an untimed warm-up
+
+            CPU_ON;  // CPU_ON/CPU_OFF are macros defined outside this chunk; presumably they bracket the CPU-timed region
+            split(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
+            std::vector<cv::ocl::oclMat> d_dst;
+
+            WARMUP_ON;  // macro defined outside this chunk; presumably marks an untimed first device run
+            ocl::split(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;  // macro defined outside this chunk; this region contains only the ocl::split call
+            ocl::split(d_src, d_dst);
+             ;  // empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;  // NOTE(review): this region uploads src but never downloads d_dst, unlike the other tests in this file - confirm intended
+            d_src.upload(src);
+            ocl::split(d_src, d_dst);
+            GPU_FULL_OFF;
+        }

-        mat  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-        dst1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
     }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcx   = 1;
-            dst1x    = 1;
-            dst1y    = 1;
-            dst2x    = 1;
-            dst2y    = 1;
-            dst3x    = 1;
-            dst3y    = 1;
-            dst4x    = 1;
-            dst4y    = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx = 0;
-            srcy = 0;
-            dst1x = 0;
-            dst1y = 0;
-            dst2x    = 0;
-            dst2y    = 0;
-            dst3x    = 0;
-            dst3y    = 0;
-            dst4x    = 0;
-            dst4y    = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-
-        dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
-        dst2_roi = dst2(Rect(dst2x, dst2y, roicols, roirows));
-        dst3_roi = dst3(Rect(dst3x, dst3y, roicols, roirows));
-        dst4_roi = dst4(Rect(dst4x, dst4y, roicols, roirows));
-    }
-
-};
-
-struct Split : SplitTestBase {};
-
-TEST_P(Split, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            cv::Mat         dev_dst[4]  = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
-            cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::split(mat_roi, dev_dst);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
-
-            gdst2_whole = dst2;
-            gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows));
-
-            gdst3_whole = dst3;
-            gdst3 = gdst3_whole(Rect(dst3x, dst3y, roicols, roirows));
-
-            gdst4_whole = dst4;
-            gdst4 = gdst4_whole(Rect(dst4x, dst4y, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::split(gmat, dev_gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst1;
-            cv::Mat cpu_dst2;
-            cv::Mat cpu_dst3;
-            cv::Mat cpu_dst4;
-            gdst1_whole.download(cpu_dst1);
-            gdst2_whole.download(cpu_dst2);
-            gdst3_whole.download(cpu_dst3);
-            gdst4_whole.download(cpu_dst4);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        //cv::Mat         dev_dst[4]  = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
-        cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
-
-        gdst2_whole = dst2;
-        gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows));
-
-        gdst3_whole = dst3;
-        gdst3 = gdst3_whole(Rect(dst3x, dst3y, roicols, roirows));
-
-        gdst4_whole = dst4;
-        gdst4 = gdst4_whole(Rect(dst4x, dst4y, roicols, roirows));
-        gmat = mat_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::split(gmat, dev_gdst);
-    };
-#endif
 }
-
-//*************test*****************
-INSTANTIATE_TEST_CASE_P(SplitMerge, Merge, Combine(
-                            Values(CV_8UC4, CV_32FC4), Values(1, 4)));
-
-INSTANTIATE_TEST_CASE_P(SplitMerge, Split , Combine(
-                            Values(CV_8U, CV_32S, CV_32F), Values(1, 4)));
-
-#endif // HAVE_OPENCL
--- a/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other oclMaterials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -41,4 +42,321 @@

 #include "precomp.hpp"

+// This program tests most of the functions in the ocl module and generates x-factor metrics data in .csv files.
+// All images needed in this test are in the samples/gpu folder.
+// For the haar template, haarcascade_frontalface_alt.xml should be in the working directory.
+void TestSystem::run()
+{
+    // Entry point: lists the tests (list mode) or runs the filtered initializers and tests and reports results.
+    typedef vector<Runnable *>::iterator RunnableIt;
+
+    // List mode: print the registered test names and do nothing else.
+    if (is_list_mode_)
+    {
+        for (RunnableIt test = tests_.begin(); test != tests_.end(); ++test)
+            cout << (*test)->name() << endl;
+
+        return;
+    }
+
+    // Run every initializer whose name contains the filter string.
+    for (RunnableIt init = inits_.begin(); init != inits_.end(); ++init)
+    {
+        const bool selected = (*init)->name().find(test_filter_, 0) != string::npos;
+        if (selected)
+            (*init)->run();
+    }
+
+    printHeading();
+    writeHeading();
+
+    // Run every test whose name contains the filter string.
+    for (RunnableIt test = tests_.begin(); test != tests_.end(); ++test)
+    {
+        try
+        {
+            const string &test_name = (*test)->name();
+
+            if (test_name.find(test_filter_, 0) == string::npos)
+                continue;
+            cout << endl << test_name << ":\n";
+            setCurrentTest(test_name);
+
+            (*test)->run();
+            finishCurrentSubtest();   // finalize whatever subtest is still open
+        }
+        catch (const Exception &)
+        {
+            // The message is printed by the error callback.
+            resetCurrentSubtest();
+        }
+        catch (const runtime_error &error)
+        {
+            printError(error.what());
+            resetCurrentSubtest();
+        }
+    }
+
+    printSummary();
+    writeSummary();
+}
+
+
+void TestSystem::finishCurrentSubtest()
+{
+    // Finalize the current subtest: compute its metrics, report them, then reset the per-subtest state.
+    if (cur_subtest_is_empty_)
+    {
+        return;   // nothing was measured, so there are no statistics to print
+    }
+
+    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;   // ticks -> milliseconds
+    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
+    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
+    // Speedups are ratios of the accumulated tick counts; the divisor is clamped to at least one tick.
+    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
+    speedup_total_ += speedup;
+
+    double fullspeedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_full_elapsed_);
+    speedup_full_total_ += fullspeedup;
+    // Classify the GPU speedup against the top_/bottom_ thresholds.
+    if (speedup > top_)
+    {
+        speedup_faster_count_++;
+    }
+    else if (speedup < bottom_)
+    {
+        speedup_slower_count_++;
+    }
+    else
+    {
+        speedup_equal_count_++;
+    }
+    // Same classification for the GPUTOTAL (gpu_full) speedup.
+    if (fullspeedup > top_)
+    {
+        speedup_full_faster_count_++;
+    }
+    else if (fullspeedup < bottom_)
+    {
+        speedup_full_slower_count_++;
+    }
+    else
+    {
+        speedup_full_equal_count_++;
+    }
+
+    // Min/max GPU sample in ms; reported as 0 when no GPU sample exists (front()/back() on an empty vector is UB).
+    std::sort(gpu_times_.begin(), gpu_times_.end());
+    double gpu_min = gpu_times_.empty() ? 0.0 : gpu_times_.front() / getTickFrequency() * 1000.0;
+    double gpu_max = gpu_times_.empty() ? 0.0 : gpu_times_.back() / getTickFrequency() * 1000.0;
+    double deviation = 0;
+    // Root-mean-square distance of the GPU samples from gpu_elapsed_, in ms (needs at least two samples).
+    if (gpu_times_.size() > 1)
+    {
+        double sum = 0;
+
+        for (size_t i = 0; i < gpu_times_.size(); i++)
+        {
+            int64 diff = gpu_times_[i] - static_cast<int64>(gpu_elapsed_);
+            double diff_time = diff * 1000 / getTickFrequency();
+            sum += diff_time * diff_time;
+        }
+
+        deviation = std::sqrt(sum / gpu_times_.size());
+    }
+
+    printMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
+    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
+
+    num_subtests_called_++;
+    resetCurrentSubtest();
+}
+
+
+double TestSystem::meanTime(const vector<int64> &samples)
+{
+    // Arithmetic mean of the tick samples; an empty sample set yields 0 instead of 0/0 (NaN).
+    return samples.empty() ? 0.0 : accumulate(samples.begin(), samples.end(), 0.) / samples.size();
+}
+
+
+void TestSystem::printHeading()
+{
+    // Console table header: left-aligned column titles using the same widths as the rows of printMetrics().
+    cout << endl << setiosflags(ios_base::left);
+    cout << TAB;
+    cout << setw(10) << "CPU, ms" << setw(10) << "GPU, ms" << setw(14) << "SPEEDUP";
+    cout << setw(14) << "GPUTOTAL, ms" << setw(14) << "TOTALSPEEDUP";
+    cout << "DESCRIPTION\n";
+    cout << resetiosflags(ios_base::left);
+}
+
+void TestSystem::writeHeading()
+{
+    // Opens "<recordname_>_OCL.csv" if no record file is open yet, then writes the CSV column header line.
+    if (!record_)
+    {
+        recordname_ += "_OCL.csv";
+        record_ = fopen(recordname_.c_str(), "w");
+    }
+    if (!record_) return;   // the file could not be opened: skip recording rather than pass NULL to fprintf
+    fprintf(record_, "NAME,DESCRIPTION,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
+    fflush(record_);
+}
+
+void TestSystem::printSummary()
+{
+    // Prints the overall results to the console: the average speedup and the number / percentage of
+    // subtests counted as exceeded, passed or failed, for the GPU and then the GPUTOTAL timings.
+    const int subtests = std::max(1, num_subtests_called_);   // at least 1, so the divisions are safe
+    const float faster_rate = static_cast<float>(speedup_faster_count_) / subtests * 100;
+    const float equal_rate = static_cast<float>(speedup_equal_count_) / subtests * 100;
+    const float slower_rate = static_cast<float>(speedup_slower_count_) / subtests * 100;
+    const float full_faster_rate = static_cast<float>(speedup_full_faster_count_) / subtests * 100;
+    const float full_equal_rate = static_cast<float>(speedup_full_equal_count_) / subtests * 100;
+    const float full_slower_rate = static_cast<float>(speedup_full_slower_count_) / subtests * 100;
+
+    // The precision manipulator is sticky, so setting it once covers every number below.
+    cout << setiosflags(ios_base::fixed) << setprecision(3);
+
+    // Results based on the GPU timings.
+    cout << "\naverage GPU speedup: x" << speedup_total_ / subtests << endl;
+    cout << "\nGPU exceeded: " << speedup_faster_count_
+         << "\nGPU passed: " << speedup_equal_count_
+         << "\nGPU failed: " << speedup_slower_count_
+         << endl;
+    cout << "\nGPU exceeded rate: "
+         << faster_rate << "%"
+         << "\nGPU passed rate: "
+         << equal_rate << "%"
+         << "\nGPU failed rate: "
+         << slower_rate << "%"
+         << endl;
+
+    // Results based on the GPUTOTAL timings.
+    cout << "\naverage GPUTOTAL speedup: x" << speedup_full_total_ / subtests << endl;
+    cout << "\nGPUTOTAL exceeded: " << speedup_full_faster_count_
+         << "\nGPUTOTAL passed: " << speedup_full_equal_count_
+         << "\nGPUTOTAL failed: " << speedup_full_slower_count_
+         << endl;
+    cout << "\nGPUTOTAL exceeded rate: "
+         << full_faster_rate << "%"
+         << "\nGPUTOTAL passed rate: "
+         << full_equal_rate << "%"
+         << "\nGPUTOTAL failed rate: "
+         << full_slower_rate << "%"
+         << endl;
+
+    cout << resetiosflags(ios_base::fixed);
+}
+
+
+void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
+{
+    cout << TAB << setiosflags(ios_base::left);
+    stringstream stream;   // reused for every column; each value is formatted here, then padded by setw on cout
+    // Prints one left-aligned console row for the current subtest: five metric columns, then its description.
+    stream << cpu_time;
+    cout << setw(10) << stream.str();
+
+    stream.str("");
+    stream << gpu_time;
+    cout << setw(10) << stream.str();
+
+    stream.str("");
+    stream << "x" << setprecision(3) << speedup;   // setprecision stays in effect on `stream` from here on
+    cout << setw(14) << stream.str();
+
+    stream.str("");
+    stream << gpu_full_time;   // NOTE(review): printed with precision 3 (set above), unlike cpu_time/gpu_time - confirm intended
+    cout << setw(14) << stream.str();
+
+    stream.str("");
+    stream << "x" << setprecision(3) << fullspeedup;
+    cout << setw(14) << stream.str();
+
+    cout << cur_subtest_description_.str();
+    cout << resetiosflags(ios_base::left) << endl;
+}
+
+void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
+{
+    // Appends one CSV row for the finished subtest. The NAME column carries the test name only on
+    // the first row written after setCurrentTest(); later rows of the same test leave it empty.
+    if (!record_)
+    {
+        recordname_ += ".csv";
+        record_ = fopen(recordname_.c_str(), "w");
+    }
+    if (!record_) return;   // the file could not be opened: skip recording rather than pass NULL to fprintf
+
+    fprintf(record_, "%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
+            cur_subtest_description_.str().c_str(),
+            cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
+            gpu_min, gpu_max, std_dev);
+
+    itname_changed_ = false;   // the name has been written; suppress it until the next setCurrentTest()
+
+    fflush(record_);
+}
+
+void TestSystem::writeSummary()
+{
+    // Appends the summary (average speedups, exceeded/passed/failed counts and percentages) to the record file.
+    if (!record_)
+    {
+        recordname_ += ".csv";
+        record_ = fopen(recordname_.c_str(), "w");
+    }
+    if (!record_) return;   // the file could not be opened: skip recording rather than pass NULL to fprintf
+    fprintf(record_, "\nAverage GPU speedup: %.3f\n"
+            "exceeded: %d (%.3f%%)\n"
+            "passed: %d (%.3f%%)\n"
+            "failed: %d (%.3f%%)\n"
+            "\nAverage GPUTOTAL speedup: %.3f\n"
+            "exceeded: %d (%.3f%%)\n"
+            "passed: %d (%.3f%%)\n"
+            "failed: %d (%.3f%%)\n",
+            speedup_total_ / std::max(1, num_subtests_called_),
+            speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_total_ / std::max(1, num_subtests_called_),
+            speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100);
+    fflush(record_);
+}
+
+void TestSystem::printError(const std::string &msg)
+{
+    // A message equal to "CL_INVALID_BUFFER_SIZE" is suppressed; any other message is printed
+    // followed by the description of the current subtest.
+    if (msg == "CL_INVALID_BUFFER_SIZE") return;
+    cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
+}
+
+void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
+{
+    // Allocate mat as rows x cols of `type`, then fill it uniformly (bounds low/high) from an RNG seeded with 0.
+    mat.create(rows, cols, type);
+    RNG(0).fill(mat, RNG::UNIFORM, low, high);
+}
+
+
+string abspath(const string &relpath)
+{
+    return string(TestSystem::instance().workingDir()).append(relpath);   // working dir + relpath, no separator added
+}
+
+
+int CV_CDECL cvErrorCallback(int /*status*/, const char * /*func_name*/, const char *err_msg,
+                             const char * /*file_name*/, int /*line*/, void * /*userdata*/)
+{
+    // Only the message text is used: it is forwarded to the test system, then 0 is returned.
+    TestSystem::instance().printError(err_msg);
+    return 0;
+}
+

--- a/modules/ocl/perf/precomp.hpp
+++ b/modules/ocl/perf/precomp.hpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -39,38 +40,354 @@
 //
 //M*/

-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  if defined __clang__ || defined __APPLE__
-#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
-#    pragma GCC diagnostic ignored "-Wextra"
-#  endif
-#endif
-
-#ifndef __OPENCV_TEST_PRECOMP_HPP__
-#define __OPENCV_TEST_PRECOMP_HPP__
-
-#include <cmath>
-#include <cstdio>
+#include <iomanip>
+#include <stdexcept>
+#include <string>
 #include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-#include <limits>
-#include <algorithm>
-#include <iterator>
-#include <string>
-#include <cstdarg>
-#include "opencv2/highgui.hpp"
+#include <cstdio>
+#include <vector>
+#include <numeric>
+#include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
 #include "opencv2/video.hpp"
-#include "opencv2/ts.hpp"
+#include "opencv2/objdetect.hpp"
+#include "opencv2/features2d.hpp"
 #include "opencv2/ocl.hpp"

-#include "utility.hpp"
-#include "interpolation.hpp"
+#define Min_Size 1000
+#define Max_Size 4000
+#define Multiple 2
+#define TAB "    "

-#include "opencv2/core/private.hpp"
+using namespace std;
+using namespace cv;

-#endif
+void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);   // random rows x cols matrix of `type`, values bounded by low/high
+string abspath(const string &relpath);   // TestSystem working directory + relpath
+int CV_CDECL cvErrorCallback(int, const char *, const char *, const char *, int, void *);   // passes the error message to TestSystem::printError
+typedef struct   // pair of short coordinates, returned by do_meanShift
+{
+    short x;
+    short y;
+} COOR;
+COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep,   // NOTE(review): defined outside this header; presumably a CPU mean-shift reference - confirm
+                  cv::Size size, int sp, int sr, int maxIter, float eps, int *tab);
+void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi,   // NOTE(review): defined outside this header - confirm semantics
+                    int sp, int sr, cv::TermCriteria crit);

+class Runnable
+{
+public:
+    // runname becomes the value reported by name().
+    explicit Runnable(const std::string &runname): name_(runname) {}
+    virtual ~Runnable() {}
+
+    // Name given at construction.
+    const std::string &name() const { return name_; }
+
+    // The work to perform; supplied by each derived class.
+    virtual void run() = 0;
+
+private:
+    std::string name_;
+};
+
+class TestSystem   // Singleton that registers, runs and times the perf tests and reports CPU vs. GPU results.
+{
+public:
+    static TestSystem &instance()
+    {
+        static TestSystem me;   // the single instance, created on first use
+        return me;
+    }
+
+    void setWorkingDir(const std::string &val)
+    {
+        working_dir_ = val;
+    }
+    const std::string &workingDir() const
+    {
+        return working_dir_;
+    }
+
+    void setTestFilter(const std::string &val)   // run() only executes items whose name contains this string
+    {
+        test_filter_ = val;
+    }
+    const std::string &testFilter() const
+    {
+        return test_filter_;
+    }
+
+    void setNumIters(int num_iters)   // iteration limit checked by stop()
+    {
+        num_iters_ = num_iters;
+    }
+    void setGPUWarmupIters(int num_iters)   // iteration limit checked by warmupStop()
+    {
+        gpu_warmup_iters_ = num_iters;
+    }
+    void setCPUIters(int num_iters)   // iteration limit checked by cpu_stop()
+    {
+        cpu_num_iters_ = num_iters;
+    }
+
+    void setTopThreshold(double top)   // a speedup above this value is counted as "exceeded"
+    {
+        top_ = top;
+    }
+    void setBottomThreshold(double bottom)   // a speedup below this value is counted as "failed"
+    {
+        bottom_ = bottom;
+    }
+
+    void addInit(Runnable *init)   // stores the pointer only; the object is not copied
+    {
+        inits_.push_back(init);
+    }
+    void addTest(Runnable *test)   // stores the pointer only; the object is not copied
+    {
+        tests_.push_back(test);
+    }
+    void run();
+
+    // It's public because OpenCV callback uses it
+    void printError(const std::string &msg);
+
+    std::stringstream &startNewSubtest()   // reports the previous subtest, then returns the description stream
+    {
+        finishCurrentSubtest();
+        return cur_subtest_description_;
+    }
+
+    bool stop() const
+    {
+        return cur_iter_idx_ >= num_iters_;
+    }
+
+    bool cpu_stop() const
+    {
+        return cur_iter_idx_ >= cpu_num_iters_;
+    }
+
+    bool warmupStop()   // not const: every call also advances the warm-up counter
+    {
+        return cur_warmup_idx_++ >= gpu_warmup_iters_;
+    }
+
+    void warmupComplete()
+    {
+        cur_warmup_idx_ = 0;
+    }
+
+    void cpuOn()   // start of one timed CPU iteration
+    {
+        cpu_started_ = cv::getTickCount();
+    }
+    void cpuOff()   // end of one timed CPU iteration: store the tick delta and count the iteration
+    {
+        int64 delta = cv::getTickCount() - cpu_started_;
+        cpu_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void cpuComplete()   // fold the mean of the samples into the subtest's CPU time
+    {
+        cpu_elapsed_ += meanTime(cpu_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    void gpuOn()   // start of one timed GPU iteration
+    {
+        gpu_started_ = cv::getTickCount();
+    }
+    void gpuOff()   // end of one timed GPU iteration: store the tick delta and count the iteration
+    {
+        int64 delta = cv::getTickCount() - gpu_started_;
+        gpu_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void gpuComplete()   // fold the mean of the samples into the subtest's GPU time
+    {
+        gpu_elapsed_ += meanTime(gpu_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    void gpufullOn()   // start of one timed GPUTOTAL iteration
+    {
+        gpu_full_started_ = cv::getTickCount();
+    }
+    void gpufullOff()   // end of one timed GPUTOTAL iteration: store the tick delta and count the iteration
+    {
+        int64 delta = cv::getTickCount() - gpu_full_started_;
+        gpu_full_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void gpufullComplete()   // fold the mean of the samples into the subtest's GPUTOTAL time
+    {
+        gpu_full_elapsed_ += meanTime(gpu_full_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    bool isListMode() const
+    {
+        return is_list_mode_;
+    }
+    void setListMode(bool value)   // in list mode run() only prints the test names
+    {
+        is_list_mode_ = value;
+    }
+
+    void setRecordName(const std::string &name)   // base name of the record file; a suffix is appended when it is opened
+    {
+        recordname_ = name;
+    }
+
+    void setCurrentTest(const std::string &name)
+    {
+        itname_ = name;
+        itname_changed_ = true;   // makes the next CSV row carry the test name
+    }
+
+private:
+    TestSystem():   // every scalar member is initialized explicitly, in declaration order
+        cur_subtest_is_empty_(true), cpu_started_(0), gpu_started_(0), gpu_full_started_(0),
+        cpu_elapsed_(0), gpu_elapsed_(0), gpu_full_elapsed_(0),
+        speedup_total_(0.0), speedup_full_total_(0.0), num_subtests_called_(0),
+        speedup_faster_count_(0), speedup_slower_count_(0), speedup_equal_count_(0),
+        speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0),
+        is_list_mode_(false), top_(0.0), bottom_(0.0),
+        num_iters_(10), cpu_num_iters_(2), gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
+        record_(0), recordname_("performance"), itname_changed_(true)
+    {
+        cpu_times_.reserve(num_iters_);
+        gpu_times_.reserve(num_iters_);
+        gpu_full_times_.reserve(num_iters_);
+    }
+
+    void finishCurrentSubtest();
+    void resetCurrentSubtest()   // clears all per-subtest state (elapsed times, description, samples)
+    {
+        cpu_elapsed_ = 0;
+        gpu_elapsed_ = 0;
+        gpu_full_elapsed_ = 0;
+        cur_subtest_description_.str("");
+        cur_subtest_is_empty_ = true;
+        cur_iter_idx_ = 0;
+        cpu_times_.clear();
+        gpu_times_.clear();
+        gpu_full_times_.clear();
+    }
+
+    double meanTime(const std::vector<int64> &samples);
+
+    void printHeading();
+    void printSummary();
+    void printMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f, double speedup = 0.0f, double fullspeedup = 0.0f);
+
+    void writeHeading();
+    void writeSummary();
+    void writeMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
+                      double speedup = 0.0f, double fullspeedup = 0.0f,
+                      double gpu_min = 0.0f, double gpu_max = 0.0f, double std_dev = 0.0f);
+
+    std::string working_dir_;
+    std::string test_filter_;
+
+    std::vector<Runnable *> inits_;   // registered initializers (pointers only)
+    std::vector<Runnable *> tests_;   // registered tests (pointers only)
+
+    std::stringstream cur_subtest_description_;
+    bool cur_subtest_is_empty_;   // true until one of the *Complete() calls records a measurement
+
+    int64 cpu_started_;   // tick count taken by cpuOn()
+    int64 gpu_started_;   // tick count taken by gpuOn()
+    int64 gpu_full_started_;   // tick count taken by gpufullOn()
+    double cpu_elapsed_;   // the *_elapsed_ values are in ticks, accumulated per subtest
+    double gpu_elapsed_;
+    double gpu_full_elapsed_;
+
+    double speedup_total_;   // sum of the per-subtest GPU speedups
+    double speedup_full_total_;   // sum of the per-subtest GPUTOTAL speedups
+    int num_subtests_called_;
+
+    int speedup_faster_count_;
+    int speedup_slower_count_;
+    int speedup_equal_count_;
+
+    int speedup_full_faster_count_;
+    int speedup_full_slower_count_;
+    int speedup_full_equal_count_;
+
+    bool is_list_mode_;
+
+    double top_;   // speedup threshold for "exceeded"; 0 until setTopThreshold() is called
+    double bottom_;   // speedup threshold for "failed"; 0 until setBottomThreshold() is called
+
+    int num_iters_;
+    int cpu_num_iters_;     // the CPU does not need to run as many iterations as the GPU
+    int gpu_warmup_iters_;  // number of GPU warm-up iterations, default is 1
+    int cur_iter_idx_;
+    int cur_warmup_idx_;    // number of warm-up iterations performed so far
+    std::vector<int64> cpu_times_;   // per-iteration tick deltas of the current subtest
+    std::vector<int64> gpu_times_;
+    std::vector<int64> gpu_full_times_;
+
+    FILE *record_;   // CSV record file; 0 until one of the write*() functions opens it
+    std::string recordname_;
+    std::string itname_;   // name of the test currently running
+    bool itname_changed_;
+};
+
+// GLOBAL_INIT(name) { ... }: defines a Runnable named #name whose global instance registers itself via addInit(); the braces that follow become run().
+#define GLOBAL_INIT(name) \
+struct name##_init: Runnable { \
+    name##_init(): Runnable(#name) { \
+    TestSystem::instance().addInit(this); \
+} \
+    void run(); \
+} name##_init_instance; \
+    void name##_init::run()
+
+// TEST(name) { ... }: same pattern as GLOBAL_INIT, but the global instance registers itself via addTest().
+#define TEST(name) \
+struct name##_test: Runnable { \
+    name##_test(): Runnable(#name) { \
+    TestSystem::instance().addTest(this); \
+} \
+    void run(); \
+} name##_test_instance; \
+    void name##_test::run()
+// SUBTEST: finishes the previous subtest and yields the stream that receives the new subtest's description.
+#define SUBTEST TestSystem::instance().startNewSubtest()
+// CPU_ON ... CPU_OFF: repeats the enclosed code until cpu_stop(), timing each pass, then calls cpuComplete().
+#define CPU_ON \
+    while (!TestSystem::instance().cpu_stop()) { \
+    TestSystem::instance().cpuOn()
+#define CPU_OFF \
+    TestSystem::instance().cpuOff(); \
+    } TestSystem::instance().cpuComplete()
+// GPU_ON ... GPU_OFF: repeats the enclosed code until stop(); ocl::finish() is called before each pass's timer stops.
+#define GPU_ON \
+    while (!TestSystem::instance().stop()) { \
+    TestSystem::instance().gpuOn()
+#define GPU_OFF \
+    ocl::finish(); \
+    TestSystem::instance().gpuOff(); \
+    } TestSystem::instance().gpuComplete()
+// GPU_FULL_ON ... GPU_FULL_OFF: like GPU_ON/GPU_OFF but feeds the GPUTOTAL timers and adds no ocl::finish() call.
+#define GPU_FULL_ON \
+    while (!TestSystem::instance().stop()) { \
+    TestSystem::instance().gpufullOn()
+#define GPU_FULL_OFF \
+    TestSystem::instance().gpufullOff(); \
+    } TestSystem::instance().gpufullComplete()
+// WARMUP_ON ... WARMUP_OFF: untimed loop that runs until warmupStop(), calling ocl::finish() after each pass.
+#define WARMUP_ON \
+    while (!TestSystem::instance().warmupStop()) {
+#define WARMUP_OFF \
+        ocl::finish(); \
+    } TestSystem::instance().warmupComplete()
--- a/modules/ocl/perf/utility.cpp
+++ b/modules/ocl/perf/utility.cpp
@@ -1,265 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#define VARNAME(A) #A
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-using namespace cvtest;
-
-
-//std::string generateVarList(int first,...)
-//{
-//	vector<std::string> varname;
-//
-//	va_list argp;
-//	string s;
-//	stringstream ss;
-//	va_start(argp,first);
-//	int i=first;
-//	while(i!=-1)
-//	{
-//		ss<<i<<",";
-//		i=va_arg(argp,int);
-//	};
-//	s=ss.str();
-//	va_end(argp);
-//	return s;
-//};
-
-//std::string generateVarList(int& p1,int& p2)
-//{
-//	stringstream ss;
-//	ss<<VARNAME(p1)<<":"<<src1x<<","<<VARNAME(p2)<<":"<<src1y;
-//	return ss.str();
-//};
-
-int randomInt(int minVal, int maxVal)
-{
-    RNG &rng = TS::ptr()->get_rng();
-    return rng.uniform(minVal, maxVal);
-}
-
-double randomDouble(double minVal, double maxVal)
-{
-    RNG &rng = TS::ptr()->get_rng();
-    return rng.uniform(minVal, maxVal);
-}
-
-Size randomSize(int minVal, int maxVal)
-{
-    return cv::Size(randomInt(minVal, maxVal), randomInt(minVal, maxVal));
-}
-
-Scalar randomScalar(double minVal, double maxVal)
-{
-    return Scalar(randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal));
-}
-
-Mat randomMat(Size size, int type, double minVal, double maxVal)
-{
-    return randomMat(TS::ptr()->get_rng(), size, type, minVal, maxVal, false);
-}
-
-
-
-
-
-
-
-/*
-void showDiff(InputArray gold_, InputArray actual_, double eps)
-{
-    Mat gold;
-    if (gold_.kind() == _InputArray::MAT)
-        gold = gold_.getMat();
-    else
-        gold_.getGpuMat().download(gold);
-
-    Mat actual;
-    if (actual_.kind() == _InputArray::MAT)
-        actual = actual_.getMat();
-    else
-        actual_.getGpuMat().download(actual);
-
-    Mat diff;
-    absdiff(gold, actual, diff);
-    threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);
-
-    namedWindow("gold", WINDOW_NORMAL);
-    namedWindow("actual", WINDOW_NORMAL);
-    namedWindow("diff", WINDOW_NORMAL);
-
-    imshow("gold", gold);
-    imshow("actual", actual);
-    imshow("diff", diff);
-
-    waitKey();
-}
-*/
-
-/*
-bool supportFeature(const DeviceInfo& info, FeatureSet feature)
-{
-    return TargetArchs::builtWith(feature) && info.supports(feature);
-}
-
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
-
-vector<DeviceInfo> devices(FeatureSet feature)
-{
-    const vector<DeviceInfo>& d = devices();
-
-    vector<DeviceInfo> devs_filtered;
-
-    if (TargetArchs::builtWith(feature))
-    {
-        devs_filtered.reserve(d.size());
-
-        for (size_t i = 0, size = d.size(); i < size; ++i)
-        {
-            const DeviceInfo& info = d[i];
-
-            if (info.supports(feature))
-                devs_filtered.push_back(info);
-        }
-    }
-
-    return devs_filtered;
-}
-*/
-
-vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
-{
-    vector<MatType> v;
-
-    v.reserve((depth_end - depth_start + 1) * (cn_end - cn_start + 1));
-
-    for (int depth = depth_start; depth <= depth_end; ++depth)
-    {
-        for (int cn = cn_start; cn <= cn_end; ++cn)
-        {
-            v.push_back(CV_MAKETYPE(depth, cn));
-        }
-    }
-
-    return v;
-}
-
-const vector<MatType> &all_types()
-{
-    static vector<MatType> v = types(CV_8U, CV_64F, 1, 4);
-
-    return v;
-}
-
-Mat readImage(const string &fileName, int flags)
-{
-    return imread(string(cvtest::TS::ptr()->get_data_path()) + fileName, flags);
-}
-
-Mat readImageType(const string &fname, int type)
-{
-    Mat src = readImage(fname, CV_MAT_CN(type) == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
-    if (CV_MAT_CN(type) == 4)
-    {
-        Mat temp;
-        cvtColor(src, temp, cv::COLOR_BGR2BGRA);
-        swap(src, temp);
-    }
-    src.convertTo(src, CV_MAT_DEPTH(type));
-    return src;
-}
-
-double checkNorm(const Mat &m)
-{
-    return norm(m, NORM_INF);
-}
-
-double checkNorm(const Mat &m1, const Mat &m2)
-{
-    return norm(m1, m2, NORM_INF);
-}
-
-double checkSimilarity(const Mat &m1, const Mat &m2)
-{
-    Mat diff;
-    matchTemplate(m1, m2, diff, CV_TM_CCORR_NORMED);
-    return std::abs(diff.at<float>(0, 0) - 1.f);
-}
-
-/*
-void cv::ocl::PrintTo(const DeviceInfo& info, ostream* os)
-{
-    (*os) << info.name();
-}
-*/
-
-void PrintTo(const Inverse &inverse, std::ostream *os)
-{
-    if (inverse)
-        (*os) << "inverse";
-    else
-        (*os) << "direct";
-}
--- a/modules/ocl/perf/utility.hpp
+++ b/modules/ocl/perf/utility.hpp
@@ -1,182 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_UTILITY_HPP__
-#define __OPENCV_TEST_UTILITY_HPP__
-//#define PRINT_KERNEL_RUN_TIME
-#ifdef PRINT_KERNEL_RUN_TIME
-#define LOOP_TIMES 1
-#else
-#define LOOP_TIMES 1
-#endif
-#define MWIDTH 1920
-#define MHEIGHT 1080
-#define CLBINPATH ".\\"
-#define LOOPROISTART 0
-#define LOOPROIEND 1
-int randomInt(int minVal, int maxVal);
-double randomDouble(double minVal, double maxVal);
-
-//std::string generateVarList(int first,...);
-std::string generateVarList(int &p1, int &p2);
-cv::Size randomSize(int minVal, int maxVal);
-cv::Scalar randomScalar(double minVal, double maxVal);
-cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal = 255.0);
-
-void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);
-
-//! return true if device supports specified feature and gpu module was built with support the feature.
-//bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
-
-//! return all devices compatible with current gpu module build.
-//const std::vector<cv::ocl::DeviceInfo>& devices();
-//! return all devices compatible with current gpu module build which support specified feature.
-//std::vector<cv::ocl::DeviceInfo> devices(cv::gpu::FeatureSet feature);
-
-//! read image from testdata folder.
-cv::Mat readImage(const std::string &fileName, int flags = cv::IMREAD_COLOR);
-cv::Mat readImageType(const std::string &fname, int type);
-
-double checkNorm(const cv::Mat &m);
-double checkNorm(const cv::Mat &m1, const cv::Mat &m2);
-double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2);
-
-#define EXPECT_MAT_NORM(mat, eps) \
-{ \
-    EXPECT_LE(checkNorm(cv::Mat(mat)), eps) \
-}
-
-/*#define EXPECT_MAT_NEAR(mat1, mat2, eps) \
-{ \
-   ASSERT_EQ(mat1.type(), mat2.type()); \
-   ASSERT_EQ(mat1.size(), mat2.size()); \
-   EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
-}*/
-
-#define EXPECT_MAT_NEAR(mat1, mat2, eps,s) \
-{ \
-    ASSERT_EQ(mat1.type(), mat2.type()); \
-    ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps)<<s; \
-}
-
-#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
-{ \
-    ASSERT_EQ(mat1.type(), mat2.type()); \
-    ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkSimilarity(cv::Mat(mat1), cv::Mat(mat2)), eps); \
-}
-
-namespace cv
-{
-    namespace ocl
-    {
-        // void PrintTo(const DeviceInfo& info, std::ostream* os);
-    }
-}
-
-using perf::MatDepth;
-using perf::MatType;
-
-//! return vector with types from specified range.
-std::vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end);
-
-//! return vector with all types (depth: CV_8U-CV_64F, channels: 1-4).
-const std::vector<MatType> &all_types();
-
-class Inverse
-{
-public:
-    inline Inverse(bool val = false) : val_(val) {}
-
-    inline operator bool() const
-    {
-        return val_;
-    }
-
-private:
-    bool val_;
-};
-
-void PrintTo(const Inverse &useRoi, std::ostream *os);
-
-CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE)
-
-CV_ENUM(NormCode, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_TYPE_MASK, cv::NORM_RELATIVE, cv::NORM_MINMAX)
-
-enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1};
-CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
-
-CV_ENUM(ReduceOp, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
-
-CV_FLAGS(GemmFlags, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T);
-
-CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
-
-CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
-
-CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC)
-
-CV_ENUM(Border, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
-
-CV_FLAGS(WarpFlags, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::WARP_INVERSE_MAP)
-
-CV_ENUM(TemplateMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
-
-CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
-
-void  run_perf_test();
-
-#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
-
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-#define ALL_DEVICES testing::ValuesIn(devices())
-#define DEVICES(feature) testing::ValuesIn(devices(feature))
-
-#define ALL_TYPES testing::ValuesIn(all_types())
-#define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end))
-
-#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113))
-
-#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))
-
-#endif // __OPENCV_TEST_UTILITY_HPP__
--- a/modules/ocl/src/fft.cpp
+++ b/modules/ocl/src/fft.cpp
@@ -205,7 +205,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
    clStridesIn[2]  = is_row_dft ? clStridesIn[1]  : dft_size.width * clStridesIn[1];
    clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];

-    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, (cl_context)getoclContext(), dim, clLengthsIn ) );
+    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getoclContext(), dim, clLengthsIn ) );

    openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
    openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
@@ -219,8 +219,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
    openCLSafeCall( clAmdFftSetPlanScale  ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) );

    //ready to bake
-    cl_command_queue clq = (cl_command_queue)getoclCommandQueue();
-    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &clq, NULL, NULL ) );
+    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getoclCommandQueue(), NULL, NULL ) );
 }
 cv::ocl::FftPlan::~FftPlan()
 {
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -351,6 +351,11 @@ namespace cv
            return &(Context::getContext()->impl->clCmdQueue);
        }

+        void finish() // wait (clFinish) until all commands queued on the current context's command queue have completed
+        {
+            clFinish(Context::getContext()->impl->clCmdQueue); // the OpenCL status code returned here is not checked
+        }
+
        void queryDeviceInfo(DEVICE_INFO info_type, void* info)
        {
            static Info::Impl* impl = Context::getContext()->impl;
@@ -709,7 +714,7 @@ namespace cv
            clReleaseEvent(event);
 #endif

-            clFinish(clCxt->impl->clCmdQueue);
+            clFlush(clCxt->impl->clCmdQueue);
            openCLSafeCall(clReleaseKernel(kernel));
        }

@@ -905,16 +910,18 @@ namespace cv
        std::auto_ptr<Context> Context::clCxt;
        int Context::val = 0;
        static Mutex cs;
-        Context *Context::getContext()
+        static volatile int context_tear_down = 0;
+        Context* Context::getContext()
        {
            if(*((volatile int*)&val) != 1)
            {
                AutoLock al(cs);
                if(*((volatile int*)&val) != 1)
                {
+                    if (context_tear_down)
+                        return clCxt.get();
                    if( 0 == clCxt.get())
                    clCxt.reset(new Context);
-
                    std::vector<Info> oclinfo;
                    CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
                    oclinfo[0].impl->setDevice(0, 0, 0);
@@ -1042,9 +1049,14 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD  fdwReason, LPVOID )
    {
        // application hangs if call clReleaseCommandQueue here, so release context only
        // without context release application hangs as well
-        cl_context ctx = (cl_context)getoclContext();
-        if(ctx)
-            openCLSafeCall(clReleaseContext(ctx));
+        context_tear_down = 1;
+        Context* cv_ctx = Context::getContext();
+        if(cv_ctx)
+        {
+            cl_context ctx = cv_ctx->impl->oclcontext; // the member is the handle itself; passing its address would release a bogus context
+            if(ctx)
+                openCLSafeCall(clReleaseContext(ctx));
+        }
    }
    return TRUE;
 }
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -142,7 +142,7 @@ namespace cv
                format.image_channel_data_type = CL_FLOAT;
                break;
            default:
-                throw std::exception();
+                CV_Error(-1, "Image format is not supported");
                break;
            }
            switch(channels)
@@ -157,7 +157,7 @@ namespace cv
                format.image_channel_order     = CL_RGBA;
                break;
            default:
-                throw std::exception();
+                CV_Error(-1, "Image format is not supported");
                break;
            }
 #if CL_VERSION_1_2
@@ -195,7 +195,8 @@ namespace cv
                const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
                clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
                    regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
-            }
+                clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
+           }
            else
            {
                devData = (cl_mem)mat.data;
@@ -204,7 +205,7 @@ namespace cv
            clEnqueueCopyBufferToImage((cl_command_queue)mat.clCxt->oclCommandQueue(), devData, texture, 0, origin, region, 0, NULL, 0);
            if ((mat.cols * mat.elemSize() != mat.step))
            {
-                clFinish((cl_command_queue)mat.clCxt->oclCommandQueue());
+                clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
                clReleaseMemObject(devData);
            }

@@ -229,7 +230,8 @@ namespace cv
            try
            {
                cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
-                //_support = true;
+                finish();
+                _support = true;
            }
            catch (const cv::Exception& e)
            {
--- a/modules/ocl/src/opencl/arithm_absdiff.cl
+++ b/modules/ocl/src/opencl/arithm_absdiff.cl
@@ -44,7 +44,11 @@
 //M*/

 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif

 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -63,6 +67,9 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
    {
        x = x << 2;

+#ifdef dst_align
+#undef dst_align
+#endif
        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -111,7 +118,10 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -145,7 +155,10 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -249,7 +262,10 @@ __kernel void arithm_s_absdiff_C1_D0 (__global   uchar *src1, int src1_step, int
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -288,7 +304,10 @@ __kernel void arithm_s_absdiff_C1_D2 (__global   ushort *src1, int src1_step, in
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -319,7 +338,10 @@ __kernel void arithm_s_absdiff_C1_D3 (__global   short *src1, int src1_step, int
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global   float *src1, int src1_step, int

 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_s_absdiff_C1_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                     __global   double *dst,  int dst_step,  int dst_offset,
-                                     double4 src2, int rows, int cols, int dst_step1)
+                                      __global   double *dst,  int dst_step,  int dst_offset,
+                                      double4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -422,7 +444,10 @@ __kernel void arithm_s_absdiff_C2_D0 (__global   uchar *src1, int src1_step, int
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global   ushort *src1, int src1_step, in
 }
 __kernel void arithm_s_absdiff_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                     int4 src2, int rows, int cols, int dst_step1)
+                                      int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global   int *src1, int src1_step, int s
 }
 __kernel void arithm_s_absdiff_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                     float4 src2, int rows, int cols, int dst_step1)
+                                      float4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -564,7 +589,10 @@ __kernel void arithm_s_absdiff_C3_D0 (__global   uchar *src1, int src1_step, int
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -618,7 +646,10 @@ __kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, in
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, in
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -668,7 +699,10 @@ __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_absdiff_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global   int *src1, int src1_step, int s
        int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
        int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));

-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
 __kernel void arithm_s_absdiff_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global   float *src1, int src1_step, int
        float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
        float tmp_data_2 = fabs(src1_data_2 - src2_data_2);

-       *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }

@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global   double *src1, int src1_step, in
        double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
        double tmp_data_2 = fabs(src1_data_2 - src2_data_2);

-       *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -45,7 +45,11 @@
 //M*/

 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif

 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_add_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int   *src1, int src1_step, i
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }

@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_addWeighted.cl
+++ b/modules/ocl/src/opencl/arithm_addWeighted.cl
@@ -42,8 +42,12 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined DOUBLE_SUPPORT
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 typedef double F;
 #else
 typedef float F;
@@ -52,10 +56,10 @@ typedef float F;
 /////////////////////////////////////////////addWeighted//////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
-                           __global uchar *src2, int src2_step,int src2_offset,
-                           F alpha,F beta,F gama,
-                           __global uchar *dst,  int dst_step,int dst_offset,
-                           int rows,  int cols,int dst_step1)
+                              __global uchar *src2, int src2_step,int src2_offset,
+                              F alpha,F beta,F gama,
+                              __global uchar *dst,  int dst_step,int dst_offset,
+                              int rows,  int cols,int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
    {

        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
 //        short4 tmp      = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
-         short4 tmp;
+        short4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-       // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
+        // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
    }

 }
@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset


 __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
-                           __global ushort *src2, int src2_step,int src2_offset,
-                           F alpha,F beta,F gama,
-                           __global ushort *dst,  int dst_step,int dst_offset,
-                           int rows,  int cols,int dst_step1)
+                              __global ushort *src2, int src2_step,int src2_offset,
+                              F alpha,F beta,F gama,
+                              __global ushort *dst,  int dst_step,int dst_offset,
+                              int rows,  int cols,int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -122,34 +129,37 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs

        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-    if(src1_index < 0)
-    {
-        ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }


        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-       // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-         int4 tmp;
+        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
+        int4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -182,7 +192,10 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse

        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));

@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));

-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

-    if(src1_index < 0)
-    {
-        short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-       // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-         int4 tmp;
+        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
+        int4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse

 __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
                              __global int *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                              __global int *dst,  int dst_step,int dst_offset,
                              int rows,  int cols,int dst_step1)
 {
@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,

        x = x << 2;

-        #define bitOfInt  (sizeof(int)== 4 ? 2: 3)
+#define bitOfInt  (sizeof(int)== 4 ? 2: 3)

-        #define dst_align ((dst_offset >> bitOfInt) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> bitOfInt) & 3)

        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));

-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));

-    if(src1_index < 0)
-    {
-        int4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        int4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        if(src1_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
-       // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-         float4 tmp;
+        // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
+        float4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,

 __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
                              __global float *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                              __global float *dst,  int dst_step,int dst_offset,
                              int rows,  int cols,int dst_step1)
 {
@@ -304,7 +320,10 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset

        x = x << 2;

-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)

        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));

-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-    if(src1_index < 0)
-    {
-        float4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        float4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
-    //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
+        if(src1_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+        //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;

-       // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
-         float4 tmp_data;
+        // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
+        float4 tmp_data;
        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-       // float4 tmp_data = convert_float4(tmp);
+        // float4 tmp_data = convert_float4(tmp);

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
 #if defined (DOUBLE_SUPPORT)
 __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
                              __global double *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                              __global double *dst,  int dst_step,int dst_offset,
                              int rows,  int cols,int dst_step1)
 {
@@ -366,7 +385,10 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs

        x = x << 2;

-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)

        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));

-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        double4 src1_data = vload4(0, (__global double  *)((__global char *)src1 + src1_index_fix));
        double4 src2_data = vload4(0, (__global double  *)((__global char *)src2 + src2_index_fix));
        double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
-    if(src1_index < 0)
-    {
-        double4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        double4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
-      //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
-         double4 tmp_data;
+        if(src1_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+        //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
+        double4 tmp_data;
        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar.cl
@@ -44,9 +44,13 @@
 //M*/

 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif

+#endif
 /**************************************add with scalar without mask**************************************/
 __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
                                  __global   uchar *dst,  int dst_step,  int dst_offset,
@@ -59,7 +63,10 @@ __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -99,7 +106,10 @@ __kernel void arithm_s_add_C1_D2 (__global   ushort *src1, int src1_step, int sr
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -131,7 +141,10 @@ __kernel void arithm_s_add_C1_D3 (__global   short *src1, int src1_step, int src
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -233,7 +246,10 @@ __kernel void arithm_s_add_C2_D0 (__global   uchar *src1, int src1_step, int src
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -378,7 +394,10 @@ __kernel void arithm_s_add_C3_D0 (__global   uchar *src1, int src1_step, int src
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -432,7 +451,10 @@ __kernel void arithm_s_add_C3_D2 (__global   ushort *src1, int src1_step, int sr
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global   ushort *src1, int src1_step, int sr
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -482,7 +504,10 @@ __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global   int *src1, int src1_step, int src1_
        int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
        int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);

-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
 __kernel void arithm_s_add_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global   float *src1, int src1_step, int src
        float tmp_data_1 = src1_data_1 + src2_data_1;
        float tmp_data_2 = src1_data_2 + src2_data_2;

-       *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }

@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global   double *src1, int src1_step, int sr
        double tmp_data_1 = src1_data_1 + src2_data_1;
        double tmp_data_2 = src1_data_2 + src2_data_2;

-       *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -44,7 +44,11 @@
 //M*/

 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif

 /**************************************add with scalar with mask**************************************/
@@ -61,7 +65,10 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global   uchar *src1, int src1_ste
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -111,7 +118,10 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_st
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -146,7 +156,10 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_ste
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -267,7 +280,10 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_ste
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -443,7 +459,10 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global   uchar *src1, int src1_ste
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -501,7 +520,10 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global   ushort *src1, int src1_st
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global   ushort *src1, int src1_st
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -555,7 +577,10 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_ste
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_ste
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_with_mask_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global   int *src1, int src1_step,
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_with_mask_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global   float *src1, int src1_ste
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }

@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global   double *src1, int src1_st
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_bitwise_and.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif

 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -51,9 +55,9 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_and without mask**************************************/
 __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global uchar *src2, int src2_step, int src2_offset,
+                                     __global uchar *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -62,30 +66,33 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-     uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-     uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);

-     if(src1_index < 0)
-     {
-        uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data & src2_data;
@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr


 __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -112,7 +119,10 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-     char4 src1_data = vload4(0, src1 + src1_index_fix);
-     char4 src2_data = vload4(0, src2 + src2_index_fix);
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        char4 src1_data = vload4(0, src1 + src1_index_fix);
+        char4 src2_data = vload4(0, src2 + src2_index_fix);

-     if(src1_index < 0)
-     {
-        char4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        char4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            char4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            char4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data & src2_data;

@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src


 __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global ushort *src2, int src2_step, int src2_offset,
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
@@ -163,7 +173,10 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));

-     if(src1_index < 0)
-     {
-        ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data & src2_data;

@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s


 __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global short *src2, int src2_step, int src2_offset,
+                                     __global short *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
@@ -215,7 +228,10 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

-     if(src1_index < 0)
-     {
-        short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        short4 tmp_data = src1_data & src2_data;

@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr


 __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global int *src2, int src2_step, int src2_offset,
+                                     __global int *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
 }

 __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src

 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
--- a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
@@ -43,18 +43,22 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_and with mask**************************************/
-__kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1



-__kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_



-__kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src



-__kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
        uchar2  mask_data = vload2(0, mask + mask_index);

-    short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    short2 tmp_data = src1_data & src2_data;
+        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
+        short2 tmp_data = src1_data & src2_data;

        data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1



-__kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int   *src1, int src1



-__kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -274,12 +295,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_



-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C1_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -305,15 +326,15 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_
    }

 }
-#endif



-__kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
 }


-__kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
    }
 }

-__kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src
        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1
        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int    *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int    *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int   *src1, int src1
        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C2_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -500,12 +532,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_
        *((__global char8 *)((__global char *)dst + dst_index)) = data;
    }
 }
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+
+__kernel void arithm_bitwise_and_with_mask_C2_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -530,15 +563,15 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_
        *((__global char16 *)((__global char *)dst + dst_index)) = data;
    }
 }
-#endif



-__kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
 }


-__kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
    }
 }

-__kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int   *src1, int src1
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C3_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif


-
-__kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1
 }


-__kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_
    }
 }

-__kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src
        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1
        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int   *src1, int src1
        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
-                                                  __global char *src2, int src2_step, int src2_offset,
-                                                  __global uchar  *mask, int mask_step, int mask_offset,
-                                                  __global char *dst,  int dst_step,  int dst_offset,
-                                                  int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_and_with_mask_C4_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
--- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
@@ -42,19 +42,22 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif

 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************and with scalar without mask**************************************/
-__kernel void arithm_s_bitwise_and_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -63,7 +66,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -86,9 +92,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_and_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -97,7 +104,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global   char *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -119,9 +129,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global   char *src1, int src1_step,
    }
 }

-__kernel void arithm_s_bitwise_and_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -131,7 +142,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global   ushort *src1, int src1_step
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -150,9 +164,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global   ushort *src1, int src1_step
        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -162,7 +177,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global   short *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -181,9 +199,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global   short *src1, int src1_step,
        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -202,9 +221,10 @@ __kernel void arithm_s_bitwise_and_C1_D4 (__global   int *src1, int src1_step, i
        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C1_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -232,11 +252,11 @@ __kernel void arithm_s_bitwise_and_C1_D5 (__global   char *src1, int src1_step,
        *((__global char4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, i
    }
 }
 #endif
-__kernel void arithm_s_bitwise_and_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -268,7 +289,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_and_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -302,7 +327,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global   char *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global   char *src1, int src1_step,
    }
 }

-__kernel void arithm_s_bitwise_and_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_and_C2_D2 (__global   ushort *src1, int src1_step
        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_and_C2_D3 (__global   short *src1, int src1_step,
        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_and_C2_D4 (__global   int *src1, int src1_step, i
        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_and_C2_D5 (__global   char *src1, int src1_step,
        char8 tmp_data = src1_data & src2_data;

        *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
-      }
+    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, i
    }
 }
 #endif
-__kernel void arithm_s_bitwise_and_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -440,7 +474,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_and_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -496,7 +534,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global   char *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global   char *src1, int src1_step,
    }
 }

-__kernel void arithm_s_bitwise_and_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -551,7 +593,10 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global   ushort *src1, int src1_step
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global   ushort *src1, int src1_step
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_and_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -601,7 +647,10 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global   short *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global   short *src1, int src1_step,
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_and_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_and_C3_D4 (__global   int *src1, int src1_step, i
        int tmp_data_1 = src1_data_1 & src2_data_1;
        int tmp_data_2 = src1_data_2 & src2_data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
-__kernel void arithm_s_bitwise_and_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_and_C3_D5 (__global   char *src1, int src1_step,
        char4 tmp_data_1 = src1_data_1 & src2_data_1;
        char4 tmp_data_2 = src1_data_2 & src2_data_2;

-       *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
        short4 tmp_data_1 = src1_data_1 & src2_data_1;
        short4 tmp_data_2 = src1_data_2 & src2_data_2;

-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
 }
 #endif
-__kernel void arithm_s_bitwise_and_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_and_C4_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_and_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_and_C4_D1 (__global   char *src1, int src1_step,
    }
 }

-__kernel void arithm_s_bitwise_and_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_and_C4_D2 (__global   ushort *src1, int src1_step
        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_and_C4_D3 (__global   short *src1, int src1_step,
        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_and_C4_D4 (__global   int *src1, int src1_step, i
        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_and_C4_D5 (__global   char *src1, int src1_step,
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -897,10 +956,10 @@ __kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, i
        short4 tmp_data_2 = src1_data_2 & src2_data_2;
        short4 tmp_data_3 = src1_data_3 & src2_data_3;

-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-       *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;

    }
 }
--- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
@@ -42,20 +42,22 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_and with scalar with mask**************************************/
-__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -65,7 +67,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global   uchar *src1, int
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -90,10 +95,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global   uchar *src1, int
 }


-__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -103,7 +109,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global   char *src1, int s
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -127,10 +136,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global   char *src1, int s
    }
 }

-__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar  *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar  *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -140,7 +150,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global   ushort *src1, int
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -161,10 +174,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global   ushort *src1, int
        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -174,7 +188,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global   short *src1, int
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -195,10 +212,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global   short *src1, int
        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global   int   *src1, int src1_step, int src1_offset,
-                                            __global   int   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (
+        __global   int   *src1, int src1_step, int src1_offset,
+        __global   int   *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -223,10 +241,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global   int   *src1, int
    }
 }

-__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                                    __global char *dst,  int dst_step,  int dst_offset,
-                                                    __global   uchar *mask, int mask_step, int mask_offset,
-                                                    char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -252,10 +271,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src
 }

 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                            __global short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -280,10 +300,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int sr
    }
 }
 #endif
-__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -293,7 +314,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global   uchar *src1, int
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -316,10 +340,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global   uchar *src1, int
 }


-__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -329,7 +354,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global   char *src1, int s
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -351,10 +379,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global   char *src1, int s
    }
 }

-__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -378,10 +407,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global   ushort *src1, int
        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -405,10 +435,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global   short *src1, int
        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -432,10 +463,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global   int *src1, int sr
        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global  char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global  char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -461,10 +493,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global   char *src1, int s
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -489,10 +522,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int sr
    }
 }
 #endif
-__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -502,7 +536,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global   uchar *src1, int
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -549,10 +586,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global   uchar *src1, int
 }


-__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -562,7 +600,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global   char *src1, int s
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -608,10 +649,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global   char *src1, int s
    }
 }

-__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -621,7 +663,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global   ushort *src1, int
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -650,22 +695,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global   ushort *src1, int
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -675,7 +721,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global   short *src1, int
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -704,22 +753,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global   short *src1, int
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -753,15 +803,16 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global   int *src1, int sr
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -795,16 +846,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global   char *src1, int s
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar  *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -838,16 +890,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif
-__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -872,10 +925,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global   uchar *src1, int
 }


-__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -899,10 +953,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global   char *src1, int s
    }
 }

-__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -925,10 +980,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global   ushort *src1, int
        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -951,10 +1007,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global   short *src1, int
        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -977,10 +1034,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global   int *src1, int sr
        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                                    __global   char *dst,  int dst_step,  int dst_offset,
-                                                    __global   uchar *mask, int mask_step, int mask_offset,
-                                                    char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1006,10 +1064,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global   char *src1, int s
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
--- a/modules/ocl/src/opencl/arithm_bitwise_not.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl
@@ -43,9 +43,12 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -61,25 +64,28 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = ~ src1_data;

-  /*  if(src1_index < 0)
-    {
-      uchar4 tmp;
-      tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-      src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-  */
+        /*  if(src1_index < 0)
+          {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+          }
+        */
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr


 __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -101,7 +107,10 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src


 __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
@@ -135,7 +144,10 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s


 __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global short *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
@@ -170,7 +182,10 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr


 __kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global int *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
--- a/modules/ocl/src/opencl/arithm_bitwise_or.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif

 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -51,9 +55,9 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_or without mask**************************************/
 __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global uchar *src2, int src2_step, int src2_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -62,29 +66,32 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-      int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-      int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-      if(src1_index < 0)
-      {
-        uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-      }
-      if(src2_index < 0)
-      {
-        uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-      }
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data | src2_data;

@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src


 __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global char *src2, int src2_step, int src2_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -110,7 +117,10 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1


 __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global ushort *src2, int src2_step, int src2_offset,
+                                    __global ushort *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
@@ -147,7 +157,10 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr


 __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global short *src2, int src2_step, int src2_offset,
+                                    __global short *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
@@ -185,7 +198,10 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src


 __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global int *src2, int src2_step, int src2_offset,
+                                    __global int *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
 }

 __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global char *src2, int src2_step, int src2_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1

 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    __global char *src2, int src2_step, int src2_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
--- a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
@@ -43,18 +43,22 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_or with mask**************************************/
-__kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_



-__kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s



-__kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1



-__kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
        uchar2  mask_data = vload2(0, mask + mask_index);

-    short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    short2 tmp_data = src1_data | src2_data;
+        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
+        short2 tmp_data = src1_data | src2_data;

        data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_



-__kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int   *src1, int src1_



-__kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -273,13 +294,13 @@ __kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_s
 }


-
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C1_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -308,12 +329,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_s
 #endif


-
-__kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
 }


-__kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
    }
 }

-__kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1
        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_
        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int    *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int    *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int   *src1, int src1_
        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -501,11 +533,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_s
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C2_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -533,12 +566,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_s
 #endif


-
-__kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
 }


-__kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
    }
 }

-__kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int   *src1, int src1_
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_s
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C3_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_s
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif


-
-__kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_
 }


-__kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_s
    }
 }

-__kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1
        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_
        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int   *src1, int src1_
        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_s
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
-                                                  __global char *src2, int src2_step, int src2_offset,
-                                                  __global uchar  *mask, int mask_step, int mask_offset,
-                                                  __global char *dst,  int dst_step,  int dst_offset,
-                                                  int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_or_with_mask_C4_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
--- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
@@ -43,16 +43,21 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif

 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************and with scalar without mask**************************************/
-__kernel void arithm_s_bitwise_or_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -61,7 +66,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -84,9 +92,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_or_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -95,7 +104,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global   char *src1, int src1_step, i
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -117,9 +129,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global   char *src1, int src1_step, i
    }
 }

-__kernel void arithm_s_bitwise_or_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -129,7 +142,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global   ushort *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -148,9 +164,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global   ushort *src1, int src1_step,
        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -160,7 +177,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global   short *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -179,9 +199,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global   short *src1, int src1_step,
        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -200,9 +221,10 @@ __kernel void arithm_s_bitwise_or_C1_D4 (__global   int *src1, int src1_step, in
        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_C1_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -222,9 +244,10 @@ __kernel void arithm_s_bitwise_or_C1_D5 (__global   char *src1, int src1_step, i
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)

 {

@@ -245,10 +268,10 @@ __kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, in
    }
 }
 #endif
-
-__kernel void arithm_s_bitwise_or_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)

 {

@@ -259,7 +282,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -280,9 +306,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_or_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)

 {

@@ -293,7 +320,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global   char *src1, int src1_step, i
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -313,9 +343,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global   char *src1, int src1_step, i
    }
 }

-__kernel void arithm_s_bitwise_or_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)

 {

@@ -335,9 +366,10 @@ __kernel void arithm_s_bitwise_or_C2_D2 (__global   ushort *src1, int src1_step,
        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)

 {

@@ -358,8 +390,8 @@ __kernel void arithm_s_bitwise_or_C2_D3 (__global   short *src1, int src1_step,
    }
 }
 __kernel void arithm_s_bitwise_or_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)

 {

@@ -378,9 +410,10 @@ __kernel void arithm_s_bitwise_or_C2_D4 (__global   int *src1, int src1_step, in
        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)

 {

@@ -400,9 +433,10 @@ __kernel void arithm_s_bitwise_or_C2_D5 (__global   char *src1, int src1_step, i
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)

 {

@@ -423,9 +457,10 @@ __kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, in
    }
 }
 #endif
-__kernel void arithm_s_bitwise_or_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)

 {

@@ -436,7 +471,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -480,9 +518,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_or_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)

 {

@@ -493,7 +532,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global   char *src1, int src1_step, i
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -536,9 +578,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global   char *src1, int src1_step, i
    }
 }

-__kernel void arithm_s_bitwise_or_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)

 {

@@ -549,7 +592,10 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global   ushort *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -575,21 +621,22 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global   ushort *src1, int src1_step,
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_or_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)

 {

@@ -600,7 +647,10 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global   short *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -626,21 +676,22 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global   short *src1, int src1_step,
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_or_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)

 {

@@ -668,14 +719,15 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global   int *src1, int src1_step, in
        int tmp_data_1 = src1_data_1 | src2_data_1;
        int tmp_data_2 = src1_data_2 | src2_data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
-__kernel void arithm_s_bitwise_or_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)

 {

@@ -700,15 +752,16 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global   char *src1, int src1_step, i
        char4 tmp_data_1 = src1_data_1 | src2_data_1;
        char4 tmp_data_2 = src1_data_2 | src2_data_2;

-       *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)

 {

@@ -736,15 +789,16 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
        short4 tmp_data_1 = src1_data_1 | src2_data_1;
        short4 tmp_data_2 = src1_data_2 | src2_data_2;

-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
 }
 #endif
-__kernel void arithm_s_bitwise_or_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)

 {

@@ -765,9 +819,10 @@ __kernel void arithm_s_bitwise_or_C4_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_or_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)

 {

@@ -787,9 +842,10 @@ __kernel void arithm_s_bitwise_or_C4_D1 (__global   char *src1, int src1_step, i
    }
 }

-__kernel void arithm_s_bitwise_or_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)

 {

@@ -808,9 +864,10 @@ __kernel void arithm_s_bitwise_or_C4_D2 (__global   ushort *src1, int src1_step,
        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)

 {

@@ -829,9 +886,10 @@ __kernel void arithm_s_bitwise_or_C4_D3 (__global   short *src1, int src1_step,
        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)

 {

@@ -850,9 +908,10 @@ __kernel void arithm_s_bitwise_or_C4_D4 (__global   int *src1, int src1_step, in
        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)

 {

@@ -874,9 +933,10 @@ __kernel void arithm_s_bitwise_or_C4_D5 (__global   char *src1, int src1_step, i
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)

 {

@@ -903,10 +963,10 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
        short4 tmp_data_2 = src1_data_2 | src2_data_2;
        short4 tmp_data_3 = src1_data_3 | src2_data_3;

-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-       *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;

    }
 }
--- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
@@ -43,17 +43,21 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_or with scalar with mask**************************************/
-__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)

 {

@@ -64,7 +68,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global   uchar *src1, int s
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -89,10 +96,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global   uchar *src1, int s
 }


-__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)

 {

@@ -103,7 +111,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global   char *src1, int sr
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -127,10 +138,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global   char *src1, int sr
    }
 }

-__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar  *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar  *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)

 {

@@ -141,7 +153,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global   ushort *src1, int
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -162,10 +177,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global   ushort *src1, int
        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)

 {

@@ -176,7 +192,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global   short *src1, int s
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -197,10 +216,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global   short *src1, int s
        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global   int   *src1, int src1_step, int src1_offset,
-                                            __global   int   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (
+        __global   int   *src1, int src1_step, int src1_offset,
+        __global   int   *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)

 {

@@ -226,10 +246,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global   int   *src1, int s
    }
 }

-__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global   char   *src1, int src1_step, int src1_offset,
-                                            __global   char   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (
+        __global   char   *src1, int src1_step, int src1_offset,
+        __global   char   *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)

 {

@@ -254,12 +275,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global   char   *src1, int
        *((__global char4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                            __global short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)

 {

@@ -285,10 +306,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
    }
 }
 #endif
-__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)

 {

@@ -299,7 +321,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global   uchar *src1, int s
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -322,10 +347,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global   uchar *src1, int s
 }


-__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)

 {

@@ -336,7 +362,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global   char *src1, int sr
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -358,10 +387,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global   char *src1, int sr
    }
 }

-__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)

 {

@@ -386,10 +416,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global   ushort *src1, int
        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)

 {

@@ -414,10 +445,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global   short *src1, int s
        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)

 {

@@ -442,10 +474,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global   int *src1, int src
        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)

 {

@@ -463,17 +496,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global   char *src1, int sr
        char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
        char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
        char8 dst_data = *((__global char8 *)((__global char *)dst  + dst_index));
-          char8 data = src_data1 | src_data2;
+        char8 data = src_data1 | src_data2;
        data = mask_data ? data : dst_data;
        *((__global char8 *)((__global char *)dst + dst_index)) = data;

-      }
+    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)

 {

@@ -499,10 +533,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global   char *src1, int sr
    }
 }
 #endif
-__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)

 {

@@ -513,7 +548,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global   uchar *src1, int s
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -560,10 +598,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global   uchar *src1, int s
 }


-__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)

 {

@@ -574,7 +613,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global   char *src1, int sr
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -620,10 +662,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global   char *src1, int sr
    }
 }

-__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)

 {

@@ -634,7 +677,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global   ushort *src1, int
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -663,22 +709,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global   ushort *src1, int
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)

 {

@@ -689,7 +736,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global   short *src1, int s
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -718,22 +768,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global   short *src1, int s
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)

 {

@@ -768,15 +819,16 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global   int *src1, int src
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)

 {

@@ -811,17 +863,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global   char *src1, int sr
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;

-       }
+    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar  *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -855,16 +908,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif
-__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)

 {

@@ -890,10 +944,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global   uchar *src1, int s
 }


-__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char4 src2, int rows, int cols, int dst_step1)

 {

@@ -918,10 +973,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global   char *src1, int sr
    }
 }

-__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)

 {

@@ -945,10 +1001,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global   ushort *src1, int
        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        short4 src2, int rows, int cols, int dst_step1)

 {

@@ -972,10 +1029,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global   short *src1, int s
        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        int4 src2, int rows, int cols, int dst_step1)

 {

@@ -999,10 +1057,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global   int *src1, int src
        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                            __global   char *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        __global   uchar *mask, int mask_step, int mask_offset,
+        char16 src2, int rows, int cols, int dst_step1)

 {

@@ -1029,10 +1088,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global   char *src1, int sr
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                                    __global short *dst,  int dst_step,  int dst_offset,
-                                                    __global uchar *mask, int mask_step, int mask_offset,
-                                                    short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
--- a/modules/ocl/src/opencl/arithm_bitwise_xor.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor.cl
@@ -43,17 +43,20 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_xor without mask**************************************/
 __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global uchar *src2, int src2_step, int src2_offset,
+                                     __global uchar *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -62,7 +65,10 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);

-     if(src1_index < 0)
-     {
-        uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data ^ src2_data;

@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr


 __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -112,7 +118,10 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        char4 src1_data = vload4(0, src1 + src1_index_fix);
        char4 src2_data = vload4(0, src2 + src2_index_fix);

-     if(src1_index < 0)
-     {
-        char4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        char4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            char4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            char4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data ^ src2_data;

@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src


 __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global ushort *src2, int src2_step, int src2_offset,
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
@@ -163,7 +172,10 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));

-     if(src1_index < 0)
-     {
-        ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data ^ src2_data;

@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s


 __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global short *src2, int src2_step, int src2_offset,
+                                     __global short *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
@@ -215,7 +227,10 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

-     if(src1_index < 0)
-     {
-        short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
-     if(src2_index < 0)
-     {
-        short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }



@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr


 __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global int *src2, int src2_step, int src2_offset,
+                                     __global int *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
 }

 __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
        *((__global char4 *)((__global char *)dst + dst_index)) = tmp;
    }
 }
-
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     __global char *src2, int src2_step, int src2_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
@@ -43,18 +43,22 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_xor with mask**************************************/
-__kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -64,7 +68,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -91,11 +98,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1



-__kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -105,7 +113,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -132,11 +143,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_



-__kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -146,7 +158,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -171,11 +186,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src



-__kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -185,7 +201,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -198,8 +217,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
        uchar2  mask_data = vload2(0, mask + mask_index);

-    short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-    short2 tmp_data = src1_data ^ src2_data;
+        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
+        short2 tmp_data = src1_data ^ src2_data;

        data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@@ -210,11 +229,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1



-__kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -242,11 +262,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int   *src1, int src1



-__kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -273,13 +294,13 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_
 }


-
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C1_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -308,12 +329,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_



-
-__kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -323,7 +344,10 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -347,11 +371,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
 }


-__kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -361,7 +386,10 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -384,11 +412,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
    }
 }

-__kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -413,11 +442,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src
        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -442,11 +472,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1
        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int    *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int    *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -471,11 +502,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int   *src1, int src1
        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -501,11 +533,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C2_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -533,12 +566,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_
 #endif


-
-__kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -548,7 +581,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -596,11 +632,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
 }


-__kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -610,7 +647,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -657,11 +697,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
    }
 }

-__kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -671,7 +712,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -701,23 +745,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -727,7 +772,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -757,23 +805,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -808,16 +857,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int   *src1, int src1
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -852,17 +902,18 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C3_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -897,20 +948,20 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;

-       *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif


-
-__kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D0 (
+        __global uchar *src1, int src1_step, int src1_offset,
+        __global uchar *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -937,11 +988,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1
 }


-__kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D1 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_
    }
 }

-__kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D2 (
+        __global ushort *src1, int src1_step, int src1_offset,
+        __global ushort *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global ushort *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src
        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D3 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1
        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D4 (
+        __global int   *src1, int src1_step, int src1_offset,
+        __global int   *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global int   *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int   *src1, int src1
        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
-                                          __global char *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global char *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D5 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
-                                                  __global char *src2, int src2_step, int src2_offset,
-                                                  __global uchar  *mask, int mask_step, int mask_offset,
-                                                  __global char *dst,  int dst_step,  int dst_offset,
-                                                  int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_xor_with_mask_C4_D6 (
+        __global char *src1, int src1_step, int src1_offset,
+        __global char *src2, int src2_step, int src2_offset,
+        __global uchar  *mask, int mask_step, int mask_offset,
+        __global char *dst,  int dst_step,  int dst_offset,
+        int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
@@ -42,19 +42,21 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************xor with scalar without mask**************************************/
-__kernel void arithm_s_bitwise_xor_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -63,7 +65,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -86,9 +91,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_xor_C1_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -97,7 +103,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global   char *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -119,9 +128,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global   char *src1, int src1_step,
    }
 }

-__kernel void arithm_s_bitwise_xor_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -131,7 +141,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global   ushort *src1, int src1_step
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -150,9 +163,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global   ushort *src1, int src1_step
        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -162,7 +176,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global   short *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -181,9 +198,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global   short *src1, int src1_step,
        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -202,9 +220,10 @@ __kernel void arithm_s_bitwise_xor_C1_D4 (__global   int *src1, int src1_step, i
        *((__global int *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C1_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -234,9 +253,10 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (__global   char *src1, int src1_step,
 }

 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C1_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, i
    }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -268,7 +289,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_xor_C2_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -302,7 +327,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global   char *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global   char *src1, int src1_step,
    }
 }

-__kernel void arithm_s_bitwise_xor_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (__global   ushort *src1, int src1_step
        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (__global   short *src1, int src1_step,
        *((__global short2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_xor_C2_D4 (__global   int *src1, int src1_step, i
        *((__global int2 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C2_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (__global   char *src1, int src1_step,
        char8 tmp_data = src1_data ^ src2_data;

        *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
-      }
+    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst,  int dst_step,  int dst_offset,
-                                  short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C2_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, i
    }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -440,7 +474,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global   uchar *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_xor_C3_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -496,7 +534,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global   char *src1, int src1_step,
    {
        x = x << 2;

-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global   char *src1, int src1_step,
    }
 }

-__kernel void arithm_s_bitwise_xor_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -551,7 +593,10 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global   ushort *src1, int src1_step
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global   ushort *src1, int src1_step
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_xor_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -601,7 +647,10 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global   short *src1, int src1_step,
    {
        x = x << 1;

-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));

        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global   short *src1, int src1_step,
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;

        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;

        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;

-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
-__kernel void arithm_s_bitwise_xor_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_xor_C3_D4 (__global   int *src1, int src1_step, i
        int tmp_data_1 = src1_data_1 ^ src2_data_1;
        int tmp_data_2 = src1_data_2 ^ src2_data_2;

-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
-__kernel void arithm_s_bitwise_xor_C3_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_xor_C3_D5 (__global   char *src1, int src1_step,
        char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
        char4 tmp_data_2 = src1_data_2 ^ src2_data_2;

-       *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C3_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
        short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
        short4 tmp_data_2 = src1_data_2 ^ src2_data_2;

-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
 }
 #endif
-__kernel void arithm_s_bitwise_xor_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  uchar4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D0 (
+        __global   uchar *src1, int src1_step, int src1_offset,
+        __global   uchar *dst,  int dst_step,  int dst_offset,
+        uchar4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_xor_C4_D0 (__global   uchar *src1, int src1_step,
 }


-__kernel void arithm_s_bitwise_xor_C4_D1 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D1 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_xor_C4_D1 (__global   char *src1, int src1_step,
    }
 }

-__kernel void arithm_s_bitwise_xor_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  ushort4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D2 (
+        __global   ushort *src1, int src1_step, int src1_offset,
+        __global   ushort *dst,  int dst_step,  int dst_offset,
+        ushort4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_xor_C4_D2 (__global   ushort *src1, int src1_step
        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  short4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D3 (
+        __global   short *src1, int src1_step, int src1_offset,
+        __global   short *dst,  int dst_step,  int dst_offset,
+        short4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_xor_C4_D3 (__global   short *src1, int src1_step,
        *((__global short4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D4 (
+        __global   int *src1, int src1_step, int src1_offset,
+        __global   int *dst,  int dst_step,  int dst_offset,
+        int4 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_xor_C4_D4 (__global   int *src1, int src1_step, i
        *((__global int4 *)((__global char *)dst + dst_index)) = data;
    }
 }
-__kernel void arithm_s_bitwise_xor_C4_D5 (__global   char *src1, int src1_step, int src1_offset,
-                                  __global   char *dst,  int dst_step,  int dst_offset,
-                                  char16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D5 (
+        __global   char *src1, int src1_step, int src1_offset,
+        __global   char *dst,  int dst_step,  int dst_offset,
+        char16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (__global   char *src1, int src1_step,
    }
 }
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          short16 src2, int rows, int cols, int dst_step1)
+__kernel void arithm_s_bitwise_xor_C4_D6 (
+        __global short *src1, int src1_step, int src1_offset,
+        __global short *dst,  int dst_step,  int dst_offset,
+        short16 src2, int rows, int cols, int dst_step1)
 {

    int x = get_global_id(0);
@@ -897,11 +956,11 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, i
        short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
        short4 tmp_data_3 = src1_data_3 ^ src2_data_3;

-       *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-       *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
+        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;

    }
 }
-#endif
+#endif
--- a/Show More
+++ b/Show More