From 5b598f8a0e9048dd800d479e8abd979e012effdd Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Thu, 30 May 2013 16:20:31 +0800
Subject: [PATCH 01/27] a few fixes of ocl::perf test cases

---
 modules/ocl/perf/perf_arithm.cpp  |   5 -
 modules/ocl/perf/perf_filters.cpp |   8 +-
 modules/ocl/perf/perf_imgproc.cpp | 197 ++----------------------------
 3 files changed, 10 insertions(+), 200 deletions(-)
diff --git a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp
index 4f690e091..3ef0634e7 100644
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@@ -62,7 +62,6 @@ PERFTEST(lut)
 
             gen(src, size, size, all_type[j], 0, 256);
             gen(lut, 1, 256, CV_8UC1, 0, 1);
-            dst = src;
 
             LUT(src, lut, dst);
 
@@ -233,8 +232,6 @@ PERFTEST(Mul)
 
             gen(src1, size, size, all_type[j], 0, 256);
             gen(src2, size, size, all_type[j], 0, 256);
-            dst = src1;
-            dst.setTo(0);
 
             multiply(src1, src2, dst);
 
@@ -281,8 +278,6 @@ PERFTEST(Div)
 
             gen(src1, size, size, all_type[j], 0, 256);
             gen(src2, size, size, all_type[j], 0, 256);
-            dst = src1;
-            dst.setTo(0);
 
             divide(src1, src2, dst);
 
diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
index c8c840d0e..5cf92711e 100644
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -291,9 +291,7 @@ PERFTEST(GaussianBlur)
         {
             SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            gen(src, size, size, all_type[j], 0, 256);
-            dst = src;
-            dst.setTo(0);
+            gen(src, size, size, all_type[j], 5, 16);
 
             GaussianBlur(src, dst, Size(9, 9), 0);
 
@@ -346,8 +344,8 @@ PERFTEST(filter2D)
                 Mat kernel;
                 gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
 
-				Mat dst, ocl_dst;
-				dst.setTo(0);
+                Mat dst, ocl_dst;
+
                 cv::filter2D(src, dst, -1, kernel);
 
                 CPU_ON;
diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp
index 18c7429e8..0aef8b27e 100644
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@@ -674,8 +674,8 @@ COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size
     coor.y = static_cast<short>(y0);
     return coor;
 }
-void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::TermCriteria crit);
-void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::TermCriteria crit)
+
+static void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::TermCriteria crit)
 {
     if( src_roi.empty() )
         CV_Error( CV_StsBadArg, "The input image is empty" );
@@ -683,6 +683,8 @@ void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::T
     if( src_roi.depth() != CV_8U || src_roi.channels() != 4 )
         CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
 
+    dst_roi.create(src_roi.size(), src_roi.type());
+
     CV_Assert( (src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) );
     CV_Assert( !(dst_roi.step & 0x3) );
 
@@ -725,9 +727,6 @@ PERFTEST(meanShiftFiltering)
         SUBTEST << size << 'x' << size << "; 8UC3 vs 8UC4";
 
         gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        //gen(dst, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        dst = src;
-        dst.setTo(0);
 
         cv::TermCriteria crit(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 5, 1);
 
@@ -756,201 +755,21 @@ PERFTEST(meanShiftFiltering)
         TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
     }
 }
-///////////// meanShiftProc////////////////////////
-#if 0
-COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size size, int sp, int sr, int maxIter, float eps, int *tab)
-{
-
-    int isr2 = sr * sr;
-    int c0, c1, c2, c3;
-    int iter;
-    uchar *ptr = NULL;
-    uchar *pstart = NULL;
-    int revx = 0, revy = 0;
-    c0 = sptr[0];
-    c1 = sptr[1];
-    c2 = sptr[2];
-    c3 = sptr[3];
-
-    // iterate meanshift procedure
-    for (iter = 0; iter < maxIter; iter++)
-    {
-        int count = 0;
-        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
-
-        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
-        int minx = x0 - sp;
-        int miny = y0 - sp;
-        int maxx = x0 + sp;
-        int maxy = y0 + sp;
-
-        //deal with the image boundary
-        if (minx < 0)
-        {
-            minx = 0;
-        }
-
-        if (miny < 0)
-        {
-            miny = 0;
-        }
-
-        if (maxx >= size.width)
-        {
-            maxx = size.width - 1;
-        }
-
-        if (maxy >= size.height)
-        {
-            maxy = size.height - 1;
-        }
-
-        if (iter == 0)
-        {
-            pstart = sptr;
-        }
-        else
-        {
-            pstart = pstart + revy * sstep + (revx << 2); //point to the new position
-        }
-
-        ptr = pstart;
-        ptr = ptr + (miny - y0) * sstep + ((minx - x0) << 2); //point to the start in the row
-
-        for (int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2))
-        {
-            int rowCount = 0;
-            int x = minx;
-#if CV_ENABLE_UNROLLED
-
-            for (; x + 4 <= maxx; x += 4, ptr += 16)
-            {
-                int t0, t1, t2;
-                t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-
-                t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 1;
-                    rowCount++;
-                }
-
-                t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 2;
-                    rowCount++;
-                }
-
-                t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 3;
-                    rowCount++;
-                }
-            }
-
-#endif
-
-            for (; x <= maxx; x++, ptr += 4)
-            {
-                int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-            }
-
-            if (rowCount == 0)
-            {
-                continue;
-            }
-
-            count += rowCount;
-            sy += y * rowCount;
-        }
-
-        if (count == 0)
-        {
-            break;
-        }
-
-        int x1 = sx / count;
-        int y1 = sy / count;
-        s0 = s0 / count;
-        s1 = s1 / count;
-        s2 = s2 / count;
-
-        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
-                        tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
-
-        //revise the pointer corresponding to the new (y0,x0)
-        revx = x1 - x0;
-        revy = y1 - y0;
-
-        x0 = x1;
-        y0 = y1;
-        c0 = s0;
-        c1 = s1;
-        c2 = s2;
-
-        if (stopFlag)
-        {
-            break;
-        }
-    } //for iter
-
-    dptr[0] = (uchar)c0;
-    dptr[1] = (uchar)c1;
-    dptr[2] = (uchar)c2;
-    dptr[3] = (uchar)c3;
-
-    COOR coor;
-    coor.x = static_cast<short>(x0);
-    coor.y = static_cast<short>(y0);
-    return coor;
-}
-#endif
 
 void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, cv::TermCriteria crit)
 {
-
     if (src_roi.empty())
     {
         CV_Error(CV_StsBadArg, "The input image is empty");
     }
-
     if (src_roi.depth() != CV_8U || src_roi.channels() != 4)
     {
         CV_Error(CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
     }
 
+    dst_roi.create(src_roi.size(), src_roi.type());
+    dstCoor_roi.create(src_roi.size(), CV_16SC2);
+
     CV_Assert((src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
               (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
     CV_Assert(!(dstCoor_roi.step & 0x3));
@@ -1008,8 +827,6 @@ PERFTEST(meanShiftProc)
         SUBTEST << size << 'x' << size << "; 8UC4 and CV_16SC2 ";
 
         gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(dst[0], size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(dst[1], size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
 
         meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);
 

From fdc133d8c9e112d4a20ffc9f2efdfbb7754f048b Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Thu, 30 May 2013 09:37:05 +0800
Subject: [PATCH 02/27] Fix ocl::pyrup kernel build on Mac.

---
 modules/ocl/src/opencl/pyr_up.cl | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl
index ef41c9408..4afa7b710 100644
--- a/modules/ocl/src/opencl/pyr_up.cl
+++ b/modules/ocl/src/opencl/pyr_up.cl
@@ -389,8 +389,8 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
 
     float4 sum = (float4)(0,0,0,0);
 
-    const int evenFlag = (int)((tidx & 1) == 0);
-    const int oddFlag = (int)((tidx & 1) != 0);
+    const float4 evenFlag = (float4)((tidx & 1) == 0);
+    const float4 oddFlag = (float4)((tidx & 1) != 0);
     const bool  eveny = ((tidy & 1) == 0);
 
     float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
@@ -455,6 +455,7 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
         dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum);
     }
 }
+
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_16UC4 //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
@@ -492,8 +493,8 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
 
     float4 sum = (float4)(0,0,0,0);
 
-    const int evenFlag = (int)((get_local_id(0) & 1) == 0);
-    const int oddFlag = (int)((get_local_id(0) & 1) != 0);
+    const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0);
+    const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0);
     const bool  eveny = ((get_local_id(1) & 1) == 0);
     const int tidx = get_local_id(0);
 
@@ -604,8 +605,8 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
 
     float4 sum = (float4)(0,0,0,0);
 
-    const int evenFlag = (int)((tidx & 1) == 0);
-    const int oddFlag = (int)((tidx & 1) != 0);
+    const float4 evenFlag = (float4)((tidx & 1) == 0);
+    const float4 oddFlag = (float4)((tidx & 1) != 0);
     const bool  eveny = ((tidy & 1) == 0);
 
     float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
@@ -669,4 +670,4 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
     {
         dst[x + y * dstStep] = 4.0f * sum;
     }
-}
\ No newline at end of file
+}

From b1c248fcc9eaa24baabf55fde7fcff096d62dcb2 Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Fri, 31 May 2013 10:53:52 +0800
Subject: [PATCH 03/27] Fix ocl::filter2D. In current implementation, this
 function only works when anchor point is in the kernel center and kernel size
 supported is either 3x3 or 5x5.

---
 modules/ocl/include/opencv2/ocl/ocl.hpp       |  2 ++
 modules/ocl/src/filtering.cpp                 | 10 +++++++---
 modules/ocl/src/opencl/filtering_laplacian.cl | 12 ++++++------
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index 785248cfc..01b0f72d2 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -689,6 +689,8 @@ namespace cv
         }
 
         //! applies non-separable 2D linear filter to the image
+        //  Note, at the moment this function only works when anchor point is in the kernel center
+        //  and kernel size supported is either 3x3 or 5x5; otherwise the function will fail to output valid result
         CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel,
                                  Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 
diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp
index 56a70ae53..f35a26e33 100644
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -645,7 +645,11 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
     args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
     args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
 
-    openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth);
+    const int buffer_size = 100;
+    char opt_buffer [buffer_size] = "";
+    sprintf(opt_buffer, "-DANCHOR=%d -DANX=%d -DANY=%d", ksize.width, anchor.x, anchor.y);
+
+    openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth, opt_buffer);
 }
 Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
         Point anchor, int borderType)
@@ -656,7 +660,7 @@ Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const
 
     oclMat gpu_krnl;
     int nDivisor;
-    normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
+    normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, false);
     normalizeAnchor(anchor, ksize);
 
     return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)],
@@ -1172,7 +1176,7 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel
     args.push_back(make_pair(sizeof(cl_int), (void *)&ridusy));
     args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
 
-    openCLExecuteKernel2(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option, CLFLUSH);
+    openCLExecuteKernel(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option);
 }
 
 Ptr<BaseRowFilter_GPU> cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufType*/, const Mat &rowKernel, int anchor, int bordertype)
diff --git a/modules/ocl/src/opencl/filtering_laplacian.cl b/modules/ocl/src/opencl/filtering_laplacian.cl
index 96a2f51ef..8535eb1a5 100644
--- a/modules/ocl/src/opencl/filtering_laplacian.cl
+++ b/modules/ocl/src/opencl/filtering_laplacian.cl
@@ -82,9 +82,9 @@
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////Macro for define elements number per thread/////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-#define ANCHOR                  3
-#define ANX                     1
-#define ANY                     1
+//#define ANCHOR                  3
+//#define ANX                     1
+//#define ANY                     1
 
 #define ROWS_PER_GROUP          4
 #define ROWS_PER_GROUP_BITS     2
@@ -185,7 +185,7 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
 
         for(int i = 0; i < ANCHOR; i++)
         {
-#pragma unroll 3
+#pragma unroll
             for(int j = 0; j < ANCHOR; j++)
             {
                 if(dst_rows_index < dst_rows_end)
@@ -295,7 +295,7 @@ __kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x
 
         for(int i = 0; i < ANCHOR; i++)
         {
-#pragma unroll 3
+#pragma unroll
             for(int j = 0; j < ANCHOR; j++)
             {
                 if(dst_rows_index < dst_rows_end)
@@ -410,7 +410,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
 
         for(int i = 0; i < ANCHOR; i++)
         {
-#pragma unroll 3
+#pragma unroll
             for(int j = 0; j < ANCHOR; j++)
             {
                 if(dst_rows_index < dst_rows_end)

From abefcc60612617cdc48b278e347bdf0df4b0139b Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Fri, 31 May 2013 15:16:03 +0800
Subject: [PATCH 04/27] Adjust perf_filters, as this function only supports 3x3
 kernel

---
 modules/ocl/perf/perf_filters.cpp | 47 +++++++++++++++----------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
index 5cf92711e..a05301b34 100644
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -337,39 +337,38 @@ PERFTEST(filter2D)
         {
             gen(src, size, size, all_type[j], 0, 256);
 
-            for (int ksize = 3; ksize <= 15; ksize = 2*ksize+1)
-            {
-                SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;
+            const int ksize = 3;
 
-                Mat kernel;
-                gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
+            SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;
 
-                Mat dst, ocl_dst;
+            Mat kernel;
+            gen(kernel, ksize, ksize, CV_32SC1, -3.0, 3.0);
 
-                cv::filter2D(src, dst, -1, kernel);
+            Mat dst, ocl_dst;
 
-                CPU_ON;
-                cv::filter2D(src, dst, -1, kernel);
-                CPU_OFF;
+            cv::filter2D(src, dst, -1, kernel);
 
-                ocl::oclMat d_src(src), d_dst;
+            CPU_ON;
+            cv::filter2D(src, dst, -1, kernel);
+            CPU_OFF;
 
-                WARMUP_ON;
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                WARMUP_OFF;
+            ocl::oclMat d_src(src), d_dst;
 
-                GPU_ON;
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                GPU_OFF;
+            WARMUP_ON;
+            ocl::filter2D(d_src, d_dst, -1, kernel);
+            WARMUP_OFF;
 
-                GPU_FULL_ON;
-                d_src.upload(src);
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                d_dst.download(ocl_dst);
-                GPU_FULL_OFF;
+            GPU_ON;
+            ocl::filter2D(d_src, d_dst, -1, kernel);
+            GPU_OFF;
 
-                TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
-            }
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::filter2D(d_src, d_dst, -1, kernel);
+            d_dst.download(ocl_dst);
+            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
 
         }
 

From 15a213d3fca17d785441044d7126300d4f4ed4ef Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Fri, 31 May 2013 15:35:54 +0800
Subject: [PATCH 05/27] fix a crash on Linux

---
 modules/ocl/src/pyrlk.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp
index a3e65dde3..8e9420480 100644
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@@ -142,7 +142,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
             int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
             openCLSafeCall(clReleaseKernel(kernel));
 
-            static char opt[16] = {0};
+            static char opt[32] = {0};
             sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
 
             openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, 

From d85f27b53791c144c70a7746ab3002d8d281e30e Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Fri, 31 May 2013 16:06:56 +0800
Subject: [PATCH 06/27] Update ocl::surf_matcher sample. The new sample adjust
 some parameters thus it should always be able to calculate valid homography
 matrix when input is box.png and box_in_scene.png. Pure cpp surf and
 bfmatcher implementation is also added to show the user its accuracy and
 performance.

---
 samples/ocl/surf_matcher.cpp | 413 ++++++++++++++++++++++++-----------
 1 file changed, 285 insertions(+), 128 deletions(-)

diff --git a/samples/ocl/surf_matcher.cpp b/samples/ocl/surf_matcher.cpp
index ea6ee97cb..8734e3f81 100644
--- a/samples/ocl/surf_matcher.cpp
+++ b/samples/ocl/surf_matcher.cpp
@@ -46,156 +46,101 @@
 #include <iostream>
 #include <stdio.h>
 #include "opencv2/core/core.hpp"
-#include "opencv2/features2d/features2d.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/ocl/ocl.hpp"
-#include "opencv2/nonfree/nonfree.hpp"
 #include "opencv2/nonfree/ocl.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
+#include "opencv2/nonfree/nonfree.hpp"
 
-using namespace std;
 using namespace cv;
 using namespace cv::ocl;
 
-//#define USE_CPU_DESCRIPTOR // use cpu descriptor extractor until ocl descriptor extractor is fixed
-//#define USE_CPU_BFMATCHER
+const int LOOP_NUM = 10;
+const int GOOD_PTS_MAX = 50;
+const float GOOD_PORTION = 0.15f;
+
+namespace
+{
 void help();
 
 void help()
 {
-    cout << "\nThis program demonstrates using SURF_OCL features detector and descriptor extractor" << endl;
-    cout << "\nUsage:\n\tsurf_matcher --left <image1> --right <image2>" << endl;
+    std::cout << "\nThis program demonstrates using SURF_OCL features detector and descriptor extractor" << std::endl;
+    std::cout << "\nUsage:\n\tsurf_matcher --left <image1> --right <image2> [-c]" << std::endl;
+    std::cout << "\nExample:\n\tsurf_matcher --left box.png --right box_in_scene.png" << std::endl;
 }
 
+int64 work_begin = 0;
+int64 work_end = 0;
 
-////////////////////////////////////////////////////
-// This program demonstrates the usage of SURF_OCL.
-// use cpu findHomography interface to calculate the transformation matrix
-int main(int argc, char* argv[])
+void workBegin() 
+{ 
+    work_begin = getTickCount();
+}
+void workEnd()
 {
-    if (argc != 5 && argc != 1)
+    work_end = getTickCount() - work_begin;
+}
+double getTime(){
+    return work_end /((double)cvGetTickFrequency() * 1000.);
+}
+
+template<class KPDetector>
+struct SURFDetector
+{
+    KPDetector surf;
+    SURFDetector(double hessian = 800.0)
+        :surf(hessian)
     {
-        help();
-        return -1;
     }
-    vector<cv::ocl::Info> info;
-    if(!cv::ocl::getDevice(info))
+    template<class T>
+    void operator()(const T& in, const T& mask, vector<cv::KeyPoint>& pts, T& descriptors, bool useProvided = false)
     {
-        cout << "Error: Did not find a valid OpenCL device!" << endl;
-        return -1;
+        surf(in, mask, pts, descriptors, useProvided);
     }
-    Mat cpu_img1, cpu_img2, cpu_img1_grey, cpu_img2_grey;
-    oclMat img1, img2;
-    if(argc != 5)
+};
+
+template<class KPMatcher>
+struct SURFMatcher
+{
+    KPMatcher matcher;
+    template<class T>
+    void match(const T& in1, const T& in2, vector<cv::DMatch>& matches)
     {
-        cpu_img1 = imread("o.png");
-        cvtColor(cpu_img1, cpu_img1_grey, CV_BGR2GRAY);
-        img1     = cpu_img1_grey;
-        CV_Assert(!img1.empty());
-
-        cpu_img2 = imread("r2.png");
-        cvtColor(cpu_img2, cpu_img2_grey, CV_BGR2GRAY);
-        img2     = cpu_img2_grey;
-    }
-    else
-    {
-        for (int i = 1; i < argc; ++i)
-        {
-            if (string(argv[i]) == "--left")
-            {
-                cpu_img1 = imread(argv[++i]);
-                cvtColor(cpu_img1, cpu_img1_grey, CV_BGR2GRAY);
-                img1     = cpu_img1_grey;
-                CV_Assert(!img1.empty());
-            }
-            else if (string(argv[i]) == "--right")
-            {
-                cpu_img2 = imread(argv[++i]);
-                cvtColor(cpu_img2, cpu_img2_grey, CV_BGR2GRAY);
-                img2     = cpu_img2_grey;
-            }
-            else if (string(argv[i]) == "--help")
-            {
-                help();
-                return -1;
-            }
-        }
+        matcher.match(in1, in2, matches);
     }
+};
 
-    SURF_OCL surf;
-    //surf.hessianThreshold = 400.f;
-    //surf.extended = false;
-
-    // detecting keypoints & computing descriptors
-    oclMat keypoints1GPU, keypoints2GPU;
-    oclMat descriptors1GPU, descriptors2GPU;
-
-    // downloading results
-    vector<KeyPoint> keypoints1, keypoints2;
-    vector<DMatch> matches;
-
-
-#ifndef USE_CPU_DESCRIPTOR
-    surf(img1, oclMat(), keypoints1GPU, descriptors1GPU);
-    surf(img2, oclMat(), keypoints2GPU, descriptors2GPU);
-
-    surf.downloadKeypoints(keypoints1GPU, keypoints1);
-    surf.downloadKeypoints(keypoints2GPU, keypoints2);
-
-
-#ifdef USE_CPU_BFMATCHER
-    //BFMatcher
-    BFMatcher matcher(cv::NORM_L2);
-    matcher.match(Mat(descriptors1GPU), Mat(descriptors2GPU), matches);
-#else
-    BruteForceMatcher_OCL_base matcher(BruteForceMatcher_OCL_base::L2Dist);
-    matcher.match(descriptors1GPU, descriptors2GPU, matches);
-#endif
-
-#else
-    surf(img1, oclMat(), keypoints1GPU);
-    surf(img2, oclMat(), keypoints2GPU);
-    surf.downloadKeypoints(keypoints1GPU, keypoints1);
-    surf.downloadKeypoints(keypoints2GPU, keypoints2);
-
-    // use SURF_OCL to detect keypoints and use SURF to extract descriptors
-    SURF surf_cpu;
-    Mat descriptors1, descriptors2;
-    surf_cpu(cpu_img1, Mat(), keypoints1, descriptors1, true);
-    surf_cpu(cpu_img2, Mat(), keypoints2, descriptors2, true);
-    matcher.match(descriptors1, descriptors2, matches);
-#endif
-    cout << "OCL: FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
-    cout << "OCL: FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
-
-    double max_dist = 0; double min_dist = 100;
-    //-- Quick calculation of max and min distances between keypoints
-    for( size_t i = 0; i < keypoints1.size(); i++ )
-    {
-        double dist = matches[i].distance;
-        if( dist < min_dist ) min_dist = dist;
-        if( dist > max_dist ) max_dist = dist;
-    }
-
-    printf("-- Max dist : %f \n", max_dist );
-    printf("-- Min dist : %f \n", min_dist );
-
-    //-- Draw only "good" matches (i.e. whose distance is less than 2.5*min_dist )
+Mat drawGoodMatches(
+    const Mat& cpu_img1,
+    const Mat& cpu_img2,
+    const vector<KeyPoint>& keypoints1, 
+    const vector<KeyPoint>& keypoints2, 
+    vector<DMatch>& matches,
+    vector<Point2f>& scene_corners_
+    )
+{
+    //-- Sort matches and preserve top 10% matches 
+    std::sort(matches.begin(), matches.end());
     std::vector< DMatch > good_matches;
+    double minDist = matches.front().distance,
+        maxDist = matches.back().distance;
 
-    for( size_t i = 0; i < keypoints1.size(); i++ )
+    const int ptsPairs = std::min(GOOD_PTS_MAX, (int)(matches.size() * GOOD_PORTION));
+    for( int i = 0; i < ptsPairs; i++ )
     {
-        if( matches[i].distance < 3*min_dist )
-        {
-            good_matches.push_back( matches[i]);
-        }
+        good_matches.push_back( matches[i] );
     }
+    std::cout << "\nMax distance: " << maxDist << std::endl;
+    std::cout << "Min distance: " << minDist << std::endl;
+
+    std::cout << "Calculating homography using " << ptsPairs << " point pairs." << std::endl;
 
     // drawing the results
     Mat img_matches;
     drawMatches( cpu_img1, keypoints1, cpu_img2, keypoints2,
         good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
-        vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS );
+        vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS  );
 
     //-- Localize the object
     std::vector<Point2f> obj;
@@ -207,26 +152,238 @@ int main(int argc, char* argv[])
         obj.push_back( keypoints1[ good_matches[i].queryIdx ].pt );
         scene.push_back( keypoints2[ good_matches[i].trainIdx ].pt );
     }
-    Mat H = findHomography( obj, scene, CV_RANSAC );
-
     //-- Get the corners from the image_1 ( the object to be "detected" )
     std::vector<Point2f> obj_corners(4);
     obj_corners[0] = cvPoint(0,0); obj_corners[1] = cvPoint( cpu_img1.cols, 0 );
     obj_corners[2] = cvPoint( cpu_img1.cols, cpu_img1.rows ); obj_corners[3] = cvPoint( 0, cpu_img1.rows );
     std::vector<Point2f> scene_corners(4);
-
+    
+    Mat H = findHomography( obj, scene, CV_RANSAC );
     perspectiveTransform( obj_corners, scene_corners, H);
 
+    scene_corners_ = scene_corners;
+    
     //-- Draw lines between the corners (the mapped object in the scene - image_2 )
-    line( img_matches, scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), Scalar( 0, 255, 0), 4 );
-    line( img_matches, scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), Scalar( 0, 255, 0), 4 );
-    line( img_matches, scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), Scalar( 0, 255, 0), 4 );
-    line( img_matches, scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), Scalar( 0, 255, 0), 4 );
+    line( img_matches, 
+        scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), 
+        Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches, 
+        scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), 
+        Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches, 
+        scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), 
+        Scalar( 0, 255, 0), 2, CV_AA );
+    line( img_matches, 
+        scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), 
+        Scalar( 0, 255, 0), 2, CV_AA );
+    return img_matches;
+}
+
+}
+////////////////////////////////////////////////////
+// This program demonstrates the usage of SURF_OCL.
+// use cpu findHomography interface to calculate the transformation matrix
+int main(int argc, char* argv[])
+{
+    vector<cv::ocl::Info> info;
+    if(cv::ocl::getDevice(info) == 0)
+    {
+        std::cout << "Error: Did not find a valid OpenCL device!" << std::endl;
+        return -1;
+    }
+    ocl::setDevice(info[0]);
+
+    Mat cpu_img1, cpu_img2, cpu_img1_grey, cpu_img2_grey;
+    oclMat img1, img2;
+    bool useCPU = false;
+    bool useGPU = false;
+    bool useALL = false;
+
+    for (int i = 1; i < argc; ++i)
+    {
+        if (string(argv[i]) == "--left")
+        {
+            cpu_img1 = imread(argv[++i]);
+            CV_Assert(!cpu_img1.empty());
+            cvtColor(cpu_img1, cpu_img1_grey, CV_BGR2GRAY);
+            img1 = cpu_img1_grey;
+        }
+        else if (string(argv[i]) == "--right")
+        {
+            cpu_img2 = imread(argv[++i]);
+            CV_Assert(!cpu_img2.empty());
+            cvtColor(cpu_img2, cpu_img2_grey, CV_BGR2GRAY);
+            img2 = cpu_img2_grey;
+        }
+        else if (string(argv[i]) == "-c")
+        {
+            useCPU = true;
+            useGPU = false;
+            useALL = false;
+        }else if(string(argv[i]) == "-g")
+        {
+            useGPU = true;
+            useCPU = false;
+            useALL = false;
+        }else if(string(argv[i]) == "-a")
+        {
+            useALL = true;
+            useCPU = false;
+            useGPU = false;
+        }
+        else if (string(argv[i]) == "--help")
+        {
+            help();
+            return -1;
+        }
+    }
+    if(!useCPU)
+    {
+        std::cout
+            << "Device name:"
+            << info[0].DeviceName[0]
+        << std::endl;
+    }
+    double surf_time = 0.;
+
+    //declare input/output
+    vector<KeyPoint> keypoints1, keypoints2;
+    vector<DMatch> matches;
+
+    vector<KeyPoint> gpu_keypoints1;
+    vector<KeyPoint> gpu_keypoints2;
+    vector<DMatch> gpu_matches;
+
+    Mat descriptors1CPU, descriptors2CPU;
+
+    oclMat keypoints1GPU, keypoints2GPU;
+    oclMat descriptors1GPU, descriptors2GPU;
+
+    //instantiate detectors/matchers
+    SURFDetector<SURF>     cpp_surf;
+    SURFDetector<SURF_OCL> ocl_surf;
+    
+    SURFMatcher<BFMatcher>      cpp_matcher;
+    SURFMatcher<BFMatcher_OCL>  ocl_matcher;
+
+    //-- start of timing section
+    if (useCPU) 
+    {
+        for (int i = 0; i <= LOOP_NUM; i++)
+        {
+            if(i == 1) workBegin();
+            cpp_surf(cpu_img1_grey, Mat(), keypoints1, descriptors1CPU);
+            cpp_surf(cpu_img2_grey, Mat(), keypoints2, descriptors2CPU);
+            cpp_matcher.match(descriptors1CPU, descriptors2CPU, matches);
+        }
+        workEnd();
+        std::cout << "CPP: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+        std::cout << "CPP: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+        surf_time = getTime();
+        std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
+    }
+    else if(useGPU)
+    {
+        for (int i = 0; i <= LOOP_NUM; i++)
+        {
+            if(i == 1) workBegin();
+            ocl_surf(img1, oclMat(), keypoints1, descriptors1GPU);
+            ocl_surf(img2, oclMat(), keypoints2, descriptors2GPU);
+            ocl_matcher.match(descriptors1GPU, descriptors2GPU, matches);
+        }
+        workEnd();
+        std::cout << "OCL: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+        std::cout << "OCL: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+        surf_time = getTime();
+        std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
+    }else
+    {
+        //cpu runs
+        for (int i = 0; i <= LOOP_NUM; i++)
+        {
+            if(i == 1) workBegin();
+            cpp_surf(cpu_img1_grey, Mat(), keypoints1, descriptors1CPU);
+            cpp_surf(cpu_img2_grey, Mat(), keypoints2, descriptors2CPU);
+            cpp_matcher.match(descriptors1CPU, descriptors2CPU, matches);
+        }
+        workEnd();
+        std::cout << "\nCPP: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+        std::cout << "CPP: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+        surf_time = getTime();
+        std::cout << "(CPP)SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl;
+
+        //gpu runs
+        for (int i = 0; i <= LOOP_NUM; i++)
+        {
+            if(i == 1) workBegin();
+            ocl_surf(img1, oclMat(), gpu_keypoints1, descriptors1GPU);
+            ocl_surf(img2, oclMat(), gpu_keypoints2, descriptors2GPU);
+            ocl_matcher.match(descriptors1GPU, descriptors2GPU, gpu_matches);
+        }
+        workEnd();
+        std::cout << "\nOCL: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+        std::cout << "OCL: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+        surf_time = getTime();
+        std::cout << "(OCL)SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
+
+    }
+
+    //--------------------------------------------------------------------------
+    std::vector<Point2f> cpu_corner;
+    Mat img_matches = drawGoodMatches(cpu_img1, cpu_img2, keypoints1, keypoints2, matches, cpu_corner);
+
+    std::vector<Point2f> gpu_corner;
+    Mat ocl_img_matches;
+    if(useALL || (!useCPU&&!useGPU))
+    {
+        ocl_img_matches = drawGoodMatches(cpu_img1, cpu_img2, gpu_keypoints1, gpu_keypoints2, gpu_matches, gpu_corner);
+
+        //check accuracy
+        std::cout<<"\nCheck accuracy:\n";
+
+        if(cpu_corner.size()!=gpu_corner.size())
+            std::cout<<"Failed\n";
+        else
+        {
+            bool result = false;
+            for(int i = 0; i < cpu_corner.size(); i++)
+            {
+                if((std::abs(cpu_corner[i].x - gpu_corner[i].x) > 10)
+                    ||(std::abs(cpu_corner[i].y - gpu_corner[i].y) > 10))
+                {
+                    std::cout<<"Failed\n";
+                    result = false;
+                    break;
+                }
+                result = true;
+            } 
+            if(result)
+                std::cout<<"Passed\n";
+        }
+    }
 
     //-- Show detected matches
-    namedWindow("ocl surf matches", 0);
-    imshow("ocl surf matches", img_matches);
-    waitKey(0);
+    if (useCPU)
+    {
+        namedWindow("cpu surf matches", 0);
+        imshow("cpu surf matches", img_matches);
+    }
+    else if(useGPU)
+    {
+        namedWindow("ocl surf matches", 0);
+        imshow("ocl surf matches", img_matches);
+    }else
+    {
+        namedWindow("cpu surf matches", 0);
+        imshow("cpu surf matches", img_matches);
 
+        namedWindow("ocl surf matches", 0);
+        imshow("ocl surf matches", ocl_img_matches);
+    }
+    waitKey(0);
     return 0;
 }

From 97b86aa259ec1b33d1c5b28bd04d6247e7671a93 Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Fri, 31 May 2013 16:48:40 +0800
Subject: [PATCH 07/27] Initialize OpenCL context at the end of getDevice call.
 Added for better compatibility with the current samples/test cases. User now
 will be able to initialize OpenCL context explicitly with ocl::getDevice api.
 This may be obsoleted in future releases.

---
 modules/ocl/src/initialization.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index fd462ad91..a9cd08b9f 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -333,6 +333,10 @@ namespace cv
                     oclinfo.push_back(ocltmpinfo);
                 }
             }
+            if(devcienums > 0)
+            {
+                setDevice(oclinfo[0]);
+            }
             return devcienums;
         }
 

From cdb16f112074697445cc3c960142bc05dbfa916b Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Fri, 31 May 2013 17:29:55 +0800
Subject: [PATCH 08/27] Fix build error

---
 samples/ocl/surf_matcher.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/ocl/surf_matcher.cpp b/samples/ocl/surf_matcher.cpp
index 8734e3f81..038a8dc5c 100644
--- a/samples/ocl/surf_matcher.cpp
+++ b/samples/ocl/surf_matcher.cpp
@@ -350,7 +350,7 @@ int main(int argc, char* argv[])
         else
         {
             bool result = false;
-            for(int i = 0; i < cpu_corner.size(); i++)
+            for(size_t i = 0; i < cpu_corner.size(); i++)
             {
                 if((std::abs(cpu_corner[i].x - gpu_corner[i].x) > 10)
                     ||(std::abs(cpu_corner[i].y - gpu_corner[i].y) > 10))

From 8a4090fe3f6daa00f5421e2b9fefd2f14790c126 Mon Sep 17 00:00:00 2001
From: Roman Donchenko <roman.donchenko@itseez.com>
Date: Fri, 31 May 2013 17:27:42 +0400
Subject: [PATCH 09/27] Make AutoLock noncopyable (it would break on copying,
 anyway).

---
 modules/core/include/opencv2/core/core.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp
index 1c8e0e2ca..2b7791958 100644
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -4813,6 +4813,9 @@ public:
     ~AutoLock() { mutex->unlock(); }
 protected:
     Mutex* mutex;
+private:
+    AutoLock(const AutoLock&);
+    AutoLock& operator = (const AutoLock&);
 };
 
 }

From 6f006e50dc2cff6ba8b316640d769db1ac08e2ca Mon Sep 17 00:00:00 2001
From: Andrey Pavlenko <andrey.pavlenko@itseez.com>
Date: Fri, 31 May 2013 18:58:30 +0400
Subject: [PATCH 10/27] setting 'char' to be signed by default since some tests
 fail when it's wrong (e.g. native compilation on ARM Linux)

---
 cmake/OpenCVCompilerOptions.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index aeed112ae..bded2d41f 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -47,6 +47,9 @@ macro(add_extra_compiler_option option)
   endif()
 endmacro()
 
+# some OpenCV tests fail when 'char' is 'unsigned' by default
+add_extra_compiler_option(-fsigned-char)
+
 if(MINGW)
   # http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
   # here we are trying to workaround the problem

From 081c47e3df9819e025520ea7b77cc627cd4ccdbe Mon Sep 17 00:00:00 2001
From: Andrey Pavlenko <andrey.pavlenko@itseez.com>
Date: Fri, 31 May 2013 19:55:51 +0400
Subject: [PATCH 11/27] making the comment less ambigous

---
 cmake/OpenCVCompilerOptions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index bded2d41f..7a91b188a 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -47,7 +47,7 @@ macro(add_extra_compiler_option option)
   endif()
 endmacro()
 
-# some OpenCV tests fail when 'char' is 'unsigned' by default
+# OpenCV fails some tests when 'char' is 'unsigned' by default
 add_extra_compiler_option(-fsigned-char)
 
 if(MINGW)

From ff28bf831f156eceba443479fc08aafbe32b1ee6 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 3 Jun 2013 14:01:04 +0400
Subject: [PATCH 12/27] disabled samples with driver api

---
 samples/gpu/cascadeclassifier_nvidia_api.cpp | 15 ++++++++++++---
 samples/gpu/driver_api_multi.cpp             |  6 +++++-
 samples/gpu/driver_api_stereo_multi.cpp      |  6 +++++-
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/samples/gpu/cascadeclassifier_nvidia_api.cpp b/samples/gpu/cascadeclassifier_nvidia_api.cpp
index 99c95ab97..98195b35c 100644
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -17,12 +17,21 @@ using namespace std;
 using namespace cv;
 
 
-#if !defined(HAVE_CUDA)
+#if !defined(HAVE_CUDA) || defined(__arm__)
+
 int main( int, const char** )
 {
-    cout << "Please compile the library with CUDA support" << endl;
-    return -1;
+#if !defined(HAVE_CUDA)
+    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true)." << std::endl;
+#endif
+
+#if defined(__arm__)
+    std::cout << "Unsupported for ARM CUDA library." << std::endl;
+#endif
+
+    return 0;
 }
+
 #else
 
 
diff --git a/samples/gpu/driver_api_multi.cpp b/samples/gpu/driver_api_multi.cpp
index 2d743f0e9..c829830e7 100644
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@@ -11,7 +11,7 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/gpu/gpu.hpp"
 
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB) || defined(__arm__)
 
 int main()
 {
@@ -23,6 +23,10 @@ int main()
     std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
 #endif
 
+#if defined(__arm__)
+    std::cout << "Unsupported for ARM CUDA library." << std::endl;
+#endif
+
     return 0;
 }
 
diff --git a/samples/gpu/driver_api_stereo_multi.cpp b/samples/gpu/driver_api_stereo_multi.cpp
index 10c397477..d4d0af451 100644
--- a/samples/gpu/driver_api_stereo_multi.cpp
+++ b/samples/gpu/driver_api_stereo_multi.cpp
@@ -13,7 +13,7 @@
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/gpu/gpu.hpp"
 
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB) || defined(__arm__)
 
 int main()
 {
@@ -25,6 +25,10 @@ int main()
     std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
 #endif
 
+#if defined(__arm__)
+    std::cout << "Unsupported for ARM CUDA library." << std::endl;
+#endif
+
     return 0;
 }
 

From dc937c10f994029c7c83b2f2ffa8106ca1354257 Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Tue, 4 Jun 2013 11:31:54 +0800
Subject: [PATCH 13/27] change a test image of pyrlk

---
 modules/ocl/perf/perf_opticalflow.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/ocl/perf/perf_opticalflow.cpp b/modules/ocl/perf/perf_opticalflow.cpp
index 988797020..97283b206 100644
--- a/modules/ocl/perf/perf_opticalflow.cpp
+++ b/modules/ocl/perf/perf_opticalflow.cpp
@@ -48,8 +48,8 @@
 ///////////// PyrLKOpticalFlow ////////////////////////
 PERFTEST(PyrLKOpticalFlow)
 {
-    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
-    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};
+    std::string images1[] = {"rubberwhale1.png", "basketball1.png"};
+    std::string images2[] = {"rubberwhale2.png", "basketball2.png"};
 
     for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
     {

From a7a94de74ae46f99b8ab7606dab1faef93c6e7fa Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Tue, 4 Jun 2013 15:55:33 +0800
Subject: [PATCH 14/27] Fix a bug of gfft. When user provided corners buffer is
 big enough to be copied to from tmpCorners_, we allow the buffer to be reused
 other than allocate a new cl_mem object.

---
 modules/ocl/src/gfft.cpp          | 3 ++-
 modules/ocl/test/test_optflow.cpp | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/ocl/src/gfft.cpp b/modules/ocl/src/gfft.cpp
index af7580bd4..7fd5e3a17 100644
--- a/modules/ocl/src/gfft.cpp
+++ b/modules/ocl/src/gfft.cpp
@@ -257,7 +257,8 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
     
     if (minDistance < 1)
     {
-        corners = tmpCorners_(Rect(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1));
+        Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
+        tmpCorners_(roi_range).copyTo(corners);
     }
     else
     {
diff --git a/modules/ocl/test/test_optflow.cpp b/modules/ocl/test/test_optflow.cpp
index 0121be8f9..72689f3f7 100644
--- a/modules/ocl/test/test_optflow.cpp
+++ b/modules/ocl/test/test_optflow.cpp
@@ -121,7 +121,7 @@ TEST_P(GoodFeaturesToTrack, EmptyCorners)
 
     cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
 
-    cv::ocl::oclMat src(100, 100, CV_8UC1, cv::Scalar::all(0));
+    cv::ocl::oclMat src(100, 128, CV_8UC1, cv::Scalar::all(0));
     cv::ocl::oclMat corners(1, maxCorners, CV_32FC2);
 
     detector(src, corners);

From f049aa7674f3f3ef985f14e65de3debfeb163b3a Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Tue, 4 Jun 2013 15:59:21 +0800
Subject: [PATCH 15/27] use GoodFeaturesToTrackDetector_OCL

---
 samples/ocl/pyrlk_optical_flow.cpp | 54 ++++++++++++++----------------
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/samples/ocl/pyrlk_optical_flow.cpp b/samples/ocl/pyrlk_optical_flow.cpp
index 1b2b1d379..cc8d886f7 100644
--- a/samples/ocl/pyrlk_optical_flow.cpp
+++ b/samples/ocl/pyrlk_optical_flow.cpp
@@ -29,6 +29,7 @@ static double getTime(){
 
 static void download(const oclMat& d_mat, vector<Point2f>& vec)
 {
+    vec.clear();
     vec.resize(d_mat.cols);
     Mat mat(1, d_mat.cols, CV_32FC2, (void*)&vec[0]);
     d_mat.download(mat);
@@ -36,6 +37,7 @@ static void download(const oclMat& d_mat, vector<Point2f>& vec)
 
 static void download(const oclMat& d_mat, vector<uchar>& vec)
 {
+    vec.clear();
     vec.resize(d_mat.cols);
     Mat mat(1, d_mat.cols, CV_8UC1, (void*)&vec[0]);
     d_mat.download(mat);
@@ -119,14 +121,15 @@ int main(int argc, const char* argv[])
     bool useCPU = cmd.get<bool>("s");
     bool useCamera = cmd.get<bool>("c");
     int inputName = cmd.get<int>("c");
-    oclMat d_nextPts, d_status;
 
+    oclMat d_nextPts, d_status;
+    GoodFeaturesToTrackDetector_OCL d_features(points);
     Mat frame0 = imread(fname0, cv::IMREAD_GRAYSCALE);
     Mat frame1 = imread(fname1, cv::IMREAD_GRAYSCALE);
     PyrLKOpticalFlow d_pyrLK;
-    vector<cv::Point2f> pts;
-    vector<cv::Point2f> nextPts;
-    vector<unsigned char> status;
+    vector<cv::Point2f> pts(points);
+    vector<cv::Point2f> nextPts(points);
+    vector<unsigned char> status(points);
     vector<float> err;
 
     if (frame0.empty() || frame1.empty())
@@ -199,29 +202,24 @@ int main(int argc, const char* argv[])
                     ptr1 = frame0Gray;
                 }
 
-                pts.clear();
-
-                cv::goodFeaturesToTrack(ptr0, pts, points, 0.01, 0.0);
-
-                if (pts.size() == 0)
-                {
-                    continue;
-                }
-
                 if (useCPU)
                 {
-                    cv::calcOpticalFlowPyrLK(ptr0, ptr1, pts, nextPts, status, err);
+                    pts.clear();
+                    goodFeaturesToTrack(ptr0, pts, points, 0.01, 0.0);
+                    if(pts.size() == 0)
+                        continue;
+                    calcOpticalFlowPyrLK(ptr0, ptr1, pts, nextPts, status, err);
                 }
                 else
                 {
-                    oclMat d_prevPts(1, points, CV_32FC2, (void*)&pts[0]);
-
-                    d_pyrLK.sparse(oclMat(ptr0), oclMat(ptr1), d_prevPts, d_nextPts, d_status);
-
-                    download(d_prevPts, pts);
+                    oclMat d_img(ptr0), d_prevPts;
+                    d_features(d_img, d_prevPts);
+                    if(!d_prevPts.rows || !d_prevPts.cols)
+                        continue;
+                    d_pyrLK.sparse(d_img, oclMat(ptr1), d_prevPts, d_nextPts, d_status);
+                    d_features.downloadPoints(d_prevPts,pts);
                     download(d_nextPts, nextPts);
                     download(d_status, status);
-
                 }
                 if (i%2 == 1)
                     frame1.copyTo(frameCopy);
@@ -246,21 +244,19 @@ nocamera:
         for(int i = 0; i <= LOOP_NUM;i ++) 
         {
             cout << "loop" << i << endl;
-            if (i > 0) workBegin();
-
-            cv::goodFeaturesToTrack(frame0, pts, points, 0.01, minDist);
+            if (i > 0) workBegin();     
 
             if (useCPU)
             {
-                cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+                goodFeaturesToTrack(frame0, pts, points, 0.01, minDist);
+                calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
             }
             else
             {
-                oclMat d_prevPts(1, points, CV_32FC2, (void*)&pts[0]);
-
-                d_pyrLK.sparse(oclMat(frame0), oclMat(frame1), d_prevPts, d_nextPts, d_status);
-
-                download(d_prevPts, pts);
+                oclMat d_img(frame0), d_prevPts;
+                d_features(d_img, d_prevPts);
+                d_pyrLK.sparse(d_img, oclMat(frame1), d_prevPts, d_nextPts, d_status);
+                d_features.downloadPoints(d_prevPts, pts);
                 download(d_nextPts, nextPts);
                 download(d_status, status);
             }

From 3aea7e8f8dc4cf572d919d81b14d1f69c9bd2f0d Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Tue, 4 Jun 2013 12:51:36 +0400
Subject: [PATCH 16/27] fixed gpu module build on arm platform

links with CUDA driver library only if we use video encoding/decoding
---
 modules/gpu/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index 87c172da5..a471da008 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -42,10 +42,10 @@ if(HAVE_CUDA)
 
   ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})
 
-  set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_npp_LIBRARY})
+  set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 
   if(WITH_NVCUVID)
-    set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
+    set(cuda_link_libs ${cuda_link_libs} ${CUDA_CUDA_LIBRARY} ${CUDA_nvcuvid_LIBRARY})
   endif()
 
   if(WIN32)

From 918381875adb0b4cb4fd9a5d061b47ba99b3db7e Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Tue, 4 Jun 2013 13:57:35 +0400
Subject: [PATCH 17/27] rewrite gpu/device/vec_math.hpp file

old version isn't compiled with CUDA 5.5
new version doesn't depend on functional.hpp
---
 .../include/opencv2/gpu/device/vec_math.hpp   | 1106 +++++++++++++----
 modules/gpu/src/cuda/ccomponetns.cu           |    4 +-
 modules/gpu/src/cuda/hough.cu                 |    5 +-
 3 files changed, 854 insertions(+), 261 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
index 1c46dc0c3..a6cb43a2f 100644
--- a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
@@ -43,288 +43,880 @@
 #ifndef __OPENCV_GPU_VECMATH_HPP__
 #define __OPENCV_GPU_VECMATH_HPP__
 
-#include "saturate_cast.hpp"
 #include "vec_traits.hpp"
-#include "functional.hpp"
+#include "saturate_cast.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
-    namespace vec_math_detail
-    {
-        template <int cn, typename VecD> struct SatCastHelper;
-        template <typename VecD> struct SatCastHelper<1, VecD>
-        {
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
-            {
-                typedef typename VecTraits<VecD>::elem_type D;
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x));
-            }
-        };
-        template <typename VecD> struct SatCastHelper<2, VecD>
-        {
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
-            {
-                typedef typename VecTraits<VecD>::elem_type D;
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
-            }
-        };
-        template <typename VecD> struct SatCastHelper<3, VecD>
-        {
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
-            {
-                typedef typename VecTraits<VecD>::elem_type D;
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
-            }
-        };
-        template <typename VecD> struct SatCastHelper<4, VecD>
-        {
-            template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
-            {
-                typedef typename VecTraits<VecD>::elem_type D;
-                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
-            }
-        };
 
-        template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
+// saturate_cast
+
+namespace vec_math_detail
+{
+    template <int cn, typename VecD> struct SatCastHelper;
+    template <typename VecD> struct SatCastHelper<1, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
         {
-            return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
+            typedef typename VecTraits<VecD>::elem_type D;
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x));
         }
-    }
-
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
-
-#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x), f(a.y)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x), f(a.y), f(a.z)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
-    }
-
-    namespace vec_math_detail
+    };
+    template <typename VecD> struct SatCastHelper<2, VecD>
     {
-        template <typename T1, typename T2> struct BinOpTraits
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
         {
-            typedef int argument_type;
-        };
-        template <typename T> struct BinOpTraits<T, T>
+            typedef typename VecTraits<VecD>::elem_type D;
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
+        }
+    };
+    template <typename VecD> struct SatCastHelper<3, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
         {
-            typedef T argument_type;
-        };
-        template <typename T> struct BinOpTraits<T, double>
+            typedef typename VecTraits<VecD>::elem_type D;
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
+        }
+    };
+    template <typename VecD> struct SatCastHelper<4, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
         {
-            typedef double argument_type;
-        };
-        template <typename T> struct BinOpTraits<double, T>
-        {
-            typedef double argument_type;
-        };
-        template <> struct BinOpTraits<double, double>
-        {
-            typedef double argument_type;
-        };
-        template <typename T> struct BinOpTraits<T, float>
-        {
-            typedef float argument_type;
-        };
-        template <typename T> struct BinOpTraits<float, T>
-        {
-            typedef float argument_type;
-        };
-        template <> struct BinOpTraits<float, float>
-        {
-            typedef float argument_type;
-        };
-        template <> struct BinOpTraits<double, float>
-        {
-            typedef double argument_type;
-        };
-        template <> struct BinOpTraits<float, double>
-        {
-            typedef double argument_type;
-        };
+            typedef typename VecTraits<VecD>::elem_type D;
+            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
+        }
+    };
+
+    template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_helper(const VecS& v)
+    {
+        return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
+    }
+}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+// unary operators
+
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(op, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a) \
+    { \
+        return VecTraits<output_type ## 1>::make(op (a.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a) \
+    { \
+        return VecTraits<output_type ## 2>::make(op (a.x), op (a.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a) \
+    { \
+        return VecTraits<output_type ## 3>::make(op (a.x), op (a.y), op (a.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a) \
+    { \
+        return VecTraits<output_type ## 4>::make(op (a.x), op (a.y), op (a.z), op (a.w)); \
     }
 
-#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_OP
+
+// unary functions
+
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(func_name, func, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a) \
     { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
+        return VecTraits<output_type ## 1>::make(func (a.x)); \
     } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a) \
     { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
+        return VecTraits<output_type ## 2>::make(func (a.x), func (a.y)); \
     } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a) \
     { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
+        return VecTraits<output_type ## 3>::make(func (a.x), func (a.y), func (a.z)); \
     } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a) \
     { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
-    } \
-    __device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
-    { \
-        func<type> f; \
-        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
-    } \
-    template <typename T> \
-    __device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
-    { \
-        func<typename vec_math_detail::BinOpTraits<T, type>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
+        return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
     }
 
-#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator +, plus) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator -, minus) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator *, multiplies) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator /, divides) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator -, negate) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ==, equal_to) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator !=, not_equal_to) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator > , greater) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator < , less) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator >=, greater_equal) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator <=, less_equal) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &&, logical_and) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ||, logical_or) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp10, exp10_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log, log_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log2, log2_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log10, log10_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sin, sin_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cos, cos_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tan, tan_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asin, asin_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acos, acos_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atan, atan_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sinh, sinh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cosh, cosh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tanh, tanh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asinh, asinh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acosh, acosh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atanh, atanh_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot, hypot_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, atan2, atan2_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, pow, pow_func) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot_sqr, hypot_sqr_func)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabs, double, double)
 
-#define OPENCV_GPU_IMPLEMENT_VEC_INT_OP(type) \
-    OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &, bit_and) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator |, bit_or) \
-    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrtf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sqrt, ::sqrt, double, double)
 
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
-    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
-    OPENCV_GPU_IMPLEMENT_VEC_OP(float)
-    OPENCV_GPU_IMPLEMENT_VEC_OP(double)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::expf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp, ::exp, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp2, ::exp2, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(exp10, ::exp10, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::logf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log, ::log, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log2, ::log2, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(log10, ::log10, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sinf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sin, ::sin, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cosf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cos, ::cos, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tanf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tan, ::tan, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asinf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asin, ::asin, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acosf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acos, ::acos, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atanf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atan, ::atan, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(sinh, ::sinh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::coshf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(cosh, ::cosh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(tanh, ::tanh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(asinh, ::asinh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acoshf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(acosh, ::acosh, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanhf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(atanh, ::atanh, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
+
+// binary operators (vec & vec)
+
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(op, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(a.x op b.x); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(a.x op b.x, a.y op b.y); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(a.x op b.x, a.y op b.y, a.z op b.z); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
+    }
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(+, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(-, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(*, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uchar, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, char, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, ushort, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, short, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(/, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(==, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(!=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(>=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(<=, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&&, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(||, double, uchar)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(&, uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(|, uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(^, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP
+
+// binary operators (vec & scalar)
+
+#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(op, input_type, scalar_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 operator op(const input_type ## 1 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 1>::make(a.x op s); \
+    } \
+    __device__ __forceinline__ output_type ## 1 operator op(scalar_type s, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(s op b.x); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(const input_type ## 2 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 2>::make(a.x op s, a.y op s); \
+    } \
+    __device__ __forceinline__ output_type ## 2 operator op(scalar_type s, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(s op b.x, s op b.y); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(const input_type ## 3 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 3>::make(a.x op s, a.y op s, a.z op s); \
+    } \
+    __device__ __forceinline__ output_type ## 3 operator op(scalar_type s, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(s op b.x, s op b.y, s op b.z); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(const input_type ## 4 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 4>::make(a.x op s, a.y op s, a.z op s, a.w op s); \
+    } \
+    __device__ __forceinline__ output_type ## 4 operator op(scalar_type s, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(s op b.x, s op b.y, s op b.z, s op b.w); \
+    }
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uint, uint, uint)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uint, uint, uint)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uint, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP
+
+// binary function (vec & vec)
+
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(func_name, func, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(func (a.x, b.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(func (a.x, b.x), func (a.y, b.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(func (a.x, b.x), func (a.y, b.y), func (a.z, b.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(func (a.x, b.x), func (a.y, b.y), func (a.z, b.z), func (a.w, b.w)); \
+    }
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmaxf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmax, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fminf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fmin, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypot, double, double)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC
+
+// binary function (vec & scalar)
+
+#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(func_name, func, input_type, scalar_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 1>::make(func ((output_type) a.x, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 1 func_name(scalar_type s, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(func ((output_type) s, (output_type) b.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 2>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(scalar_type s, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 3>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s), func ((output_type) a.z, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(scalar_type s, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y), func ((output_type) s, (output_type) b.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 4>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s), func ((output_type) a.z, (output_type) s), func ((output_type) a.w, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(scalar_type s, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y), func ((output_type) s, (output_type) b.z), func ((output_type) s, (output_type) b.w)); \
+    }
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC
 
-    #undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
-    #undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
-    #undef OPENCV_GPU_IMPLEMENT_VEC_OP
-    #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
 }}} // namespace cv { namespace gpu { namespace device
 
 #endif // __OPENCV_GPU_VECMATH_HPP__
diff --git a/modules/gpu/src/cuda/ccomponetns.cu b/modules/gpu/src/cuda/ccomponetns.cu
index 7f3d4ae33..c4d79bd80 100644
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace device
 
             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
             {
-                I d = a - b;
+                I d = saturate_cast<I>(a - b);
                 return lo.x <= d.x && d.x <= hi.x &&
                        lo.y <= d.y && d.y <= hi.y &&
                        lo.z <= d.z && d.z <= hi.z;
@@ -169,7 +169,7 @@ namespace cv { namespace gpu { namespace device
 
             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
             {
-                I d = a - b;
+                I d = saturate_cast<I>(a - b);
                 return lo.x <= d.x && d.x <= hi.x &&
                        lo.y <= d.y && d.y <= hi.y &&
                        lo.z <= d.z && d.z <= hi.z &&
diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu
index faec89b95..59eba2608 100644
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@@ -48,6 +48,7 @@
 #include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/emulation.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/dynamic_smem.hpp"
 
@@ -811,7 +812,7 @@ namespace cv { namespace gpu { namespace device
 
             const int ind = ::atomicAdd(r_sizes + n, 1);
             if (ind < maxSize)
-                r_table(n, ind) = p - templCenter;
+                r_table(n, ind) = saturate_cast<short2>(p - templCenter);
         }
 
         void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
@@ -855,7 +856,7 @@ namespace cv { namespace gpu { namespace device
 
             for (int j = 0; j < r_row_size; ++j)
             {
-                short2 c = p - r_row[j];
+                int2 c = p - r_row[j];
 
                 c.x = __float2int_rn(c.x * idp);
                 c.y = __float2int_rn(c.y * idp);

From 516e5b2563100329882948f68aceb15b5fe8fe60 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Tue, 4 Jun 2013 13:58:45 +0400
Subject: [PATCH 18/27] fixed BroxOpticalFlow regression test

the output of BroxOpticalFlow differs a bit in CUDA 5.5
---
 modules/gpu/test/test_optflow.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/gpu/test/test_optflow.cpp b/modules/gpu/test/test_optflow.cpp
index 9e30b9208..53b93a096 100644
--- a/modules/gpu/test/test_optflow.cpp
+++ b/modules/gpu/test/test_optflow.cpp
@@ -102,8 +102,8 @@ GPU_TEST_P(BroxOpticalFlow, Regression)
     for (int i = 0; i < v_gold.rows; ++i)
         f.read(v_gold.ptr<char>(i), v_gold.cols * sizeof(float));
 
-    EXPECT_MAT_NEAR(u_gold, u, 0);
-    EXPECT_MAT_NEAR(v_gold, v, 0);
+    EXPECT_MAT_SIMILAR(u_gold, u, 1e-3);
+    EXPECT_MAT_SIMILAR(v_gold, v, 1e-3);
 #else
     std::ofstream f(fname.c_str(), std::ios_base::binary);
 

From 4a770535c475be597f7487833927f0d5ab084988 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Tue, 4 Jun 2013 14:59:47 +0400
Subject: [PATCH 19/27] fixed BoxFilter sanity test (different rounding
 results)

---
 modules/gpu/perf/perf_filters.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp
index 40d88aad4..adfc294f6 100644
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@@ -72,7 +72,7 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur,
 
         TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
 
-        GPU_SANITY_CHECK(dst);
+        GPU_SANITY_CHECK(dst, 1);
     }
     else
     {

From 89f3c40d791a6dd1951fd23ba7c1d9a8df8f1454 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Tue, 4 Jun 2013 15:01:06 +0400
Subject: [PATCH 20/27] fixed BroxOpticalFlow sanity test (increase epsilon
 value)

+ interpolateFrames and createOpticalFlowNeedleMap
---
 modules/gpu/perf/perf_video.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp
index 1ab01a75b..672d657b2 100644
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
@@ -103,7 +103,7 @@ PERF_TEST_P(ImagePair, Video_InterpolateFrames,
 
         TEST_CYCLE() cv::gpu::interpolateFrames(d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv, 0.5f, newFrame, d_buf);
 
-        GPU_SANITY_CHECK(newFrame);
+        GPU_SANITY_CHECK(newFrame, 1e-4);
     }
     else
     {
@@ -142,7 +142,7 @@ PERF_TEST_P(ImagePair, Video_CreateOpticalFlowNeedleMap,
 
         TEST_CYCLE() cv::gpu::createOpticalFlowNeedleMap(u, v, vertex, colors);
 
-        GPU_SANITY_CHECK(vertex);
+        GPU_SANITY_CHECK(vertex, 1e-6);
         GPU_SANITY_CHECK(colors);
     }
     else
@@ -219,8 +219,8 @@ PERF_TEST_P(ImagePair, Video_BroxOpticalFlow,
 
         TEST_CYCLE() d_flow(d_frame0, d_frame1, u, v);
 
-        GPU_SANITY_CHECK(u);
-        GPU_SANITY_CHECK(v);
+        GPU_SANITY_CHECK(u, 1e-1);
+        GPU_SANITY_CHECK(v, 1e-1);
     }
     else
     {

From 93a44d423618c7c6c04135dd99b47209cc88ead6 Mon Sep 17 00:00:00 2001
From: Roman Donchenko <roman.donchenko@itseez.com>
Date: Wed, 5 Jun 2013 13:51:11 +0400
Subject: [PATCH 21/27] Fix typo in .gitattributes.

---
 .gitattributes | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitattributes b/.gitattributes
index af704cdf0..cd4359ba3 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,7 +33,7 @@
 CMakeLists.txt  text whitespace=tabwidth=2
 
 *.png       binary
-*.jepg      binary
+*.jpeg      binary
 *.jpg       binary
 *.exr       binary
 *.ico       binary

From 31a5f7ef3c2187b82a91290a430651adcc0b19d1 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 5 Jun 2013 14:08:55 +0400
Subject: [PATCH 22/27] fixed bug #3069 (infinite loop in GPU LBP Cascade
 detectMultiScale)

---
 modules/gpu/src/cascadeclassifier.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 814a96bc0..7b95b6909 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -406,7 +406,7 @@ public:
         GpuMat dclassified(1, 1, CV_32S);
         cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );
 
-        PyrLavel level(0, 1.0f, image.size(), NxM, minObjectSize);
+        PyrLavel level(0, scaleFactor, image.size(), NxM, minObjectSize);
 
         while (level.isFeasible(maxObjectSize))
         {

From a2adafd5089065d44f82933c45cc4e7591141ed9 Mon Sep 17 00:00:00 2001
From: caorong <Lelouchcr@gmail.com>
Date: Thu, 16 May 2013 11:26:37 +0800
Subject: [PATCH 23/27] fix a bug(DetectorType never change)

changed line281 -> line220
Presentation:
because line 220 give the globle var mDetectorType,and in line 230 it will be compared with mDeteorType !!!  it will never be unequal ~
fix:
change mDetectorType(previous globle var) to a new local val tmpDetectorType
---
 .../src/org/opencv/samples/facedetect/FdActivity.java       | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java b/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java
index 23727739e..b06b2cc1c 100644
--- a/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java
+++ b/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java
@@ -215,9 +215,9 @@ public class FdActivity extends Activity implements CvCameraViewListener2 {
         else if (item == mItemFace20)
             setMinFaceSize(0.2f);
         else if (item == mItemType) {
-            mDetectorType = (mDetectorType + 1) % mDetectorName.length;
-            item.setTitle(mDetectorName[mDetectorType]);
-            setDetectorType(mDetectorType);
+            int tmpDetectorType = (mDetectorType + 1) % mDetectorName.length;
+            item.setTitle(mDetectorName[tmpDetectorType]);
+            setDetectorType(tmpDetectorType);
         }
         return true;
     }

From 985bfea556ff0511eb16d99ef83a6ef78c80e38d Mon Sep 17 00:00:00 2001
From: Roman Donchenko <roman.donchenko@itseez.com>
Date: Wed, 5 Jun 2013 15:54:27 +0400
Subject: [PATCH 24/27] Don't resolve symlinks when looking for modules.

We don't really need it, it makes the code longer, and it can lead to
inconsistent paths when OpenCV is itself inside a symlink.
---
 cmake/OpenCVModule.cmake |  4 ++--
 cmake/OpenCVUtils.cmake  | 10 ----------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 8312845fe..e8619ad7d 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -303,7 +303,7 @@ macro(ocv_glob_modules)
   # collect modules
   set(OPENCV_INITIAL_PASS ON)
   foreach(__path ${ARGN})
-    ocv_get_real_path(__path "${__path}")
+    get_filename_component(__path "${__path}" ABSOLUTE)
 
     list(FIND __directories_observed "${__path}" __pathIdx)
     if(__pathIdx GREATER -1)
@@ -315,7 +315,7 @@ macro(ocv_glob_modules)
     if(__ocvmodules)
       list(SORT __ocvmodules)
       foreach(mod ${__ocvmodules})
-        ocv_get_real_path(__modpath "${__path}/${mod}")
+        get_filename_component(__modpath "${__path}/${mod}" ABSOLUTE)
         if(EXISTS "${__modpath}/CMakeLists.txt")
 
           list(FIND __directories_observed "${__modpath}" __pathIdx)
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index db24c9970..5547a113c 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -411,16 +411,6 @@ macro(ocv_regex_escape var regex)
 endmacro()
 
 
-# get absolute path with symlinks resolved
-macro(ocv_get_real_path VAR PATHSTR)
-  if(CMAKE_VERSION VERSION_LESS 2.8)
-    get_filename_component(${VAR} "${PATHSTR}" ABSOLUTE)
-  else()
-    get_filename_component(${VAR} "${PATHSTR}" REALPATH)
-  endif()
-endmacro()
-
-
 # convert list of paths to full paths
 macro(ocv_convert_to_full_paths VAR)
   if(${VAR})

From a954d3630f8a567317a105300c8ccd735f2550a4 Mon Sep 17 00:00:00 2001
From: Roman Donchenko <roman.donchenko@itseez.com>
Date: Wed, 5 Jun 2013 18:09:47 +0400
Subject: [PATCH 25/27] Add support for adding custom OpenCV modules.

---
 CMakeLists.txt         | 4 ++++
 modules/CMakeLists.txt | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5f1edaa53..2c735e005 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -286,6 +286,10 @@ set(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to c
 add_definitions(-DHAVE_CVCONFIG_H)
 ocv_include_directories(${OPENCV_CONFIG_FILE_INCLUDE_DIR})
 
+# ----------------------------------------------------------------------------
+#  Path for additional modules
+# ----------------------------------------------------------------------------
+set(OPENCV_EXTRA_MODULES_PATH "" CACHE PATH "Where to look for additional OpenCV modules")
 
 # ----------------------------------------------------------------------------
 #  Autodetect if we are in a GIT repository
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index 4a6ed6d11..3e1ad708e 100644
--- a/modules/CMakeLists.txt
+++ b/modules/CMakeLists.txt
@@ -2,4 +2,4 @@ if(NOT OPENCV_MODULES_PATH)
   set(OPENCV_MODULES_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
 endif()
 
-ocv_glob_modules(${OPENCV_MODULES_PATH})
+ocv_glob_modules(${OPENCV_MODULES_PATH} ${OPENCV_EXTRA_MODULES_PATH})

From 429f84e59e45a66f56540e67f5ab3a9c8dc34249 Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Thu, 6 Jun 2013 11:44:35 +0800
Subject: [PATCH 26/27] Fix a bug of cornerHarris and cornerMinEigenVal. The
 bug is a buffer overrun when border type is reflect101. It is found that gfft
 crashed with input of size 100x100 on Intel CPU.

---
 modules/ocl/src/opencl/imgproc_calcHarris.cl     | 15 ++++++++-------
 .../ocl/src/opencl/imgproc_calcMinEigenVal.cl    | 16 +++++++++-------
 modules/ocl/test/test_optflow.cpp                |  2 +-
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/modules/ocl/src/opencl/imgproc_calcHarris.cl b/modules/ocl/src/opencl/imgproc_calcHarris.cl
index 15742d6c5..1911a7201 100644
--- a/modules/ocl/src/opencl/imgproc_calcHarris.cl
+++ b/modules/ocl/src/opencl/imgproc_calcHarris.cl
@@ -130,28 +130,29 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
         data[2][i] = dy_data[i] * dy_data[i];
     }
 #else
-   for(int i=0; i < ksY+1; i++)
-   {
+    int clamped_col = min(dst_cols, col);
+    for(int i=0; i < ksY+1; i++)
+    {
         int dx_selected_row;
         int dx_selected_col;
         dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
         dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
-        dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
-        dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
+        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
+        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
         dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
 
         int dy_selected_row;
         int dy_selected_col;
         dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
         dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
-        dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
-        dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
+        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
+        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
         dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
 
         data[0][i] = dx_data[i] * dx_data[i];
         data[1][i] = dx_data[i] * dy_data[i];
         data[2][i] = dy_data[i] * dy_data[i];
-   }
+    }
 #endif
     float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
     for(int i=1; i < ksY; i++)
diff --git a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
index 662fbb07b..462ec7792 100644
--- a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
+++ b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
@@ -130,28 +130,30 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
         data[2][i] = dy_data[i] * dy_data[i];
     }
 #else
-   for(int i=0; i < ksY+1; i++)
-   {
+    int clamped_col = min(dst_cols, col);
+
+    for(int i=0; i < ksY+1; i++)
+    {
         int dx_selected_row;
         int dx_selected_col;
         dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
         dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
-        dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
-        dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
+        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
+        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
         dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
 
         int dy_selected_row;
         int dy_selected_col;
         dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
         dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
-        dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
-        dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
+        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
+        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
         dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
 
         data[0][i] = dx_data[i] * dx_data[i];
         data[1][i] = dx_data[i] * dy_data[i];
         data[2][i] = dy_data[i] * dy_data[i];
-   }
+    }
 #endif
     float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
     for(int i=1; i < ksY; i++)
diff --git a/modules/ocl/test/test_optflow.cpp b/modules/ocl/test/test_optflow.cpp
index 72689f3f7..0121be8f9 100644
--- a/modules/ocl/test/test_optflow.cpp
+++ b/modules/ocl/test/test_optflow.cpp
@@ -121,7 +121,7 @@ TEST_P(GoodFeaturesToTrack, EmptyCorners)
 
     cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
 
-    cv::ocl::oclMat src(100, 128, CV_8UC1, cv::Scalar::all(0));
+    cv::ocl::oclMat src(100, 100, CV_8UC1, cv::Scalar::all(0));
     cv::ocl::oclMat corners(1, maxCorners, CV_32FC2);
 
     detector(src, corners);

From 8714cbac91e6e8f90395d9af3fd57f3c6b35235b Mon Sep 17 00:00:00 2001
From: Roman Donchenko <roman.donchenko@itseez.com>
Date: Thu, 6 Jun 2013 14:09:33 +0400
Subject: [PATCH 27/27] Fix a missing header path when building with Qt 4.

Also, removing explicit include path configuration,
since QT_USE_FILE takes care of that.
---
 cmake/OpenCVFindLibsGUI.cmake  | 2 +-
 modules/highgui/CMakeLists.txt | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake
index 2ea864c16..59ce1cd05 100644
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -33,7 +33,7 @@ if(WITH_QT)
   endif()
 
   if(NOT HAVE_QT)
-    find_package(Qt4)
+    find_package(Qt4 REQUIRED QtCore QtGui QtTest)
     if(QT4_FOUND)
       set(HAVE_QT TRUE)
       add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index 4c60867af..fad2562c8 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -95,14 +95,10 @@ elseif(HAVE_QT)
   endif()
   include(${QT_USE_FILE})
 
-  if(QT_INCLUDE_DIR)
-    ocv_include_directories(${QT_INCLUDE_DIR})
-  endif()
-
   QT4_ADD_RESOURCES(_RCC_OUTFILES src/window_QT.qrc)
   QT4_WRAP_CPP(_MOC_OUTFILES src/window_QT.h)
 
-  list(APPEND HIGHGUI_LIBRARIES ${QT_LIBRARIES} ${QT_QTTEST_LIBRARY})
+  list(APPEND HIGHGUI_LIBRARIES ${QT_LIBRARIES})
   list(APPEND highgui_srcs src/window_QT.cpp ${_MOC_OUTFILES} ${_RCC_OUTFILES})
   ocv_check_flag_support(CXX -Wno-missing-declarations _have_flag)
   if(${_have_flag})