added WITH_CUFFT and WITH_CUBLAS flags to cmake scripts
fixed gpu module error reporting; added asynchronous versions of some functions
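The pattern applied throughout the diffs below is the same everywhere: each host-side caller gains a trailing cudaStream_t parameter, the kernel is launched with that stream as the fourth launch-configuration argument, and cudaDeviceSynchronize() is issued only for the default stream, so calls on a user-supplied stream return asynchronously. A minimal sketch of that pattern (dummy_kernel and dummy_caller are illustrative stand-ins, not functions from this commit):

#include <cuda_runtime.h>

// Illustrative stand-in for the real imgproc/matchTemplate kernels.
__global__ void dummy_kernel(float* data, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

// Stream-aware caller: launch on the given stream, check for launch errors,
// and block only when the caller passed the default stream (stream == 0).
void dummy_caller(float* d_data, int n, cudaStream_t stream)
{
    const dim3 threads(256);
    const dim3 grid((n + threads.x - 1) / threads.x);

    dummy_kernel<<<grid, threads, 0, stream>>>(d_data, n);
    cudaGetLastError();           // the real code wraps this in cudaSafeCall()

    if (stream == 0)
        cudaDeviceSynchronize();  // keep the old blocking behaviour
}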
@@ -120,8 +120,7 @@ namespace cv { namespace gpu { namespace imgproc
return make_short2((short)x0, (short)y0);
}

extern "C" __global__ void meanshift_kernel( unsigned char* out, size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps )
__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace imgproc
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}

extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps )
__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@@ -145,7 +144,7 @@ namespace cv { namespace gpu { namespace imgproc
}
}

extern "C" void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps)
void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
@@ -155,13 +154,16 @@ namespace cv { namespace gpu { namespace imgproc
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );

//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
extern "C" void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps)

void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
@@ -171,11 +173,13 @@ namespace cv { namespace gpu { namespace imgproc
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );

//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
/////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
@@ -389,15 +393,16 @@ namespace cv { namespace gpu { namespace imgproc
}
}

void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst)
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));

extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
extractCovData_kernel<<<grid, threads, 0, stream>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
@@ -475,7 +480,7 @@ namespace cv { namespace gpu { namespace imgproc
}

void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,
int border_type)
int border_type, cudaStream_t stream)
{
const int rows = Dx.rows;
const int cols = Dx.cols;
@@ -492,7 +497,7 @@ namespace cv { namespace gpu { namespace imgproc
switch (border_type)
{
case BORDER_REFLECT101_GPU:
cornerHarris_kernel<<<grid, threads>>>(
cornerHarris_kernel<<<grid, threads, 0, stream>>>(
cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
break;
case BORDER_REPLICATE_GPU:
@@ -500,16 +505,18 @@ namespace cv { namespace gpu { namespace imgproc
harrisDxTex.addressMode[1] = cudaAddressModeClamp;
harrisDyTex.addressMode[0] = cudaAddressModeClamp;
harrisDyTex.addressMode[1] = cudaAddressModeClamp;
cornerHarris_kernel<<<grid, threads>>>(cols, rows, block_size, k, dst);

cornerHarris_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, k, dst);
break;
}

cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );

cudaSafeCall(cudaUnbindTexture(harrisDxTex));
cudaSafeCall(cudaUnbindTexture(harrisDyTex));
//cudaSafeCall(cudaUnbindTexture(harrisDxTex));
//cudaSafeCall(cudaUnbindTexture(harrisDyTex));
}

/////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
@@ -592,7 +599,7 @@ namespace cv { namespace gpu { namespace imgproc
}

void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,
int border_type)
int border_type, cudaStream_t stream)
{
const int rows = Dx.rows;
const int cols = Dx.cols;
@@ -609,7 +616,7 @@ namespace cv { namespace gpu { namespace imgproc
switch (border_type)
{
case BORDER_REFLECT101_GPU:
cornerMinEigenVal_kernel<<<grid, threads>>>(
cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(
cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
break;
case BORDER_REPLICATE_GPU:
@@ -617,16 +624,18 @@ namespace cv { namespace gpu { namespace imgproc
minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;
minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;
minEigenValDyTex.addressMode[1] = cudaAddressModeClamp;
cornerMinEigenVal_kernel<<<grid, threads>>>(cols, rows, block_size, dst);

cornerMinEigenVal_kernel<<<grid, threads, 0, stream>>>(cols, rows, block_size, dst);
break;
}

cudaSafeCall( cudaGetLastError() );

cudaSafeCall(cudaDeviceSynchronize());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());

cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
//cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
//cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
}

////////////////////////////// Column Sum //////////////////////////////////////
@@ -667,8 +676,7 @@ namespace cv { namespace gpu { namespace imgproc
//////////////////////////////////////////////////////////////////////////
// mulSpectrums

__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
DevMem2D_<cufftComplex> c)
__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -680,25 +688,23 @@ namespace cv { namespace gpu { namespace imgproc
}

void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
DevMem2D_<cufftComplex> c)
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ

__global__ void mulSpectrumsKernel_CONJ(
const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
DevMem2D_<cufftComplex> c)
__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -710,25 +716,23 @@ namespace cv { namespace gpu { namespace imgproc
}

void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
DevMem2D_<cufftComplex> c)
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums

__global__ void mulAndScaleSpectrumsKernel(
const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
float scale, DevMem2D_<cufftComplex> c)
__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -741,25 +745,23 @@ namespace cv { namespace gpu { namespace imgproc
}

void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
float scale, DevMem2D_<cufftComplex> c)
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ

__global__ void mulAndScaleSpectrumsKernel_CONJ(
const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
float scale, DevMem2D_<cufftComplex> c)
__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -772,16 +774,16 @@ namespace cv { namespace gpu { namespace imgproc
}

void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b,
float scale, DevMem2D_<cufftComplex> c)
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));

mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////////
@@ -1015,17 +1017,18 @@ namespace cv { namespace gpu { namespace imgproc
}
}

void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel)
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );

const dim3 block(16, 16);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

convolve<<<grid, block>>>(src, dst, kWidth, kHeight);
convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);
cudaSafeCall(cudaGetLastError());

cudaSafeCall(cudaDeviceSynchronize());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}

@@ -78,11 +78,11 @@ __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a
__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

//////////////////////////////////////////////////////////////////////
// Naive_CCORR

template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(
int w, int h, const PtrStepb image, const PtrStepb templ,
DevMem2Df result)
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
@@ -106,73 +106,49 @@ __global__ void matchTemplateNaiveKernel_CCORR(
}
}

void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ,
DevMem2Df result, int cn)
template <typename T, int cn>
void matchTemplateNaive_CCORR(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

switch (cn)
{
case 1:
matchTemplateNaiveKernel_CCORR<float, 1><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 2:
matchTemplateNaiveKernel_CCORR<float, 2><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 3:
matchTemplateNaiveKernel_CCORR<float, 3><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 4:
matchTemplateNaiveKernel_CCORR<float, 4><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
}
matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);

static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
};

callers[cn](image, templ, result, stream);
}

void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ,
DevMem2Df result, int cn)
void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);

switch (cn)
static const caller_t callers[] =
{
case 1:
matchTemplateNaiveKernel_CCORR<uchar, 1><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 2:
matchTemplateNaiveKernel_CCORR<uchar, 2><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 3:
matchTemplateNaiveKernel_CCORR<uchar, 3><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 4:
matchTemplateNaiveKernel_CCORR<uchar, 4><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
};

cudaSafeCall( cudaDeviceSynchronize() );
callers[cn](image, templ, result, stream);
}

//////////////////////////////////////////////////////////////////////
// Naive_SQDIFF

template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_SQDIFF(
int w, int h, const PtrStepb image, const PtrStepb templ,
DevMem2Df result)
__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, DevMem2Df result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
@@ -200,73 +176,48 @@ __global__ void matchTemplateNaiveKernel_SQDIFF(
}
}

void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ,
DevMem2Df result, int cn)
template <typename T, int cn>
void matchTemplateNaive_SQDIFF(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

switch (cn)
{
case 1:
matchTemplateNaiveKernel_SQDIFF<float, 1><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 2:
matchTemplateNaiveKernel_SQDIFF<float, 2><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 3:
matchTemplateNaiveKernel_SQDIFF<float, 3><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 4:
matchTemplateNaiveKernel_SQDIFF<float, 4><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
}
matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ,
DevMem2Df result, int cn)
void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);

switch (cn)
static const caller_t callers[] =
{
case 1:
matchTemplateNaiveKernel_SQDIFF<uchar, 1><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 2:
matchTemplateNaiveKernel_SQDIFF<uchar, 2><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 3:
matchTemplateNaiveKernel_SQDIFF<uchar, 3><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
case 4:
matchTemplateNaiveKernel_SQDIFF<uchar, 4><<<grid, threads>>>(
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
};

cudaSafeCall( cudaDeviceSynchronize() );
callers[cn](image, templ, result, stream);
}

void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, cudaStream_t stream);

static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
};

callers[cn](image, templ, result, stream);
}

//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF

template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result)
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -281,37 +232,34 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_8U(
}
}

void matchTemplatePrepared_SQDIFF_8U(
int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn)
template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
matchTemplatePreparedKernel_SQDIFF_8U<1><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
matchTemplatePreparedKernel_SQDIFF_8U<2><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
matchTemplatePreparedKernel_SQDIFF_8U<3><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
matchTemplatePreparedKernel_SQDIFF_8U<4><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
}
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn,
cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);

static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
};

callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}

//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF_NORMED

// normAcc* are accurate normalization routines which make GPU matchTemplate
// consistent with CPU one
@@ -337,9 +285,7 @@ __device__ float normAcc_SQDIFF(float num, float denum)

template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result)
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -355,41 +301,37 @@ __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
}
}

void matchTemplatePrepared_SQDIFF_NORMED_8U(
int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn)
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<1><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<2><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<3><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<4><<<grid, threads>>>(
w, h, image_sqsum, templ_sqsum, result);
break;
}
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__global__ void matchTemplatePreparedKernel_CCOFF_8U(
int w, int h, float templ_sum_scale,
const PtrStep<unsigned int> image_sum, DevMem2Df result)
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum,
DevMem2Df result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
};

callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}

//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF

__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -404,21 +346,20 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8U(
}
}

void matchTemplatePrepared_CCOFF_8U(
int w, int h, const DevMem2D_<unsigned int> image_sum,
unsigned int templ_sum, DevMem2Df result)
void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads>>>(
w, h, (float)templ_sum / (w * h), image_sum, result);

matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r,
@@ -442,25 +383,27 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
}
}

void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
const DevMem2D_<unsigned int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(

matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
int w, int h,
float templ_sum_scale_r,
@@ -492,7 +435,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
}
}

void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
@@ -501,11 +443,12 @@ void matchTemplatePrepared_CCOFF_8UC3(
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads>>>(

matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
@@ -513,10 +456,12 @@ void matchTemplatePrepared_CCOFF_8UC3(
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
int w, int h,
float templ_sum_scale_r,
@@ -554,7 +499,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
}
}

void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r,
@@ -565,11 +509,12 @@ void matchTemplatePrepared_CCOFF_8UC4(
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads>>>(

matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
@@ -579,9 +524,12 @@ void matchTemplatePrepared_CCOFF_8UC4(
result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF_NORMED

__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight,
@@ -607,12 +555,11 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
}
}

void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const DevMem2D_<unsigned int> image_sum,
const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned int templ_sqsum,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -620,15 +567,18 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
float weight = 1.f / (w * h);
float templ_sum_scale = templ_sum * weight;
float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads>>>(

matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g,
@@ -663,14 +613,13 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
}
}

void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -680,7 +629,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads>>>(

matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g,
templ_sqsum_scale,
@@ -689,10 +639,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
@@ -736,7 +688,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
}
}

void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
@@ -745,7 +696,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -757,7 +708,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads>>>(

matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
templ_sqsum_scale,
@@ -767,10 +719,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
@@ -821,7 +775,6 @@ __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
}
}

void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
@@ -832,7 +785,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
unsigned int templ_sum_a, unsigned int templ_sqsum_a,
DevMem2Df result)
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -846,7 +799,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b
+ templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads>>>(

matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
templ_sqsum_scale,
@@ -857,9 +811,12 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
result);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////
// normalize

template <int cn>
__global__ void normalizeKernel_8U(
@@ -878,32 +835,36 @@ __global__ void normalizeKernel_8U(
}
}

void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn)
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

switch (cn)
{
case 1:
normalizeKernel_8U<1><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
normalizeKernel_8U<2><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
normalizeKernel_8U<3><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
normalizeKernel_8U<4><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}

cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

//////////////////////////////////////////////////////////////////////
// extractFirstChannel

template <int cn>
__global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)
@@ -920,8 +881,7 @@ __global__ void extractFirstChannel_32F(const PtrStepb image, DevMem2Df result)
}
}

void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn)
void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -929,23 +889,21 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn)
switch (cn)
{
case 1:
extractFirstChannel_32F<1><<<grid, threads>>>(image, result);
extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
break;
case 2:
extractFirstChannel_32F<2><<<grid, threads>>>(image, result);
extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
break;
case 3:
extractFirstChannel_32F<3><<<grid, threads>>>(image, result);
extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
break;
case 4:
extractFirstChannel_32F<4><<<grid, threads>>>(image, result);
extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

}}}

@@ -45,16 +45,18 @@

#include "cuda_runtime_api.h"
#include "cufft.h"
//#include <nppdefs.h>
#include "NCV.hpp"

#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__)
#endif

namespace cv
@@ -62,8 +64,9 @@ namespace cv
namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func = "");
void nppError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");

static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
@@ -71,17 +74,23 @@ namespace cv
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}

static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}

static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}

static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}

static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
}
}

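A short usage sketch for the error-reporting wrappers declared in the hunk above: any CUDA or cuFFT call can be wrapped so that a failure is routed through cv::gpu::error()/cv::gpu::cufftError() together with file, line and function information. The function example_usage below is purely illustrative and is not part of this commit:

void example_usage(int rows, int cols)
{
    float* d_buf = 0;
    cudaSafeCall( cudaMalloc(&d_buf, rows * cols * sizeof(float)) );   // cudaError_t checked by ___cudaSafeCall

    cufftHandle plan;
    cufftSafeCall( cufftPlan2d(&plan, rows, cols, CUFFT_C2R) );        // cufftResult checked by ___cufftSafeCall

    cufftSafeCall( cufftDestroy(plan) );
    cudaSafeCall( cudaFree(d_buf) );
}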