renamed gpu namespace -> cuda

Vladislav Vinogradov
2013-08-28 15:45:13 +04:00
parent e12496d150
commit e895b7455e
343 changed files with 3882 additions and 3882 deletions
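
Since the change is a mechanical rename, downstream code migrates by re-spelling the namespace and nothing else. A minimal before/after sketch (the header path and the matrix size are illustrative assumptions, not part of this commit):

    #include <opencv2/core/cuda.hpp>   // post-rename header name assumed

    void migrated()
    {
        // before this commit:  cv::gpu::GpuMat frame;  cv::gpu::Stream stream;
        cv::cuda::GpuMat frame;        // same class, new namespace
        cv::cuda::Stream stream;       // replaces cv::gpu::Stream
        frame.upload(cv::Mat::zeros(480, 640, CV_8UC3), stream);  // asynchronous upload on the stream
        stream.waitForCompletion();
    }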

View File

@@ -233,7 +233,7 @@ public:
// Param frame is an input 3-channel bgr image.
// Param channels is a GPU matrix of optionally shrunk channels
// Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
virtual void apply(InputArray frame, OutputArray channels, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) = 0;
virtual void apply(InputArray frame, OutputArray channels, cv::cuda::Stream& stream = cv::cuda::Stream::Null()) = 0;
// Creates a specific preprocessor implementation.
// Param shrinkage is a resizing factor. Resize is applied before computing the integral sum
@@ -280,7 +280,7 @@ public:
// Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
// The first element of the matrix is actually a count of detections.
// Param stream is a high-level CUDA stream abstraction used for asynchronous execution
virtual void detect(InputArray image, InputArray rois, OutputArray objects, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) const;
virtual void detect(InputArray image, InputArray rois, OutputArray objects, cv::cuda::Stream& stream = cv::cuda::Stream::Null()) const;
private:
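
The perf and accuracy tests further down exercise exactly this pair of entry points. A condensed, hedged usage sketch against the renamed signatures (the module header path is an assumption; the buffer size is copied from the tests):

    #include <opencv2/softcascade.hpp>   // header path assumed for this branch

    void runDetector(const cv::FileNode& model, const cv::Mat& bgrFrame)
    {
        cv::softcascade::SCascade cascade;
        CV_Assert(cascade.load(model));

        cv::cuda::GpuMat frame(bgrFrame);                    // 3-channel BGR input
        cv::cuda::GpuMat rois(frame.size(), CV_8UC1);
        cv::cuda::GpuMat objects(1, 10000 * sizeof(cv::softcascade::Detection), CV_8UC1);
        rois.setTo(1);                                       // search the whole frame

        cv::cuda::Stream stream;                             // was cv::gpu::Stream
        cascade.detect(frame, rois, objects, stream);        // asynchronous on 'stream'
        stream.waitForCompletion();
    }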

View File

@@ -37,7 +37,7 @@ namespace {
}
};
cv::Mat sortDetections(cv::gpu::GpuMat& objects)
cv::Mat sortDetections(cv::cuda::GpuMat& objects)
{
cv::Mat detections(objects);
@@ -64,7 +64,7 @@ RUN_GPU(SCascadeTest, detect)
{
cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
ASSERT_FALSE(cpu.empty());
cv::gpu::GpuMat colored(cpu);
cv::cuda::GpuMat colored(cpu);
cv::softcascade::SCascade cascade;
@@ -73,7 +73,7 @@ RUN_GPU(SCascadeTest, detect)
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
cv::cuda::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(1);
cascade.detect(colored, rois, objectBoxes);
@@ -122,7 +122,7 @@ RUN_GPU(SCascadeTestRoi, detectInRoi)
{
cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
ASSERT_FALSE(cpu.empty());
cv::gpu::GpuMat colored(cpu);
cv::cuda::GpuMat colored(cpu);
cv::softcascade::SCascade cascade;
@@ -131,7 +131,7 @@ RUN_GPU(SCascadeTestRoi, detectInRoi)
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
cv::cuda::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(0);
int nroi = get<2>(GetParam());
@@ -139,7 +139,7 @@ RUN_GPU(SCascadeTestRoi, detectInRoi)
for (int i = 0; i < nroi; ++i)
{
cv::Rect r = getFromTable(rng(10));
cv::gpu::GpuMat sub(rois, r);
cv::cuda::GpuMat sub(rois, r);
sub.setTo(1);
}
@@ -167,7 +167,7 @@ RUN_GPU(SCascadeTestRoi, detectEachRoi)
{
cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
ASSERT_FALSE(cpu.empty());
cv::gpu::GpuMat colored(cpu);
cv::cuda::GpuMat colored(cpu);
cv::softcascade::SCascade cascade;
@@ -176,12 +176,12 @@ RUN_GPU(SCascadeTestRoi, detectEachRoi)
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
cv::cuda::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(0);
int idx = get<2>(GetParam());
cv::Rect r = getFromTable(idx);
cv::gpu::GpuMat sub(rois, r);
cv::cuda::GpuMat sub(rois, r);
sub.setTo(1);
cascade.detect(colored, rois, objectBoxes);
@@ -206,7 +206,7 @@ RUN_GPU(SCascadeTest, detectStream)
{
cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
ASSERT_FALSE(cpu.empty());
cv::gpu::GpuMat colored(cpu);
cv::cuda::GpuMat colored(cpu);
cv::softcascade::SCascade cascade;
@@ -215,10 +215,10 @@ RUN_GPU(SCascadeTest, detectStream)
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
cv::cuda::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(1);
cv::gpu::Stream s;
cv::cuda::Stream s;
cascade.detect(colored, rois, objectBoxes, s);
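
The header comments above note that the first element of the output matrix is the detection count, and sortDetections downloads the buffer the same way. A hedged sketch of unpacking the result on the host (the Detection field names are assumptions):

    #include <iostream>
    #include <opencv2/softcascade.hpp>   // header path assumed

    void printDetections(const cv::cuda::GpuMat& objectBoxes)
    {
        typedef cv::softcascade::Detection Detection;
        cv::Mat host(objectBoxes);       // device -> host download, as in sortDetections above
        // layout inferred from the sources: the first sizeof(Detection) bytes hold the count,
        // the Detection records follow immediately after
        const int count = *reinterpret_cast<const int*>(host.ptr(0));
        const Detection* dets = reinterpret_cast<const Detection*>(host.ptr(0) + sizeof(Detection));
        for (int i = 0; i < count; ++i)
            std::cout << dets[i].x << " " << dets[i].y << std::endl;   // field names assumed
    }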

View File

@@ -59,7 +59,7 @@ namespace cv { namespace softcascade { namespace cudev
return bytes;
}
__global__ void shfl_integral_horizontal(const cv::gpu::PtrStep<uint4> img, cv::gpu::PtrStep<uint4> integral)
__global__ void shfl_integral_horizontal(const cv::cuda::PtrStep<uint4> img, cv::cuda::PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
@@ -299,7 +299,7 @@ namespace cv { namespace softcascade { namespace cudev
// The final set of sums from the block is then propagated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_integral_vertical(cv::gpu::PtrStepSz<unsigned int> integral)
__global__ void shfl_integral_vertical(cv::cuda::PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
@@ -357,7 +357,7 @@ namespace cv { namespace softcascade { namespace cudev
#endif
}
void shfl_integral(const cv::gpu::PtrStepSzb& img, cv::gpu::PtrStepSz<unsigned int> integral, cudaStream_t stream)
void shfl_integral(const cv::cuda::PtrStepSzb& img, cv::cuda::PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
@@ -369,13 +369,13 @@ namespace cv { namespace softcascade { namespace cudev
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const cv::gpu::PtrStepSz<uint4>) img, (cv::gpu::PtrStepSz<uint4>) integral);
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const cv::cuda::PtrStepSz<uint4>) img, (cv::cuda::PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(cv::gpu::cudev::divUp(integral.cols, block.x), 1);
const dim3 grid(cv::cuda::cudev::divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
@@ -385,7 +385,7 @@ namespace cv { namespace softcascade { namespace cudev
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void shfl_integral_vertical(cv::gpu::PtrStepSz<unsigned int> buffer, cv::gpu::PtrStepSz<unsigned int> integral)
__global__ void shfl_integral_vertical(cv::cuda::PtrStepSz<unsigned int> buffer, cv::cuda::PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
@@ -445,7 +445,7 @@ namespace cv { namespace softcascade { namespace cudev
}
// used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
void shfl_integral_gpu_buffered(cv::gpu::PtrStepSzb img, cv::gpu::PtrStepSz<uint4> buffer, cv::gpu::PtrStepSz<unsigned int> integral,
void shfl_integral_gpu_buffered(cv::cuda::PtrStepSzb img, cv::cuda::PtrStepSz<uint4> buffer, cv::cuda::PtrStepSz<unsigned int> integral,
int blockStep, cudaStream_t stream)
{
{
@@ -454,15 +454,15 @@ namespace cv { namespace softcascade { namespace cudev
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((cv::gpu::PtrStepSz<uint4>) img, buffer);
shfl_integral_horizontal<<<grid, block, 0, stream>>>((cv::cuda::PtrStepSz<uint4>) img, buffer);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(cv::gpu::cudev::divUp(integral.cols, block.x), 1);
const dim3 grid(cv::cuda::cudev::divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>((cv::gpu::PtrStepSz<unsigned int>)buffer, integral);
shfl_integral_vertical<<<grid, block, 0, stream>>>((cv::cuda::PtrStepSz<unsigned int>)buffer, integral);
cudaSafeCall( cudaGetLastError() );
}
}
@@ -486,7 +486,7 @@ namespace cv { namespace softcascade { namespace cudev
return CV_DESCALE((unsigned int)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);
}
__global__ void device_transform(const cv::gpu::PtrStepSz<uchar3> bgr, cv::gpu::PtrStepSzb gray)
__global__ void device_transform(const cv::cuda::PtrStepSz<uchar3> bgr, cv::cuda::PtrStepSzb gray)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -497,10 +497,10 @@ namespace cv { namespace softcascade { namespace cudev
}
///////
void transform(const cv::gpu::PtrStepSz<uchar3>& bgr, cv::gpu::PtrStepSzb gray)
void transform(const cv::cuda::PtrStepSz<uchar3>& bgr, cv::cuda::PtrStepSzb gray)
{
const dim3 block(32, 8);
const dim3 grid(cv::gpu::cudev::divUp(bgr.cols, block.x), cv::gpu::cudev::divUp(bgr.rows, block.y));
const dim3 grid(cv::cuda::cudev::divUp(bgr.cols, block.x), cv::cuda::cudev::divUp(bgr.rows, block.y));
device_transform<<<grid, block>>>(bgr, gray);
cudaSafeCall(cudaDeviceSynchronize());
}
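
Both passes above (hence the __CUDA_ARCH__ >= 300 guards) are built on a warp-wide prefix sum implemented with shuffle instructions. A minimal sketch of that primitive only, not of the kernels themselves, using the pre-CUDA-9 __shfl_up intrinsic as the 2013 tree does:

    // warp-wide inclusive scan: after the loop, lane i holds the sum of lanes 0..i
    __device__ int warpInclusiveScan(int value)
    {
        const int laneId = threadIdx.x & 31;              // lane index within the warp
        #pragma unroll
        for (int offset = 1; offset < 32; offset *= 2)
        {
            const int n = __shfl_up(value, offset, 32);   // read 'value' from lane (laneId - offset)
            if (laneId >= offset)
                value += n;
        }
        return value;
    }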

View File

@@ -76,7 +76,7 @@ typedef unsigned char uchar;
shrank[ y * outPitch + x] = shrink<FACTOR>(ptr, inPitch, y, x);
}
void shrink(const cv::gpu::PtrStepSzb& channels, cv::gpu::PtrStepSzb shrunk)
void shrink(const cv::cuda::PtrStepSzb& channels, cv::cuda::PtrStepSzb shrunk)
{
dim3 block(32, 8);
dim3 grid(shrunk.cols / 32, shrunk.rows / 8);
@@ -124,7 +124,7 @@ typedef unsigned char uchar;
luvg[luvgPitch * (y + 2 * 480) + x] = v;
}
void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv)
void bgr2Luv(const cv::cuda::PtrStepSzb& bgr, cv::cuda::PtrStepSzb luv)
{
dim3 block(32, 8);
dim3 grid(bgr.cols / 32, bgr.rows / 8);
@@ -206,7 +206,7 @@ typedef unsigned char uchar;
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tgray;
template<bool isDefaultNum>
__global__ void gray2hog(cv::gpu::PtrStepSzb mag)
__global__ void gray2hog(cv::cuda::PtrStepSzb mag)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -221,7 +221,7 @@ typedef unsigned char uchar;
mag( 480 * fast_angle_bin<isDefaultNum>(dy, dx) + y, x) = cmag;
}
void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins)
void gray2hog(const cv::cuda::PtrStepSzb& gray, cv::cuda::PtrStepSzb mag, const int bins)
{
dim3 block(32, 8);
dim3 grid(gray.cols / 32, gray.rows / 8);
@@ -250,7 +250,7 @@ typedef unsigned char uchar;
hog[((fh * bin) + y) * hogPitch + x] = val;
}
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
void fillBins(cv::cuda::PtrStepSzb hogluv, const cv::cuda::PtrStepSzf& nangle,
const int fw, const int fh, const int bins, cudaStream_t stream )
{
const uchar* mag = (const uchar*)hogluv.ptr(fh * bins);
@@ -324,8 +324,8 @@ typedef unsigned char uchar;
}
}
void suppress(const cv::gpu::PtrStepSzb& objects, cv::gpu::PtrStepSzb overlaps, cv::gpu::PtrStepSzi ndetections,
cv::gpu::PtrStepSzb suppressed, cudaStream_t stream)
void suppress(const cv::cuda::PtrStepSzb& objects, cv::cuda::PtrStepSzb overlaps, cv::cuda::PtrStepSzi ndetections,
cv::cuda::PtrStepSzb suppressed, cudaStream_t stream)
{
int block = 192;
int grid = 1;
@@ -527,8 +527,8 @@ __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* ob
}
template<typename Policy>
void CascadeInvoker<Policy>::operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv,
cv::gpu::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
void CascadeInvoker<Policy>::operator()(const cv::cuda::PtrStepSzb& roi, const cv::cuda::PtrStepSzi& hogluv,
cv::cuda::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
{
int fw = roi.rows;
int fh = roi.cols;
@@ -560,7 +560,7 @@ void CascadeInvoker<Policy>::operator()(const cv::gpu::PtrStepSzb& roi, const cv
}
}
template void CascadeInvoker<GK107PolicyX4>::operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv,
cv::gpu::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
template void CascadeInvoker<GK107PolicyX4>::operator()(const cv::cuda::PtrStepSzb& roi, const cv::cuda::PtrStepSzi& hogluv,
cv::cuda::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
}}}
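
The launch code above sizes grids either by exact division (dim3 grid(cols / 32, rows / 8)) or with the rounded-up cv::cuda::cudev::divUp helper. The rounding rule, reimplemented here only for illustration:

    // round-up integer division used to size grids when the extent is not a block multiple
    static inline int divUp(int total, int grain)
    {
        return (total + grain - 1) / grain;
    }
    // e.g. divUp(641, 32) == 21, so a 641-column integral gets 21 blocks of 32 threads each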

View File

@@ -128,8 +128,8 @@ struct CascadeInvoker
{
CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}
CascadeInvoker(const cv::gpu::PtrStepSzb& _levels, const cv::gpu::PtrStepSzf& _stages,
const cv::gpu::PtrStepSzb& _nodes, const cv::gpu::PtrStepSzf& _leaves)
CascadeInvoker(const cv::cuda::PtrStepSzb& _levels, const cv::cuda::PtrStepSzf& _stages,
const cv::cuda::PtrStepSzb& _nodes, const cv::cuda::PtrStepSzf& _leaves)
: levels((const Level*)_levels.ptr()),
stages((const float*)_stages.ptr()),
nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
@@ -144,7 +144,7 @@ struct CascadeInvoker
int scales;
void operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects,
void operator()(const cv::cuda::PtrStepSzb& roi, const cv::cuda::PtrStepSzi& hogluv, cv::cuda::PtrStepSz<uchar4> objects,
const int downscales, const cudaStream_t& stream = 0) const;
template<bool isUp>

View File

@@ -50,7 +50,7 @@ cv::softcascade::SCascade::~SCascade() { throw_no_cuda(); }
bool cv::softcascade::SCascade::load(const FileNode&) { throw_no_cuda(); return false;}
void cv::softcascade::SCascade::detect(InputArray, InputArray, OutputArray, cv::gpu::Stream&) const { throw_no_cuda(); }
void cv::softcascade::SCascade::detect(InputArray, InputArray, OutputArray, cv::cuda::Stream&) const { throw_no_cuda(); }
void cv::softcascade::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
@@ -85,18 +85,18 @@ cv::softcascade::cudev::Level::Level(int idx, const Octave& oct, const float sca
namespace cv { namespace softcascade { namespace cudev {
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
void fillBins(cv::cuda::PtrStepSzb hogluv, const cv::cuda::PtrStepSzf& nangle,
const int fw, const int fh, const int bins, cudaStream_t stream);
void suppress(const cv::gpu::PtrStepSzb& objects, cv::gpu::PtrStepSzb overlaps, cv::gpu::PtrStepSzi ndetections,
cv::gpu::PtrStepSzb suppressed, cudaStream_t stream);
void suppress(const cv::cuda::PtrStepSzb& objects, cv::cuda::PtrStepSzb overlaps, cv::cuda::PtrStepSzi ndetections,
cv::cuda::PtrStepSzb suppressed, cudaStream_t stream);
void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv);
void transform(const cv::gpu::PtrStepSz<uchar3>& bgr, cv::gpu::PtrStepSzb gray);
void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins);
void shrink(const cv::gpu::PtrStepSzb& channels, cv::gpu::PtrStepSzb shrunk);
void bgr2Luv(const cv::cuda::PtrStepSzb& bgr, cv::cuda::PtrStepSzb luv);
void transform(const cv::cuda::PtrStepSz<uchar3>& bgr, cv::cuda::PtrStepSzb gray);
void gray2hog(const cv::cuda::PtrStepSzb& gray, cv::cuda::PtrStepSzb mag, const int bins);
void shrink(const cv::cuda::PtrStepSzb& channels, cv::cuda::PtrStepSzb shrunk);
void shfl_integral(const cv::gpu::PtrStepSzb& img, cv::gpu::PtrStepSz<unsigned int> integral, cudaStream_t stream);
void shfl_integral(const cv::cuda::PtrStepSzb& img, cv::cuda::PtrStepSz<unsigned int> integral, cudaStream_t stream);
}}}
struct cv::softcascade::SCascade::Fields
@@ -333,7 +333,7 @@ struct cv::softcascade::SCascade::Fields
preprocessor = ChannelsProcessor::create(shrinkage, 6, method);
}
void detect(cv::gpu::GpuMat& objects, cv::gpu::Stream& s) const
void detect(cv::cuda::GpuMat& objects, cv::cuda::Stream& s) const
{
objects.setTo(Scalar::all(0), s);
@@ -342,19 +342,19 @@ struct cv::softcascade::SCascade::Fields
cudev::CascadeInvoker<cudev::GK107PolicyX4> invoker
= cudev::CascadeInvoker<cudev::GK107PolicyX4>(levels, stages, nodes, leaves);
cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
cudaStream_t stream = cv::cuda::StreamAccessor::getStream(s);
invoker(mask, hogluv, objects, downscales, stream);
}
void suppress(cv::gpu::GpuMat& objects, cv::gpu::Stream& s)
void suppress(cv::cuda::GpuMat& objects, cv::cuda::Stream& s)
{
cv::gpu::GpuMat ndetections = cv::gpu::GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
cv::cuda::GpuMat ndetections = cv::cuda::GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
overlaps.setTo(0, s);
suppressed.setTo(0, s);
cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
cudaStream_t stream = cv::cuda::StreamAccessor::getStream(s);
cudev::suppress(objects, overlaps, ndetections, suppressed, stream);
}
@@ -398,34 +398,34 @@ public:
// 160x120x10
cv::gpu::GpuMat shrunk;
cv::cuda::GpuMat shrunk;
// temporal mat for integral
cv::gpu::GpuMat integralBuffer;
cv::cuda::GpuMat integralBuffer;
// 161x121x10
cv::gpu::GpuMat hogluv;
cv::cuda::GpuMat hogluv;
// used for suppression
cv::gpu::GpuMat suppressed;
cv::cuda::GpuMat suppressed;
// used for area overlap computing during suppression
cv::gpu::GpuMat overlaps;
cv::cuda::GpuMat overlaps;
// Cascade from xml
cv::gpu::GpuMat octaves;
cv::gpu::GpuMat stages;
cv::gpu::GpuMat nodes;
cv::gpu::GpuMat leaves;
cv::gpu::GpuMat levels;
cv::cuda::GpuMat octaves;
cv::cuda::GpuMat stages;
cv::cuda::GpuMat nodes;
cv::cuda::GpuMat leaves;
cv::cuda::GpuMat levels;
// For ROI
cv::gpu::GpuMat mask;
cv::gpu::GpuMat genRoiTmp;
cv::cuda::GpuMat mask;
cv::cuda::GpuMat genRoiTmp;
// cv::gpu::GpuMat collected;
// cv::cuda::GpuMat collected;
std::vector<cudev::Octave> voctaves;
@@ -458,18 +458,18 @@ bool cv::softcascade::SCascade::load(const FileNode& fn)
namespace {
void integral(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& sum, cv::gpu::GpuMat& buffer, cv::gpu::Stream& s)
void integral(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& sum, cv::cuda::GpuMat& buffer, cv::cuda::Stream& s)
{
CV_Assert(src.type() == CV_8UC1);
cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
cudaStream_t stream = cv::cuda::StreamAccessor::getStream(s);
cv::Size whole;
cv::Point offset;
src.locateROI(whole, offset);
if (cv::gpu::deviceSupports(cv::gpu::WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
if (cv::cuda::deviceSupports(cv::cuda::WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
&& offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
{
ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
@@ -479,8 +479,8 @@ void integral(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& sum, cv::gpu::GpuMat&
sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
sum.setTo(cv::Scalar::all(0), s);
cv::gpu::GpuMat inner = sum(cv::Rect(1, 1, src.cols, src.rows));
cv::gpu::GpuMat res = buffer(cv::Rect(0, 0, src.cols, src.rows));
cv::cuda::GpuMat inner = sum(cv::Rect(1, 1, src.cols, src.rows));
cv::cuda::GpuMat res = buffer(cv::Rect(0, 0, src.cols, src.rows));
res.copyTo(inner, s);
}
@@ -489,7 +489,7 @@ void integral(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& sum, cv::gpu::GpuMat&
}
void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, cv::gpu::Stream& s) const
void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, cv::cuda::Stream& s) const
{
CV_Assert(fields);
@@ -497,11 +497,11 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
int type = _image.type();
CV_Assert(type == CV_8UC3 || type == CV_32SC1 || (!_rois.empty()));
const cv::gpu::GpuMat image = _image.getGpuMat();
const cv::cuda::GpuMat image = _image.getGpuMat();
if (_objects.empty()) _objects.create(1, 4096 * sizeof(Detection), CV_8UC1);
cv::gpu::GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
cv::cuda::GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
/// roi
Fields& flds = *fields;
@@ -510,7 +510,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type());
cudev::shrink(rois, flds.mask);
//cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
//cv::cuda::transpose(flds.genRoiTmp, flds.mask, s);
if (type == CV_8UC3)
{
@@ -531,7 +531,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
if ( (flags && NMS_MASK) != NO_REJECT)
{
cv::gpu::GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
cv::cuda::GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
flds.suppress(objects, s);
flds.suppressed.copyTo(spr);
}
@@ -546,10 +546,10 @@ namespace {
using cv::InputArray;
using cv::OutputArray;
using cv::gpu::Stream;
using cv::gpu::GpuMat;
using cv::cuda::Stream;
using cv::cuda::GpuMat;
inline void setZero(cv::gpu::GpuMat& m, cv::gpu::Stream& s)
inline void setZero(cv::cuda::GpuMat& m, cv::cuda::Stream& s)
{
m.setTo(0, s);
}
@@ -559,22 +559,22 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
SeparablePreprocessor(const int s, const int b) : cv::softcascade::ChannelsProcessor(), shrinkage(s), bins(b) {}
virtual ~SeparablePreprocessor() {}
virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null())
virtual void apply(InputArray _frame, OutputArray _shrunk, cv::cuda::Stream& s = cv::cuda::Stream::Null())
{
bgr = _frame.getGpuMat();
//cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
//cv::cuda::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
_shrunk.create(bgr.rows * (4 + bins) / shrinkage, bgr.cols / shrinkage, CV_8UC1);
cv::gpu::GpuMat shrunk = _shrunk.getGpuMat();
cv::cuda::GpuMat shrunk = _shrunk.getGpuMat();
channels.create(bgr.rows * (4 + bins), bgr.cols, CV_8UC1);
setZero(channels, s);
gray.create(bgr.size(), CV_8UC1);
cv::softcascade::cudev::transform(bgr, gray); //cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
cv::softcascade::cudev::transform(bgr, gray); //cv::cuda::cvtColor(bgr, gray, CV_BGR2GRAY);
cv::softcascade::cudev::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3));
cv::cuda::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3));
cv::softcascade::cudev::bgr2Luv(bgr, luv);
cv::softcascade::cudev::shrink(channels, shrunk);
}
@@ -583,9 +583,9 @@ private:
const int shrinkage;
const int bins;
cv::gpu::GpuMat bgr;
cv::gpu::GpuMat gray;
cv::gpu::GpuMat channels;
cv::cuda::GpuMat bgr;
cv::cuda::GpuMat gray;
cv::cuda::GpuMat channels;
SeparablePreprocessor& operator=( const SeparablePreprocessor& );
};
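
Fields::detect and Fields::suppress above unwrap the high-level stream into a raw cudaStream_t before launching kernels. A self-contained sketch of that interop pattern; only StreamAccessor::getStream is taken from the sources above, while the header paths and the kernel are illustrative assumptions:

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/core/cuda_stream_accessor.hpp>   // header paths assumed for the renamed tree

    __global__ void fillKernel(unsigned char* data, size_t step, int rows, int cols, unsigned char v)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x < cols && y < rows)
            data[y * step + x] = v;                    // step is in bytes, valid for CV_8UC1
    }

    // enqueue the fill on the same stream the rest of the pipeline uses
    void fillAsync(cv::cuda::GpuMat& m, cv::cuda::Stream& s)
    {
        CV_Assert(m.type() == CV_8UC1);
        cudaStream_t raw = cv::cuda::StreamAccessor::getStream(s);   // unwrap cv::cuda::Stream
        const dim3 block(32, 8);
        const dim3 grid((m.cols + block.x - 1) / block.x, (m.rows + block.y - 1) / block.y);
        fillKernel<<<grid, block, 0, raw>>>(m.data, m.step, m.rows, m.cols, 1);
    }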

View File

@@ -156,11 +156,11 @@ namespace
#endif
}
class SCascadeTestRoi : public ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> >
class SCascadeTestRoi : public ::testing::TestWithParam<std::tr1::tuple<cv::cuda::DeviceInfo, std::string, std::string, int> >
{
virtual void SetUp()
{
cv::gpu::setDevice(get<0>(GetParam()).deviceID());
cv::cuda::setDevice(get<0>(GetParam()).deviceID());
}
};
@@ -176,7 +176,7 @@ TEST_P(SCascadeTestRoi, Detect)
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1);
cv::cuda::GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(0);
int nroi = get<3>(GetParam());
@@ -185,7 +185,7 @@ TEST_P(SCascadeTestRoi, Detect)
for (int i = 0; i < nroi; ++i)
{
cv::Rect r = getFromTable(rng(10));
cv::gpu::GpuMat sub(rois, r);
cv::cuda::GpuMat sub(rois, r);
sub.setTo(1);
cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
}
@@ -230,7 +230,7 @@ struct Fixture
};
}
typedef std::tr1::tuple<cv::gpu::DeviceInfo, Fixture> SCascadeTestAllFixture;
typedef std::tr1::tuple<cv::cuda::DeviceInfo, Fixture> SCascadeTestAllFixture;
class SCascadeTestAll : public ::testing::TestWithParam<SCascadeTestAllFixture>
{
protected:
@@ -239,7 +239,7 @@ protected:
virtual void SetUp()
{
cv::gpu::setDevice(get<0>(GetParam()).deviceID());
cv::cuda::setDevice(get<0>(GetParam()).deviceID());
xml = path(get<1>(GetParam()).path);
expected = get<1>(GetParam()).expected;
}
@@ -257,7 +257,7 @@ TEST_P(SCascadeTestAll, detect)
cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png"));
ASSERT_FALSE(coloredCpu.empty());
cv::gpu::GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1);
cv::cuda::GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1);
rois.setTo(1);
cascade.detect(colored, rois, objectBoxes);
@@ -294,10 +294,10 @@ TEST_P(SCascadeTestAll, detectStream)
cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png"));
ASSERT_FALSE(coloredCpu.empty());
cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
cv::cuda::GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(cv::Scalar::all(1));
cv::gpu::Stream s;
cv::cuda::Stream s;
objectBoxes.setTo(0);
cascade.detect(colored, rois, objectBoxes, s);

View File

@@ -46,7 +46,7 @@
using namespace std;
using namespace cv;
using namespace cv::gpu;
using namespace cv::cuda;
using namespace cvtest;
using namespace testing;
using namespace testing::internal;

View File

@@ -49,7 +49,7 @@
//////////////////////////////////////////////////////////////////////
// Gpu devices
//! return true if the device supports the specified feature and the gpu module was built with support for that feature.
bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
bool supportFeature(const cv::cuda::DeviceInfo& info, cv::cuda::FeatureSet feature);
#if defined(HAVE_CUDA)
@@ -61,15 +61,15 @@ public:
void load(int i);
void loadAll();
const std::vector<cv::gpu::DeviceInfo>& values() const { return devices_; }
const std::vector<cv::cuda::DeviceInfo>& values() const { return devices_; }
private:
std::vector<cv::gpu::DeviceInfo> devices_;
std::vector<cv::cuda::DeviceInfo> devices_;
DeviceManager() {loadAll();}
};
# define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values())
#else
# define ALL_DEVICES testing::ValuesIn(std::vector<cv::gpu::DeviceInfo>())
# define ALL_DEVICES testing::ValuesIn(std::vector<cv::cuda::DeviceInfo>())
#endif
#endif // __OPENCV_GPU_TEST_UTILITY_HPP__
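
DeviceManager above fills its list at construction; a hedged sketch of the same enumeration with the renamed API (getCudaEnabledDeviceCount, DeviceInfo and setDevice exist under cv::cuda in this branch; isCompatible() is used here only as a stand-in for the per-feature check in supportFeature):

    #include <vector>
    #include <opencv2/core/cuda.hpp>   // header path assumed

    std::vector<cv::cuda::DeviceInfo> enumerateDevices()
    {
        std::vector<cv::cuda::DeviceInfo> devices;
        const int count = cv::cuda::getCudaEnabledDeviceCount();
        for (int i = 0; i < count; ++i)
        {
            cv::cuda::DeviceInfo info(i);
            if (info.isCompatible())            // device arch matches what the module was built for
                devices.push_back(info);
        }
        return devices;
    }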