diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 0de2d8e37..ac4b8f0e8 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -161,192 +161,128 @@ namespace icf {
     }
 
     texture<float2,  cudaTextureType2D, cudaReadModeElementType> troi;
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
-    template<bool isUp>
-    __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
-        const int downscales)
+
+template<typename Policy>
+template<bool isUp>
+__device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
+{
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x;
+
+    // load Lavel
+    __shared__ Level level;
+
+    // check POI
+    __shared__ volatile char roiCache[Policy::STA_Y];
+
+    if (!threadIdx.y && !threadIdx.x)
+        ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
+
+    __syncthreads();
+
+    if (!roiCache[threadIdx.y]) return;
+
+    if (!threadIdx.x)
+        level = levels[downscales + blockIdx.z];
+
+    if(x >= level.workRect.x || y >= level.workRect.y) return;
+
+    int st = level.octave * level.step;
+    const int stEnd = st + level.step;
+
+    float confidence = 0.f;
+    for(; st < stEnd; st += Policy::WARP)
     {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x;
+        const int nId = (st + threadIdx.x) * 3;
 
-        // load Lavel
-        __shared__ Level level;
+        Node node = nodes[nId];
 
-        // check POI
-        __shared__ volatile char roiCache[8];
-        if (!threadIdx.y && !threadIdx.x)
-            ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
+        float threshold = rescale<isUp>(level, node);
+        int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
 
-        __syncthreads();
+        int next = 1 + (int)(sum >= threshold);
 
-        if (!roiCache[threadIdx.y]) return;
+        node = nodes[nId + next];
+        threshold = rescale<isUp>(level, node);
+        sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
 
-        if (!threadIdx.x)
-            level = levels[downscales + blockIdx.z];
+        const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+        float impact = leaves[(st + threadIdx.x) * 4 + lShift];
 
-        if(x >= level.workRect.x || y >= level.workRect.y) return;
-
-        Octave octave = octaves[level.octave];
-        int st = octave.index * octave.stages;
-        const int stEnd = st + 1024;
-
-        float confidence = 0.f;
-
-        for(; st < stEnd; st += 32)
-        {
-
-            const int nId = (st + threadIdx.x) * 3;
-            dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId);
-            Node node = nodes[nId];
-
-            float threshold = rescale<isUp>(level, node);
-            int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
-
-            int next = 1 + (int)(sum >= threshold);
-            dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);
-
-            node = nodes[nId + next];
-            threshold = rescale<isUp>(level, node);
-            sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
-
-            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-            float impact = leaves[(st + threadIdx.x) * 4 + lShift];
-
-            dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact);
-            dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]);
-            dprintf("%d: computed  score: %f\n",threadIdx.x, impact);
 #pragma unroll
-            // scan on shuffl functions
-            for (int i = 1; i < 32; i *= 2)
-            {
-                const float n = __shfl_up(impact, i, 32);
-
-                if (threadIdx.x >= i)
-                    impact += n;
-            }
-
-            dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact);
-
-            confidence += impact;
-            if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
-        }
-
-        if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
+        // scan on shuffl functions
+        for (int i = 1; i < Policy::WARP; i *= 2)
         {
-            int idx = atomicInc(ctr, ndetections);
-            // store detection
-            objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
-                __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
+            const float n = __shfl_up(impact, i, Policy::WARP);
+
+            if (threadIdx.x >= i)
+                impact += n;
         }
+
+        confidence += impact;
+        if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
     }
-#else
-    template<bool isUp>
-    __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
-        const int downscales)
+
+    if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
     {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        Level level = levels[blockIdx.z];
-
-        // if (blockIdx.z != 31) return;
-        if(x >= level.workRect.x || y >= level.workRect.y) return;
-
-        // int roi = tex2D(troi, x, y);
-        // printf("%d\n", roi);
-        // if (!roi) return;
-
-        Octave octave = octaves[level.octave];
-
-        int st = octave.index * octave.stages;
-        const int stEnd = st + 1000;//octave.stages;
-
-        float confidence = 0.f;
-
-        for(; st < stEnd; ++st)
-        {
-            dprintf("\n\nstage: %d\n", st);
-            const int nId = st * 3;
-            Node node = nodes[nId];
-
-            dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
-                node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
-
-            float threshold = rescale<isUp>(level, node);
-            int sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
-
-            dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
-                node.rect.w, threshold);
-
-            int next = 1 + (int)(sum >= threshold);
-            dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
-
-            node = nodes[nId + next];
-            threshold = rescale<isUp>(level, node);
-            sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
-
-            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-            float impact = leaves[st * 4 + lShift];
-            confidence += impact;
-
-            if (confidence <= stages[st]) st = stEnd + 10;
-            dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-            dprintf("extracted stage: %f\n", stages[st]);
-            dprintf("computed  score: %f\n\n", confidence);
-        }
-
-        if(st == stEnd)
-        {
-            int idx = atomicInc(ctr, ndetections);
-            // store detection
-            objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
-                __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
-        }
-    }
-#endif
-
-    template<>
-    void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
-    {
-        int fw = 160;
-        int fh = 120;
-
-        dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1);
-
-        uint* ctr = (uint*)(counter.ptr(0));
-        Detection* det = (Detection*)objects.ptr();
-        uint max_det = objects.cols / sizeof(Detection);
-
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
-        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
-
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
-
-        if (scale == -1)
-        {
-            test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
-            cudaSafeCall( cudaGetLastError());
-
-            grid = dim3(fw, fh / 8, 47 - downscales);
-            test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
-        }
-        else
-        {
-            if (scale >= downscales)
-                test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
-            else
-                test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
-        }
-
-        if (!stream)
-        {
-            cudaSafeCall( cudaGetLastError());
-            cudaSafeCall( cudaDeviceSynchronize());
-        }
+        int idx = atomicInc(ctr, ndetections);
+        objects[idx] = Detection(__float2int_rn(x * Policy::SHRINKAGE),
+            __float2int_rn(y * Policy::SHRINKAGE), level.objSize.x, level.objSize.y, confidence);
     }
 }
+
+template<typename Policy, bool isUp>
+__global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* objects, const uint n, uint* ctr, const int downs)
+{
+    invoker.template detect<isUp>(objects, n, ctr, downs);
+}
+
+template<typename Policy>
+void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
+    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
+{
+    int fw = 160;
+    int fh = 120;
+
+    dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? downscales : 1);
+
+    uint* ctr = (uint*)(counter.ptr(0));
+    Detection* det = (Detection*)objects.ptr();
+    uint max_det = objects.cols / sizeof(Detection);
+
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
+    cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
+
+    cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
+    cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
+
+    const CascadeInvoker<Policy> inv = *this;
+
+    if (scale == -1)
+    {
+        soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, 0);
+        cudaSafeCall( cudaGetLastError());
+
+        grid = dim3(fw, fh / Policy::STA_Y, scales - downscales);
+        soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, downscales);
+    }
+    else
+    {
+        if (scale >= downscales)
+            soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, scale);
+        else
+            soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, scale);
+    }
+
+    if (!stream)
+    {
+        cudaSafeCall( cudaGetLastError());
+        cudaSafeCall( cudaDeviceSynchronize());
+    }
+}
+
+template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
+    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const;
+
+}
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 60df55882..8eb080e23 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -72,9 +72,9 @@ struct __align__(16) Octave
 struct __align__(8) Level //is actually 24 bytes
 {
     int octave;
+    int step;
 
     float relScale;
-    float shrScale;   // used for marking detection
     float scaling[2]; // calculated according to Dollal paper
 
     // for 640x480 we can not get overflow
@@ -115,31 +115,41 @@ struct __align__(16) Detection
     : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
 };
 
-struct CascadePolicy
+struct GK107PolicyX4
 {
-    enum {STA_X = 32, STA_Y = 8};
+    enum {WARP = 32, STA_X = WARP, STA_Y = 8, SHRINKAGE = 4};
+    static const dim3 block()
+    {
+        return dim3(GK107PolicyX4::STA_X, GK107PolicyX4::STA_Y);
+    }
 };
 
 template<typename Policy>
 struct CascadeInvoker
 {
-    CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {}
+    CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}
+
     CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages,
                    const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
-    : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()),
-       nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr())
+    : levels((const Level*)_levels.ptr()),
+      stages((const float*)_stages.ptr()),
+      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
+      scales(_levels.cols / sizeof(Level))
     {}
 
     const Level*  levels;
-    const Octave* octaves;
-
     const float*  stages;
 
     const Node*   nodes;
     const float*  leaves;
 
+    int scales;
+
     void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
         PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const;
+
+    template<bool isUp>
+    __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;
 };
 
 }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index a69be9239..6133bd1cb 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -63,7 +63,7 @@ void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
 #include <icf.hpp>
 
 cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
-:  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+:  octave(idx), step(oct.stages), relScale(scale / oct.scale)
 {
     workRect.x = round(w / (float)oct.shrinkage);
     workRect.y = round(h / (float)oct.shrinkage);
@@ -100,7 +100,7 @@ namespace imgproc {
 
 struct cv::gpu::SCascade::Fields
 {
-    static Fields* parseCascade(const FileNode &root, const float mins, const float maxs)
+    static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals)
     {
         static const char *const SC_STAGE_TYPE          = "stageType";
         static const char *const SC_BOOST               = "BOOST";
@@ -119,11 +119,8 @@ struct cv::gpu::SCascade::Fields
         static const char *const SC_ORIG_W              = "width";
         static const char *const SC_ORIG_H              = "height";
 
-        int origWidth = (int)root[SC_ORIG_W];
-        CV_Assert(origWidth  == ORIG_OBJECT_WIDTH);
-
+        int origWidth  = (int)root[SC_ORIG_W];
         int origHeight = (int)root[SC_ORIG_H];
-        CV_Assert(origHeight == ORIG_OBJECT_HEIGHT);
 
         static const char *const SC_OCTAVES             = "octaves";
         static const char *const SC_STAGES              = "stages";
@@ -142,7 +139,6 @@ struct cv::gpu::SCascade::Fields
         static const char * const SC_F_CHANNEL          = "channel";
         static const char * const SC_F_RECT             = "rect";
 
-
         FileNode fn = root[SC_OCTAVES];
             if (fn.empty()) return false;
 
@@ -167,8 +163,8 @@ struct cv::gpu::SCascade::Fields
 
             ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
             ushort2 size;
-            size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
-            size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
+            size.x = cvRound(origWidth * scale);
+            size.y = cvRound(origHeight * scale);
             shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
 
             Octave octave(octIndex, nstages, shrinkage, size, scale);
@@ -245,11 +241,11 @@ struct cv::gpu::SCascade::Fields
         CV_Assert(!hleaves.empty());
 
         std::vector<Level> vlevels;
-        float logFactor = (::log(maxs) - ::log(mins)) / (TOTAL_SCALES -1);
+        float logFactor = (::log(maxs) - ::log(mins)) / (totals -1);
 
         float scale = mins;
         int downscales = 0;
-        for (int sc = 0; sc < TOTAL_SCALES; ++sc)
+        for (int sc = 0; sc < totals; ++sc)
         {
             int width  = ::std::max(0.0f, FRAME_WIDTH - (origWidth  * scale));
             int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale));
@@ -302,7 +298,7 @@ struct cv::gpu::SCascade::Fields
         leaves.upload(hleaves);
         levels.upload(hlevels);
 
-        invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
+        invoker = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, octaves, stages, nodes, leaves);
 
     }
 
@@ -456,16 +452,13 @@ public:
 
     GpuMat sobelBuf;
 
-    device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
+    device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker;
 
     enum { BOOST = 0 };
     enum
     {
         FRAME_WIDTH        = 640,
         FRAME_HEIGHT       = 480,
-        TOTAL_SCALES       = 55,
-        ORIG_OBJECT_WIDTH  = 64,
-        ORIG_OBJECT_HEIGHT = 128,
         HOG_BINS           = 6,
         LUV_BINS           = 3,
         HOG_LUV_BINS       = 10
@@ -480,21 +473,19 @@ cv::gpu::SCascade::~SCascade() { delete fields; }
 bool cv::gpu::SCascade::load(const FileNode& fn)
 {
     if (fields) delete fields;
-    fields = Fields::parseCascade(fn, minScale, maxScale);
+    fields = Fields::parseCascade(fn, minScale, maxScale, scales);
     return fields != 0;
 }
 
 void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const
 {
+    CV_Assert(fields);
+
     const GpuMat colored = image.getGpuMat();
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
     GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
-
-    // we guess user knows about shrincage
-    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
-
     Fields& flds = *fields;
 
     if (colored.type() == CV_8UC3)
@@ -518,15 +509,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 
 void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const
 {
+    CV_Assert(fields);
+
     const GpuMat colored = image.getGpuMat();
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
-    // we guess user knows about shrincage
-    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
-
     Fields& flds = *fields;
-
     if (colored.type() == CV_8UC3)
     {
         // only this window size allowed
@@ -549,6 +538,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
 {
+    CV_Assert(fields);
+
     const GpuMat roi = _roi.getGpuMat();
     _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
     GpuMat mask = _mask.getGpuMat();
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index cfae940c7..7034b33b9 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -172,7 +172,7 @@ GPU_TEST_P(SCascadeTestRoi, detect,
         sub.setTo(1);
         cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
     }
-
+    objectBoxes.setTo(0);
     cascade.genRoi(rois, trois);
     cascade.detect(colored, trois, objectBoxes);
 
@@ -222,7 +222,7 @@ GPU_TEST_P(SCascadeTestLevel, detect,
 
     cv::gpu::GpuMat trois;
     cascade.genRoi(rois, trois);
-
+    objectBoxes.setTo(0);
     int level = GET_PARAM(3);
     cascade.detect(colored, trois, objectBoxes, level);
 
@@ -281,7 +281,7 @@ GPU_TEST_P(SCascadeTestAll, detect,
 
     cv::gpu::GpuMat trois;
     cascade.genRoi(rois, trois);
-
+    objectBoxes.setTo(0);
     cascade.detect(colored, trois, objectBoxes);
 
     typedef cv::gpu::SCascade::Detection Detection;
@@ -321,7 +321,7 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral,
 
     cv::gpu::GpuMat trois;
     cascade.genRoi(rois, trois);
-
+    objectBoxes.setTo(0);
     cascade.detect(hogluv, trois, objectBoxes);
 
     typedef cv::gpu::SCascade::Detection Detection;
@@ -357,7 +357,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream,
 
     cv::gpu::GpuMat trois;
     cascade.genRoi(rois, trois, s);
-
+    objectBoxes.setTo(0);
     cascade.detect(colored, trois, objectBoxes, s);
 
     cudaDeviceSynchronize();