diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu
index 558f9085d..5bd35bdc7 100644
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -383,6 +383,89 @@ namespace cv { namespace gpu { namespace device
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
+
+        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
+        {
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+            __shared__ unsigned int sums[32][9];
+
+            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int lane_id = tidx % 8;
+
+            if (tidx >= integral.cols)
+                return;
+
+            sums[threadIdx.x][threadIdx.y] = 0;
+            __syncthreads();
+
+            unsigned int stepSum = 0;
+
+            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
+            {
+                unsigned int* p = buffer.ptr(y) + tidx;
+                unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
+
+                unsigned int sum = *p;
+
+                sums[threadIdx.x][threadIdx.y] = sum;
+                __syncthreads();
+
+                // place into SMEM
+                // shfl scan reduce the SMEM, reformating so the column
+                // sums are computed in a warp
+                // then read out properly
+                const int j = threadIdx.x % 8;
+                const int k = threadIdx.x / 8 + threadIdx.y * 4;
+
+                int partial_sum = sums[k][j];
+
+                for (int i = 1; i <= 8; i *= 2)
+                {
+                    int n = __shfl_up(partial_sum, i, 32);
+
+                    if (lane_id >= i)
+                        partial_sum += n;
+                }
+
+                sums[k][j] = partial_sum;
+                __syncthreads();
+
+                if (threadIdx.y > 0)
+                    sum += sums[threadIdx.x][threadIdx.y - 1];
+
+                sum += stepSum;
+                stepSum += sums[threadIdx.x][blockDim.y - 1];
+
+                __syncthreads();
+
+                *dst = sum;
+            }
+        #endif
+        }
+
+        // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
+        // ToDo: partial dy
+        void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
+            int blockStep, cudaStream_t stream)
+        {
+            {
+                const int block = blockStep;
+                const int grid = img.rows;
+
+                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
+
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            {
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(integral.cols, block.x), 1);
+
+                shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        }
     }
 }}}
 
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 7aef41abc..3391bb1a0 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -198,14 +198,14 @@ namespace icf {
             Node node = nodes[nId];
 
             float threshold = rescale<isUp>(level, node);
-            int sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
+            int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
 
             int next = 1 + (int)(sum >= threshold);
             dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);
 
             node = nodes[nId + next];
             threshold = rescale<isUp>(level, node);
-            sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
+            sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
 
             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
             float impact = leaves[(st + threadIdx.x) * 4 + lShift];
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index d9519e873..2d43a5440 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -76,14 +76,20 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale
 }
 
 namespace cv { namespace gpu { namespace device {
+
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
 }
-namespace imgproc
-{
-    void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+
+namespace imgproc {
+    void shfl_integral_gpu_buffered(PtrStepSzb, PtrStepSz<uint4>, PtrStepSz<unsigned int>, int, cudaStream_t);
+
+    template <typename T>
+    void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
+                    PtrStepSzb dst, int interpolation, cudaStream_t stream);
 }
+
 }}}
 
 struct cv::gpu::SoftCascade::Filds
@@ -319,9 +325,13 @@ struct cv::gpu::SoftCascade::Filds
         plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
         fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
         luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
+
         shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1);
-        integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1);
-        hogluv.create((FRAME_HEIGHT / shr + 1) * HOG_LUV_BINS, FRAME_WIDTH / shr + 64, CV_32SC1);
+        integralBuffer.create(shrunk.rows, shrunk.cols, CV_32SC1);
+
+        hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1);
+        hogluv.setTo(cv::Scalar::all(0));
+
         detCounter.create(1,1, CV_32SC1);
 
         octaves.upload(hoctaves);
@@ -432,16 +442,7 @@ private:
 
         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
         cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-
-        fw /= shrinkage;
-        fh /= shrinkage;
-
-        for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-        {
-            GpuMat channel(shrunk, cv::Rect(0, fh  * i, fw, fh ));
-            GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
-            cv::gpu::integralBuffered(channel, sum, integralBuffer);
-        }
+        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0);
     }
 
 public:
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index fb936be88..1146b062b 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -271,6 +271,7 @@ GPU_TEST_P(SoftCascadeTestAll, detect,
     ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U);
 }
 
+//ToDo: fix me
 GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral,
         ALL_DEVICES
         )