Changed HOG to work with variable parameters (cell size, cells per block) and updated the HOG sample to test it with more options

Added comments and tests.
2015-06-11 13:29:23 +02:00
parent 424c2bddb3
commit 89889ae8ea
4 changed files with 467 additions and 113 deletions
--- a/modules/cudaobjdetect/src/cuda/hog.cu
+++ b/modules/cudaobjdetect/src/cuda/hog.cu
@@ -49,11 +49,6 @@

 namespace cv { namespace cuda { namespace device
 {
-    // Other values are not supported
-    #define CELL_WIDTH 8
-    #define CELL_HEIGHT 8
-    #define CELLS_PER_BLOCK_X 2
-    #define CELLS_PER_BLOCK_Y 2

    namespace hog
    {
@@ -62,6 +57,8 @@ namespace cv { namespace cuda { namespace device
        __constant__ int cblock_stride_y;
        __constant__ int cnblocks_win_x;
        __constant__ int cnblocks_win_y;
+        __constant__ int cncells_block_x;
+        __constant__ int cncells_block_y;
        __constant__ int cblock_hist_size;
        __constant__ int cblock_hist_size_2up;
        __constant__ int cdescr_size;
@@ -72,31 +69,47 @@ namespace cv { namespace cuda { namespace device
        the typical GPU thread count (pert block) values */
        int power_2up(unsigned int n)
        {
-            if (n < 1) return 1;
-            else if (n < 2) return 2;
-            else if (n < 4) return 4;
-            else if (n < 8) return 8;
-            else if (n < 16) return 16;
-            else if (n < 32) return 32;
-            else if (n < 64) return 64;
-            else if (n < 128) return 128;
-            else if (n < 256) return 256;
-            else if (n < 512) return 512;
-            else if (n < 1024) return 1024;
+            if (n <= 1) return 1;
+            else if (n <= 2) return 2;
+            else if (n <= 4) return 4;
+            else if (n <= 8) return 8;
+            else if (n <= 16) return 16;
+            else if (n <= 32) return 32;
+            else if (n <= 64) return 64;
+            else if (n <= 128) return 128;
+            else if (n <= 256) return 256;
+            else if (n <= 512) return 512;
+            else if (n <= 1024) return 1024;
            return -1; // Input is too big
        }

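+        /* Returns the number of histogram blocks (1 to 4) handled per GPU thread block: the largest count whose total thread count fits in 256, or 1 if none does */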
+        int max_nblocks(int nthreads, int ncells_block = 1)
+        {
+            int threads = nthreads * ncells_block;
+            if(threads * 4 <= 256)
+                return 4;
+            else if(threads * 3 <= 256)
+                return 3;
+            else if(threads * 2 <= 256)
+                return 2;
+            else
+                return 1;
+        }
+

        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                              int nblocks_win_x, int nblocks_win_y)
+                              int nblocks_win_x, int nblocks_win_y, int ncells_block_x, int ncells_block_y)
        {
            cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
            cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
            cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y)) );

-            int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
+            int block_hist_size = nbins * ncells_block_x * ncells_block_y;
            cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );

            int block_hist_size_2up = power_2up(block_hist_size);
@@ -112,44 +125,48 @@ namespace cv { namespace cuda { namespace device

        //----------------------------------------------------------------------------
        // Histogram computation
-
-
+        //
+        // CUDA kernel to compute the histograms
        template <int nblocks> // Number of histogram blocks processed by single GPU thread block
        __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad,
-                                                         const PtrStepb qangle, float scale, float* block_hists)
+                                                         const PtrStepb qangle, float scale, float* block_hists,
+                                                         int cell_size, int patch_size, int block_patch_size,
+                                                         int threads_cell, int threads_block, int half_cell_size)
        {
            const int block_x = threadIdx.z;
-            const int cell_x = threadIdx.x / 16;
+            const int cell_x = threadIdx.x / threads_cell;
            const int cell_y = threadIdx.y;
-            const int cell_thread_x = threadIdx.x & 0xF;
+            const int cell_thread_x = threadIdx.x & (threads_cell - 1);

            if (blockIdx.x * blockDim.z + block_x >= img_block_width)
                return;

            extern __shared__ float smem[];
            float* hists = smem;
-            float* final_hist = smem + cnbins * 48 * nblocks;
+            float* final_hist = smem + cnbins * block_patch_size * nblocks;

-            const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
-                                 4 * cell_x + cell_thread_x;
-            const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;
-
-            const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
-            const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;
-
-            // 12 means that 12 pixels affect on block's cell (in one row)
-            if (cell_thread_x < 12)
+            // patch_size is the number of pixels (in one row) that affect a block's cell
+            if (cell_thread_x < patch_size)
            {
-                float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y +
-                                            cell_x + block_x * CELLS_PER_BLOCK_X) +
+                const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
+                                     half_cell_size * cell_x + cell_thread_x;
+                const int offset_y = blockIdx.y * cblock_stride_y + half_cell_size * cell_y;
+
+                const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
+                const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;
+
+
+                float* hist = hists + patch_size * (cell_y * blockDim.z * cncells_block_y +
+                                            cell_x + block_x * cncells_block_x) +
                                           cell_thread_x;
                for (int bin_id = 0; bin_id < cnbins; ++bin_id)
-                    hist[bin_id * 48 * nblocks] = 0.f;
+                    hist[bin_id * block_patch_size * nblocks] = 0.f;

-                const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;
+                //(dist_x, dist_y) : distance between current pixel in patch and cell's center
+                const int dist_x = -half_cell_size + (int)cell_thread_x - half_cell_size * cell_x;

-                const int dist_y_begin = -4 - 4 * (int)threadIdx.y;
-                for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
+                const int dist_y_begin = -half_cell_size - half_cell_size * (int)threadIdx.y;
+                for (int dist_y = dist_y_begin; dist_y < dist_y_begin + patch_size; ++dist_y)
                {
                    float2 vote = *(const float2*)grad_ptr;
                    uchar2 bin = *(const uchar2*)qangle_ptr;
@@ -157,25 +174,29 @@ namespace cv { namespace cuda { namespace device
                    grad_ptr += grad.step/sizeof(float);
                    qangle_ptr += qangle.step;

-                    int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
-                    int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+                    //(dist_center_x, dist_center_y) : distance between current pixel in patch and block's center
+                    int dist_center_y = dist_y - half_cell_size * (1 - 2 * cell_y);
+                    int dist_center_x = dist_x - half_cell_size * (1 - 2 * cell_x);

                    float gaussian = ::expf(-(dist_center_y * dist_center_y +
                                              dist_center_x * dist_center_x) * scale);
-                    float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
-                                          (8.f - ::fabs(dist_x + 0.5f)) / 64.f;

-                    hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
-                    hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
+                    float interp_weight = ((float)cell_size - ::fabs(dist_y + 0.5f)) *
+                                          ((float)cell_size - ::fabs(dist_x + 0.5f)) / (float)threads_block;
+
+                    hist[bin.x * block_patch_size * nblocks] += gaussian * interp_weight * vote.x;
+                    hist[bin.y * block_patch_size * nblocks] += gaussian * interp_weight * vote.y;
                }

+                //reduction of the histograms
                volatile float* hist_ = hist;
-                for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)
+                for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += block_patch_size * nblocks)
                {
-                    if (cell_thread_x < 6) hist_[0] += hist_[6];
-                    if (cell_thread_x < 3) hist_[0] += hist_[3];
+                    if (cell_thread_x < patch_size/2) hist_[0] += hist_[patch_size/2];
+                    if (cell_thread_x < patch_size/4 && (!((patch_size/4) < 3 && cell_thread_x == 0)))
+                            hist_[0] += hist_[patch_size/4];
                    if (cell_thread_x == 0)
-                        final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id]
+                        final_hist[((cell_x + block_x * cncells_block_x) * cncells_block_y + cell_y) * cnbins + bin_id]
                            = hist_[0] + hist_[1] + hist_[2];
                }
            }
@@ -186,37 +207,69 @@ namespace cv { namespace cuda { namespace device
                                               blockIdx.x * blockDim.z + block_x) *
                                              cblock_hist_size;

-            int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
+            //copying from final_hist to block_hist
+            int tid;
+            if(threads_cell < cnbins)
+            {
+                tid = (cell_y * cncells_block_y + cell_x) * cnbins + cell_thread_x;
+            } else
+            {
+                tid = (cell_y * cncells_block_y + cell_x) * threads_cell + cell_thread_x;
+            }
            if (tid < cblock_hist_size)
+            {
                block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];
+                if(threads_cell < cnbins && cell_thread_x == (threads_cell-1))
+                {
+                    for(int i=1;i<=(cnbins - threads_cell);++i)
+                    {
+                        block_hist[tid + i] = final_hist[block_x * cblock_hist_size + tid + i];
+                    }
+                }
+            }
        }

-
+        // Declares the launch variables and invokes the kernel with the calculated number of blocks
        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
                           int height, int width, const PtrStepSzf& grad,
-                           const PtrStepSzb& qangle, float sigma, float* block_hists)
+                           const PtrStepSzb& qangle, float sigma, float* block_hists,
+                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
        {
-            const int nblocks = 1;
+            const int ncells_block = ncells_block_x * ncells_block_y;
+            const int patch_side = cell_size_x / 4;
+            const int patch_size = cell_size_x + (patch_side * 2);
+            const int block_patch_size = ncells_block * patch_size;
+            const int threads_cell = power_2up(patch_size);
+            const int threads_block = ncells_block * threads_cell;
+            const int half_cell_size = cell_size_x / 2;

-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) /
                                  block_stride_x;
-            int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /
+            int img_block_height = (height - ncells_block_y * cell_size_y + block_stride_y) /
                                   block_stride_y;

+            const int nblocks = max_nblocks(threads_cell, ncells_block);
            dim3 grid(divUp(img_block_width, nblocks), img_block_height);
-            dim3 threads(32, 2, nblocks);
-
-            cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,
-                                                cudaFuncCachePreferL1));
+            dim3 threads(threads_cell * ncells_block_x, ncells_block_y, nblocks);

            // Precompute gaussian spatial window parameter
            float scale = 1.f / (2.f * sigma * sigma);

-            int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);
-            int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);
+            int hists_size = (nbins * ncells_block * patch_size * nblocks) * sizeof(float);
+            int final_hists_size = (nbins * ncells_block * nblocks) * sizeof(float);
            int smem = hists_size + final_hists_size;
-            compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
-                img_block_width, grad, qangle, scale, block_hists);
+            if (nblocks == 4)
+                compute_hists_kernel_many_blocks<4><<<grid, threads, smem>>>(
+                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            else if (nblocks == 3)
+                compute_hists_kernel_many_blocks<3><<<grid, threads, smem>>>(
+                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            else if (nblocks == 2)
+                compute_hists_kernel_many_blocks<2><<<grid, threads, smem>>>(
+                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
+            else
+                compute_hists_kernel_many_blocks<1><<<grid, threads, smem>>>(
+                    img_block_width, grad, qangle, scale, block_hists, cell_size_x, patch_size, block_patch_size, threads_cell, threads_block, half_cell_size);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
@@ -293,16 +346,16 @@ namespace cv { namespace cuda { namespace device


        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold)
+                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y)
        {
            const int nblocks = 1;

-            int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
+            int block_hist_size = nbins * ncells_block_x * ncells_block_y;
            int nthreads = power_2up(block_hist_size);
            dim3 threads(nthreads, 1, nblocks);

-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-            int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
+            int img_block_height = (height - ncells_block_y * cell_size_y + block_stride_y) / block_stride_y;
            dim3 grid(divUp(img_block_width, nblocks), img_block_height);

            if (nthreads == 32)
@@ -310,7 +363,7 @@ namespace cv { namespace cuda { namespace device
            else if (nthreads == 64)
                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 128)
-                normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
+                normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 256)
                normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
            else if (nthreads == 512)
@@ -365,7 +418,7 @@ namespace cv { namespace cuda { namespace device

       void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                                               int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                               float* coefs, float free_coef, float threshold, float *confidences)
+                                               float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences)
       {
           const int nthreads = 256;
           const int nblocks = 1;
@@ -381,7 +434,7 @@ namespace cv { namespace cuda { namespace device
           cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
                                                                                   cudaFuncCachePreferL1));

-           int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+           int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) /
                                                       block_stride_x;
           compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
                   img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
@@ -427,7 +480,7 @@ namespace cv { namespace cuda { namespace device

        void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                            float* coefs, float free_coef, float threshold, unsigned char* labels)
+                            float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels)
        {
            const int nthreads = 256;
            const int nblocks = 1;
@@ -442,7 +495,7 @@ namespace cv { namespace cuda { namespace device

            cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));

-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
            classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
                img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
                block_hists, coefs, free_coef, threshold, labels);
@@ -477,7 +530,7 @@ namespace cv { namespace cuda { namespace device


        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
-                                    int height, int width, float* block_hists, PtrStepSzf descriptors)
+                                    int height, int width, float* block_hists, int cell_size_x, int ncells_block_x, PtrStepSzf descriptors)
        {
            const int nthreads = 256;

@@ -488,7 +541,7 @@ namespace cv { namespace cuda { namespace device
            dim3 threads(nthreads, 1);
            dim3 grid(img_win_width, img_win_height);

-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
            extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
            cudaSafeCall( cudaGetLastError() );
@@ -525,7 +578,7 @@ namespace cv { namespace cuda { namespace device


        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
                                    PtrStepSzf descriptors)
        {
            const int nthreads = 256;
@@ -537,7 +590,7 @@ namespace cv { namespace cuda { namespace device
            dim3 threads(nthreads, 1);
            dim3 grid(img_win_width, img_win_height);

-            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+            int img_block_width = (width - ncells_block_x * cell_size_x + block_stride_x) / block_stride_x;
            extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
                img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
            cudaSafeCall( cudaGetLastError() );
--- a/modules/cudaobjdetect/src/hog.cpp
+++ b/modules/cudaobjdetect/src/hog.cpp
@@ -51,34 +51,45 @@ Ptr<cuda::HOG> cv::cuda::HOG::create(Size, Size, Size, Size, int) { throw_no_cud

 #else

+/****************************************************************************************\
+      The code below is an implementation of the HOG (Histogram-of-Oriented Gradients)
+      descriptor and object detection, introduced by Navneet Dalal and Bill Triggs.
+
+      The computed feature vectors are compatible with the
+      INRIA Object Detection and Localization Toolkit
+      (http://pascal.inrialpes.fr/soft/olt/)
+\****************************************************************************************/
+
 namespace cv { namespace cuda { namespace device
 {
    namespace hog
    {
        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                              int nblocks_win_x, int nblocks_win_y);
+                              int nblocks_win_x, int nblocks_win_y,
+                              int ncells_block_x, int ncells_block_y);

-        void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                           int height, int width, const cv::cuda::PtrStepSzf& grad,
-                           const cv::cuda::PtrStepSzb& qangle, float sigma, float* block_hists);
+        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
+                           int height, int width, const PtrStepSzf& grad,
+                           const PtrStepSzb& qangle, float sigma, float* block_hists,
+                           int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);

        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold);
+                             int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);

        void classify_hists(int win_height, int win_width, int block_stride_y,
                            int block_stride_x, int win_stride_y, int win_stride_x, int height,
                            int width, float* block_hists, float* coefs, float free_coef,
-                            float threshold, unsigned char* labels);
+                            float threshold, int cell_size_x, int ncells_block_x, unsigned char* labels);

        void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
                           int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                           float* coefs, float free_coef, float threshold, float *confidences);
+                           float* coefs, float free_coef, float threshold, int cell_size_x, int ncells_block_x, float *confidences);

        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
                                    cv::cuda::PtrStepSzf descriptors);
        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
                                    cv::cuda::PtrStepSzf descriptors);

        void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
@@ -167,6 +178,7 @@ namespace
        double scale0_;
        int group_threshold_;
        int descr_format_;
+        Size cells_per_block_;

    private:
        int getTotalHistSize(Size img_size) const;
@@ -197,7 +209,8 @@ namespace
        win_stride_(block_stride),
        scale0_(1.05),
        group_threshold_(2),
-        descr_format_(DESCR_FORMAT_COL_BY_COL)
+        descr_format_(DESCR_FORMAT_COL_BY_COL),
+        cells_per_block_(block_size.width / cell_size.width, block_size.height / cell_size.height)
    {
        CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 &&
                  (win_size.height - block_size.height) % block_stride.height == 0);
@@ -205,12 +218,13 @@ namespace
        CV_Assert(block_size.width % cell_size.width == 0 &&
                  block_size.height % cell_size.height == 0);

-        CV_Assert(block_stride == cell_size);
+        // Navneet Dalal and Bill Triggs. Histograms of oriented gradients for
+        // human detection. In International Conference on Computer Vision and
+        // Pattern Recognition, volume 2, pages 886–893, June 2005
+        // http://lear.inrialpes.fr/people/triggs/pubs/Dalal-cvpr05.pdf (28.07.2015) [Figure 5]
+        CV_Assert(block_stride == (block_size / 2));

-        CV_Assert(cell_size == Size(8, 8));
-
-        Size cells_per_block(block_size.width / cell_size.width, block_size.height / cell_size.height);
-        CV_Assert(cells_per_block == Size(2, 2));
+        CV_Assert(cell_size.width == cell_size.height);
    }

    static int numPartsWithin(int size, int part_size, int stride)
@@ -231,8 +245,7 @@ namespace

    size_t HOG_Impl::getBlockHistogramSize() const
    {
-        Size cells_per_block(block_size_.width / cell_size_.width, block_size_.height / cell_size_.height);
-        return nbins_ * cells_per_block.area();
+        return nbins_ * cells_per_block_.area();
    }

    double HOG_Impl::getWinSigma() const
@@ -313,6 +326,7 @@ namespace
                                detector_.ptr<float>(),
                                (float)free_coef_,
                                (float)hit_threshold_,
+                                cell_size_.width, cells_per_block_.width,
                                labels.ptr());

            Mat labels_host;
@@ -339,6 +353,7 @@ namespace
                                          detector_.ptr<float>(),
                                          (float)free_coef_,
                                          (float)hit_threshold_,
+                                          cell_size_.width, cells_per_block_.width,
                                          labels.ptr<float>());

            Mat labels_host;
@@ -465,6 +480,7 @@ namespace
                                        win_stride_.height, win_stride_.width,
                                        img.rows, img.cols,
                                        block_hists.ptr<float>(),
+                                        cell_size_.width, cells_per_block_.width,
                                        descriptors);
            break;
        case DESCR_FORMAT_COL_BY_COL:
@@ -473,6 +489,7 @@ namespace
                                        win_stride_.height, win_stride_.width,
                                        img.rows, img.cols,
                                        block_hists.ptr<float>(),
+                                        cell_size_.width, cells_per_block_.width,
                                        descriptors);
            break;
        default:
@@ -490,7 +507,7 @@ namespace
    void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists)
    {
        cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
-        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height);
+        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height);

        BufferPool pool(Stream::Null());

@@ -505,13 +522,17 @@ namespace
                           img.rows, img.cols,
                           grad, qangle,
                           (float)getWinSigma(),
-                           block_hists.ptr<float>());
+                           block_hists.ptr<float>(),
+                           cell_size_.width, cell_size_.height,
+                           cells_per_block_.width, cells_per_block_.height);

        hog::normalize_hists(nbins_,
                             block_stride_.width, block_stride_.height,
                             img.rows, img.cols,
                             block_hists.ptr<float>(),
-                             (float)threshold_L2hys_);
+                             (float)threshold_L2hys_,
+                             cell_size_.width, cell_size_.height,
+                             cells_per_block_.width, cells_per_block_.height);
    }

    void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle)