integrated patch: HOG confidence calculation. Thanks, Wongun.

2012-07-25 15:26:26 +04:00
parent 4fa282e1a9
commit e1e0c46639
5 changed files with 467 additions and 1 deletions
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
@@ -326,6 +326,97 @@ namespace cv { namespace gpu { namespace device
        //  Linear SVM based classification
        //

+       // return confidence values not just positive location
+       template <int nthreads, // Number of threads per one histogram block
+                           int nblocks> // Number of histogram block processed by single GPU thread block
+       __global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
+                                                                                                           const int win_block_stride_x, const int win_block_stride_y,
+                                                                                                           const float* block_hists, const float* coefs,
+                                                                                                           float free_coef, float threshold, float* confidences)
+       {
+               const int win_x = threadIdx.z;
+               if (blockIdx.x * blockDim.z + win_x >= img_win_width)
+                       return;
+
+               const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
+                                                                                    blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
+                                                                                   cblock_hist_size;
+
+               float product = 0.f;
+               for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
+               {
+                       int offset_y = i / cdescr_width;
+                       int offset_x = i - offset_y * cdescr_width;
+                       product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+               }
+
+               __shared__ float products[nthreads * nblocks];
+
+               const int tid = threadIdx.z * nthreads + threadIdx.x;
+               products[tid] = product;
+
+               __syncthreads();
+
+               if (nthreads >= 512)
+               {
+                       if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
+                       __syncthreads();
+               }
+               if (nthreads >= 256)
+               {
+                       if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
+                       __syncthreads();
+               }
+               if (nthreads >= 128)
+               {
+                       if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
+                       __syncthreads();
+               }
+
+               if (threadIdx.x < 32)
+               {
+                       volatile float* smem = products;
+                       if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
+                       if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
+                       if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
+                       if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
+                       if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
+                       if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
+               }
+
+               if (threadIdx.x == 0)
+                       confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
+                               = (float)(product + free_coef);
+
+       }
+
+       void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
+                                               int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                               float* coefs, float free_coef, float threshold, float *confidences)
+       {
+               const int nthreads = 256;
+               const int nblocks = 1;
+
+               int win_block_stride_x = win_stride_x / block_stride_x;
+               int win_block_stride_y = win_stride_y / block_stride_y;
+               int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
+               int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
+
+               dim3 threads(nthreads, 1, nblocks);
+               dim3 grid(divUp(img_win_width, nblocks), img_win_height);
+
+               cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
+                                                                                       cudaFuncCachePreferL1));
+
+               int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+                                                           block_stride_x;
+               compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
+                       img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
+                       block_hists, coefs, free_coef, threshold, confidences);
+               cudaSafeCall(cudaThreadSynchronize());
+       }
+
+

        template <int nthreads, // Number of threads per one histogram block
                  int nblocks> // Number of histogram block processed by single GPU thread block