From 0e339dd13741a52e29cf65ee4e155e114218af4b Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 12 Nov 2012 13:19:48 +0400
Subject: [PATCH] hog

---
 modules/gpu/src/cuda/hog.cu | 179 ++++++++++++------------------------
 1 file changed, 60 insertions(+), 119 deletions(-)

diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu
index 953fdec1d..6a7e927d1 100644
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
@@ -42,7 +42,10 @@
 
 #if !defined CUDA_DISABLER
 
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
@@ -226,29 +229,30 @@
 
 
     template <int size>
-    __device__ float reduce_smem(volatile float* smem)
+    __device__ float reduce_smem(float* smem, float val)
     {
         unsigned int tid = threadIdx.x;
-        float sum = smem[tid];
+        float sum = val;
 
-        if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
-        if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
-        if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
+        reduce<size>(smem, sum, tid, plus<float>());
 
-        if (tid < 32)
+        if (size == 32)
         {
-            if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-            if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-            if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-            if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-            if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-            if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
+        #if __CUDA_ARCH__ >= 300
+            return shfl(sum, 0);
+        #else
+            return smem[0];
+        #endif
         }
 
-        __syncthreads();
-        sum = smem[0];
+    #if __CUDA_ARCH__ >= 300
+        if (threadIdx.x == 0)
+            smem[0] = sum;
+    #endif
 
-        return sum;
+        __syncthreads();
+
+        return smem[0];
     }
 
 
@@ -272,19 +276,13 @@
         if (threadIdx.x < block_hist_size)
             elem = hist[0];
 
-        squares[threadIdx.x] = elem * elem;
-
-        __syncthreads();
-        float sum = reduce_smem<nthreads>(squares);
+        float sum = reduce_smem<nthreads>(squares, elem * elem);
 
         float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
         elem = ::min(elem * scale, threshold);
 
-        __syncthreads();
-        squares[threadIdx.x] = elem * elem;
+        sum = reduce_smem<nthreads>(squares, elem * elem);
 
-        __syncthreads();
-        sum = reduce_smem<nthreads>(squares);
         scale = 1.0f / (::sqrtf(sum) + 1e-3f);
 
         if (threadIdx.x < block_hist_size)
@@ -330,65 +328,36 @@
 
     // return confidence values not just positive location
     template <int nthreads, // Number of threads per one histogram block
-             int nblocks> // Number of histogram block processed by single GPU thread block
+              int nblocks> // Number of histogram block processed by single GPU thread block
     __global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
                                                                  const int win_block_stride_x, const int win_block_stride_y,
                                                                  const float* block_hists, const float* coefs,
                                                                  float free_coef, float threshold, float* confidences)
     {
-       const int win_x = threadIdx.z;
-       if (blockIdx.x * blockDim.z + win_x >= img_win_width)
-           return;
+        const int win_x = threadIdx.z;
+        if (blockIdx.x * blockDim.z + win_x >= img_win_width)
+            return;
 
-       const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
-                                          blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
-                                          cblock_hist_size;
+        const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
+                                           blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
+                                           cblock_hist_size;
 
-       float product = 0.f;
-       for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
-       {
-           int offset_y = i / cdescr_width;
-           int offset_x = i - offset_y * cdescr_width;
-           product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
-       }
+        float product = 0.f;
+        for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
+        {
+            int offset_y = i / cdescr_width;
+            int offset_x = i - offset_y * cdescr_width;
+            product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+        }
 
-       __shared__ float products[nthreads * nblocks];
+        __shared__ float products[nthreads * nblocks];
 
-       const int tid = threadIdx.z * nthreads + threadIdx.x;
-       products[tid] = product;
+        const int tid = threadIdx.z * nthreads + threadIdx.x;
 
-       __syncthreads();
+        reduce<nthreads>(products, product, tid, plus<float>());
 
-       if (nthreads >= 512)
-       {
-           if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
-           __syncthreads();
-       }
-       if (nthreads >= 256)
-       {
-           if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
-           __syncthreads();
-       }
-       if (nthreads >= 128)
-       {
-           if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
-           __syncthreads();
-       }
-
-       if (threadIdx.x < 32)
-       {
-           volatile float* smem = products;
-           if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
-           if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
-           if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
-           if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
-           if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
-           if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
-       }
-
-       if (threadIdx.x == 0)
-           confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
-               = (float)(product + free_coef);
+        if (threadIdx.x == 0)
+            confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
     }
 
 
@@ -396,32 +365,32 @@
                                   int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
                                   float* coefs, float free_coef, float threshold, float *confidences)
     {
-       const int nthreads = 256;
-       const int nblocks = 1;
+        const int nthreads = 256;
+        const int nblocks = 1;
 
-       int win_block_stride_x = win_stride_x / block_stride_x;
-       int win_block_stride_y = win_stride_y / block_stride_y;
-       int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
-       int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
+        int win_block_stride_x = win_stride_x / block_stride_x;
+        int win_block_stride_y = win_stride_y / block_stride_y;
+        int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
+        int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
 
-       dim3 threads(nthreads, 1, nblocks);
-       dim3 grid(divUp(img_win_width, nblocks), img_win_height);
+        dim3 threads(nthreads, 1, nblocks);
+        dim3 grid(divUp(img_win_width, nblocks), img_win_height);
 
-       cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
-                                           cudaFuncCachePreferL1));
+        cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
+                                            cudaFuncCachePreferL1));
 
-       int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
-                             block_stride_x;
-       compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
-           img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
-           block_hists, coefs, free_coef, threshold, confidences);
-       cudaSafeCall(cudaThreadSynchronize());
+        int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+                              block_stride_x;
+        compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
+            img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
+            block_hists, coefs, free_coef, threshold, confidences);
+        cudaSafeCall(cudaThreadSynchronize());
     }
 
 
     template <int nthreads, // Number of threads per one histogram block
-             int nblocks> // Number of histogram block processed by single GPU thread block
+              int nblocks> // Number of histogram block processed by single GPU thread block
     __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
                                                        const int win_block_stride_x, const int win_block_stride_y,
                                                        const float* block_hists, const float* coefs,
@@ -446,36 +415,8 @@
         __shared__ float products[nthreads * nblocks];
 
         const int tid = threadIdx.z * nthreads + threadIdx.x;
-        products[tid] = product;
 
-        __syncthreads();
-
-        if (nthreads >= 512)
-        {
-            if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
-            __syncthreads();
-        }
-        if (nthreads >= 256)
-        {
-            if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
-            __syncthreads();
-        }
-        if (nthreads >= 128)
-        {
-            if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
-            __syncthreads();
-        }
-
-        if (threadIdx.x < 32)
-        {
-            volatile float* smem = products;
-            if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
-            if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
-            if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
-            if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
-            if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
-            if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
-        }
+        reduce<nthreads>(products, product, tid, plus<float>());
 
         if (threadIdx.x == 0)
            labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
@@ -868,4 +809,4 @@
 }}} // namespace cv { namespace gpu { namespace device
 
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
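
Reviewer note (not part of the patch): the change above replaces the hand-unrolled shared-memory reductions in hog.cu with the reduce<N>() device utility, which finishes the reduction with warp shuffle instructions on sm_30+ GPUs and falls back to shared memory on older hardware. Below is a minimal, self-contained sketch of that reduction pattern, included only for reference. It is not OpenCV code: the names block_reduce_sum and sum_kernel are illustrative, and it uses the pre-CUDA 9 __shfl_down intrinsic to match toolkits current at the time of this patch (newer toolkits would use __shfl_down_sync).

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Block-wide sum: a shared-memory tree narrows the block to 32 partial sums,
// then on sm_30+ warp shuffles finish the job register-to-register, mirroring
// the reduce<N>/shfl path the patch switches to.
template <int BLOCK_SIZE>
__device__ float block_reduce_sum(float val, volatile float* smem)
{
    const unsigned int tid = threadIdx.x;

    smem[tid] = val;
    __syncthreads();

    // Tree reduction in shared memory until one warp's worth of partials remains.
    for (unsigned int s = BLOCK_SIZE / 2; s >= 32; s >>= 1)
    {
        if (tid < s)
            smem[tid] = val = val + smem[tid + s];
        __syncthreads();
    }

    if (tid < 32)
    {
        val = smem[tid];
    #if __CUDA_ARCH__ >= 300
        // Kepler and newer: combine the last 32 values with warp shuffles.
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down(val, offset);
    #else
        // Older GPUs: finish in shared memory; a single warp runs in lock-step here.
        if (tid < 16) smem[tid] = val = val + smem[tid + 16];
        if (tid <  8) smem[tid] = val = val + smem[tid +  8];
        if (tid <  4) smem[tid] = val = val + smem[tid +  4];
        if (tid <  2) smem[tid] = val = val + smem[tid +  2];
        if (tid <  1) smem[tid] = val = val + smem[tid +  1];
    #endif
        if (tid == 0)
            smem[0] = val;
    }
    __syncthreads();

    return smem[0]; // every thread of the block gets the total
}

__global__ void sum_kernel(const float* in, float* block_sums, int n)
{
    __shared__ float smem[256];
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const float v = (i < n) ? in[i] : 0.f;

    const float total = block_reduce_sum<256>(v, smem);

    if (threadIdx.x == 0)
        block_sums[blockIdx.x] = total; // one partial sum per block, summed on the host
}

int main()
{
    const int n = 1 << 20;
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;

    std::vector<float> h_in(n, 1.f);
    std::vector<float> h_partial(blocks);

    float *d_in = 0, *d_partial = 0;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_partial, blocks * sizeof(float));
    cudaMemcpy(d_in, h_in.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    sum_kernel<<<blocks, threads>>>(d_in, d_partial, n);
    cudaMemcpy(h_partial.data(), d_partial, blocks * sizeof(float), cudaMemcpyDeviceToHost);

    double total = 0.0;
    for (int b = 0; b < blocks; ++b)
        total += h_partial[b];
    printf("sum = %.0f (expected %d)\n", total, n);

    cudaFree(d_in);
    cudaFree(d_partial);
    return 0;
}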