commit 0e339dd137
parent 0ddd16cf78
Author: Vladislav Vinogradov
Date:   2012-11-12 13:19:48 +04:00

@@ -42,7 +42,10 @@
 #if !defined CUDA_DISABLER

-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"

 namespace cv { namespace gpu { namespace device
 {
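
The new headers pull in the device-layer primitives the rewrite below relies on: reduce.hpp supplies the generic block reduction reduce<N>(...), functional.hpp the plus<> functor, and warp_shuffle.hpp a shfl() wrapper over the Kepler warp-shuffle intrinsic. A minimal sketch of what such a wrapper can look like (illustrative only, not the exact OpenCV definition):

    // Hedged sketch of a shfl() wrapper in the spirit of warp_shuffle.hpp.
    __device__ __forceinline__ float shfl(float val, int srcLane, int width = 32)
    {
    #if __CUDA_ARCH__ >= 300
        return __shfl(val, srcLane, width); // broadcast from srcLane within the warp
    #else
        return val;                         // pre-sm_30: no shuffle; callers use shared memory instead
    #endif
    }
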
@@ -226,29 +229,30 @@ namespace cv { namespace gpu { namespace device
         template<int size>
-        __device__ float reduce_smem(volatile float* smem)
+        __device__ float reduce_smem(float* smem, float val)
         {
             unsigned int tid = threadIdx.x;
-            float sum = smem[tid];
+            float sum = val;

-            if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
-            if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
-            if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
+            reduce<size>(smem, sum, tid, plus<float>());

-            if (tid < 32)
+            if (size == 32)
             {
-                if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-                if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-                if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-                if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-                if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-                if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
+            #if __CUDA_ARCH__ >= 300
+                return shfl(sum, 0);
+            #else
+                return smem[0];
+            #endif
             }

-            __syncthreads();
-            sum = smem[0];
+        #if __CUDA_ARCH__ >= 300
+            if (threadIdx.x == 0)
+                smem[0] = sum;
+        #endif

-            return sum;
+            __syncthreads();
+
+            return smem[0];
         }
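
For readers unfamiliar with the pattern, here is a self-contained sketch of a shuffle-based block sum in the same spirit as the reduce<size> call above. The names warp_reduce_sum and block_reduce_sum are illustrative, and __shfl_down is the pre-CUDA-9 intrinsic appropriate to this code's era:

    // Hedged sketch: block-wide sum using warp shuffles (assumes sm_30+,
    // blockDim.x a multiple of 32, smem holding blockDim.x / 32 floats).
    __device__ float warp_reduce_sum(float val)
    {
        // Each step folds the upper half of the warp onto the lower half;
        // after five steps lane 0 holds the sum of all 32 lanes.
        for (int delta = 16; delta > 0; delta /= 2)
            val += __shfl_down(val, delta);
        return val;
    }

    __device__ float block_reduce_sum(float val, float* smem)
    {
        val = warp_reduce_sum(val);

        if ((threadIdx.x & 31) == 0)      // lane 0 of each warp
            smem[threadIdx.x >> 5] = val; // publish one partial per warp
        __syncthreads();

        if (threadIdx.x < 32)             // first warp folds the per-warp partials
        {
            val = (threadIdx.x < blockDim.x / 32) ? smem[threadIdx.x] : 0.0f;
            val = warp_reduce_sum(val);
        }
        return val;                       // meaningful in thread 0 only
    }
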
@@ -272,19 +276,13 @@ namespace cv { namespace gpu { namespace device
             if (threadIdx.x < block_hist_size)
                 elem = hist[0];

-            squares[threadIdx.x] = elem * elem;
-
-            __syncthreads();
-            float sum = reduce_smem<nthreads>(squares);
+            float sum = reduce_smem<nthreads>(squares, elem * elem);

             float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
             elem = ::min(elem * scale, threshold);

-            __syncthreads();
-            squares[threadIdx.x] = elem * elem;
-
-            __syncthreads();
-            sum = reduce_smem<nthreads>(squares);
+            sum = reduce_smem<nthreads>(squares, elem * elem);

             scale = 1.0f / (::sqrtf(sum) + 1e-3f);

             if (threadIdx.x < block_hist_size)
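
The two reduce_smem passes implement the Dalal-Triggs L2-Hys block normalization: an epsilon-regularized L2 norm, clipping at threshold, then renormalization. A hedged host-side equivalent, with an illustrative function name:

    #include <algorithm>
    #include <cmath>

    // Illustrative scalar version of the two-pass normalization above.
    void l2hys_normalize(float* hist, int n, float threshold)
    {
        float sum = 0.f;
        for (int i = 0; i < n; ++i)
            sum += hist[i] * hist[i];

        float scale = 1.0f / (std::sqrt(sum) + 0.1f * n);   // first L2 pass
        for (int i = 0; i < n; ++i)
            hist[i] = std::min(hist[i] * scale, threshold); // clip (the "Hys" step)

        sum = 0.f;
        for (int i = 0; i < n; ++i)
            sum += hist[i] * hist[i];

        scale = 1.0f / (std::sqrt(sum) + 1e-3f);            // renormalize after clipping
        for (int i = 0; i < n; ++i)
            hist[i] *= scale;
    }
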
@@ -330,65 +328,36 @@ namespace cv { namespace gpu { namespace device
     // return confidence values not just positive location
     template <int nthreads, // Number of threads per one histogram block
               int nblocks>  // Number of histogram block processed by single GPU thread block
     __global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
                                                                  const int win_block_stride_x, const int win_block_stride_y,
                                                                  const float* block_hists, const float* coefs,
                                                                  float free_coef, float threshold, float* confidences)
     {
         const int win_x = threadIdx.z;
         if (blockIdx.x * blockDim.z + win_x >= img_win_width)
             return;

         const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
                                            blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
                                           cblock_hist_size;

         float product = 0.f;
         for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
         {
             int offset_y = i / cdescr_width;
             int offset_x = i - offset_y * cdescr_width;
             product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
         }

         __shared__ float products[nthreads * nblocks];

         const int tid = threadIdx.z * nthreads + threadIdx.x;
-        products[tid] = product;

-        __syncthreads();
+        reduce<nthreads>(products, product, tid, plus<float>());

-        if (nthreads >= 512)
-        {
-            if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
-            __syncthreads();
-        }
-        if (nthreads >= 256)
-        {
-            if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
-            __syncthreads();
-        }
-        if (nthreads >= 128)
-        {
-            if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
-            __syncthreads();
-        }
-
-        if (threadIdx.x < 32)
-        {
-            volatile float* smem = products;
-            if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
-            if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
-            if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
-            if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
-            if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
-            if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
-        }
-
         if (threadIdx.x == 0)
-            confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
-                = (float)(product + free_coef);
+            confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
     }
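
Per detection window, the strided loop plus reduction above evaluates a linear SVM decision value, score = dot(coefs, descriptor) + free_coef; this confidence variant stores the raw value rather than thresholding it. A scalar sketch of the same computation (illustrative name, not the device API):

    // Illustrative scalar equivalent of one window's confidence.
    float window_confidence(const float* coefs, const float* descriptor,
                            int descr_size, float free_coef)
    {
        float score = 0.f;
        for (int i = 0; i < descr_size; ++i)
            score += coefs[i] * descriptor[i]; // dot(w, x), split over nthreads above
        return score + free_coef;              // plus the SVM bias
    }
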
@@ -396,32 +365,32 @@ namespace cv { namespace gpu { namespace device
                                      int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
                                      float* coefs, float free_coef, float threshold, float *confidences)
     {
         const int nthreads = 256;
         const int nblocks = 1;

         int win_block_stride_x = win_stride_x / block_stride_x;
         int win_block_stride_y = win_stride_y / block_stride_y;
         int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
         int img_win_height = (height - win_height + win_stride_y) / win_stride_y;

         dim3 threads(nthreads, 1, nblocks);
         dim3 grid(divUp(img_win_width, nblocks), img_win_height);

         cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
                                             cudaFuncCachePreferL1));

         int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
                               block_stride_x;
         compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
             img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
             block_hists, coefs, free_coef, threshold, confidences);
         cudaSafeCall(cudaThreadSynchronize());
     }

     template <int nthreads, // Number of threads per one histogram block
               int nblocks>  // Number of histogram block processed by single GPU thread block
     __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
                                                        const int win_block_stride_x, const int win_block_stride_y,
                                                        const float* block_hists, const float* coefs,
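
For concreteness, the launcher above packs nblocks detection windows per thread block along the z dimension and tiles the window grid with divUp. A self-contained sizing sketch with hypothetical numbers (a 1920x1080 frame, 64x128 window, 8x8 window stride; none of these figures come from the patch itself):

    #include <cuda_runtime.h>

    // Same round-up division as the divUp used in the launcher above.
    inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    int main()
    {
        const int nthreads = 256, nblocks = 1;
        int img_win_width  = (1920 - 64  + 8) / 8;  // = 233 windows per row
        int img_win_height = (1080 - 128 + 8) / 8;  // = 120 window rows

        dim3 threads(nthreads, 1, nblocks);         // 256 threads score one window
        dim3 grid(divUp(img_win_width, nblocks), img_win_height); // 233 x 120 blocks
        return 0;
    }
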
@@ -446,36 +415,8 @@ namespace cv { namespace gpu { namespace device
         __shared__ float products[nthreads * nblocks];

         const int tid = threadIdx.z * nthreads + threadIdx.x;
-        products[tid] = product;

-        __syncthreads();
+        reduce<nthreads>(products, product, tid, plus<float>());

-        if (nthreads >= 512)
-        {
-            if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
-            __syncthreads();
-        }
-        if (nthreads >= 256)
-        {
-            if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
-            __syncthreads();
-        }
-        if (nthreads >= 128)
-        {
-            if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
-            __syncthreads();
-        }
-
-        if (threadIdx.x < 32)
-        {
-            volatile float* smem = products;
-            if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
-            if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
-            if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
-            if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
-            if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
-            if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
-        }
-
         if (threadIdx.x == 0)
             labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
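
Finally, the plus<float>() argument threaded through every reduce<> call is a device-side binary functor from functional.hpp. Its shape is essentially that of std::plus; this is a sketch, not the exact OpenCV definition:

    // Hedged sketch of the functor's shape.
    template <typename T>
    struct plus
    {
        __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
    };

    // reduce<nthreads> combines per-thread partials with this functor,
    // leaving the block-wide sum in thread 0's 'product'.
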