hog

parent 0ddd16cf78
commit 0e339dd137
@@ -42,7 +42,10 @@
 #if !defined CUDA_DISABLER

 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"

 namespace cv { namespace gpu { namespace device
 {
@@ -226,29 +229,30 @@ namespace cv { namespace gpu { namespace device


     template<int size>
-    __device__ float reduce_smem(volatile float* smem)
+    __device__ float reduce_smem(float* smem, float val)
     {
         unsigned int tid = threadIdx.x;
-        float sum = smem[tid];
+        float sum = val;

-        if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
-        if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
-        if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
+        reduce<size>(smem, sum, tid, plus<float>());

-        if (tid < 32)
+        if (size == 32)
         {
-            if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-            if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-            if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-            if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-            if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-            if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
+        #if __CUDA_ARCH__ >= 300
+            return shfl(sum, 0);
+        #else
+            return smem[0];
+        #endif
         }

-        __syncthreads();
-        sum = smem[0];
+    #if __CUDA_ARCH__ >= 300
+        if (threadIdx.x == 0)
+            smem[0] = sum;
+    #endif

-        return sum;
+        __syncthreads();
+
+        return smem[0];
     }

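For reference: the hand-rolled shared-memory tree reduction above is replaced by the reduce<> primitive from opencv2/gpu/device/reduce.hpp, and callers now pass their value directly instead of pre-staging it in shared memory. For a single-warp reduction on sm_30 and newer the result is broadcast with shfl(sum, 0) instead of a shared-memory round trip. A minimal sketch of that shuffle idiom (illustration only, not the OpenCV implementation; warp_reduce_sum is a hypothetical name):

    // Illustration only: a warp-wide sum using shuffle, the idiom behind
    // the shfl(sum, 0) broadcast above (sm_30+).
    __device__ __forceinline__ float warp_reduce_sum(float val)
    {
    #if __CUDA_ARCH__ >= 300
        // Fold the upper half of the warp onto the lower half: 16, 8, 4, 2, 1.
        for (int offset = 16; offset > 0; offset /= 2)
            val += __shfl_down(val, offset);
    #endif
        return val;   // on sm_30+, lane 0 now holds the sum of all 32 lanes
    }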
@@ -272,19 +276,13 @@ namespace cv { namespace gpu { namespace device
         if (threadIdx.x < block_hist_size)
             elem = hist[0];

-        squares[threadIdx.x] = elem * elem;
-
-        __syncthreads();
-        float sum = reduce_smem<nthreads>(squares);
+        float sum = reduce_smem<nthreads>(squares, elem * elem);

         float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
         elem = ::min(elem * scale, threshold);

-        __syncthreads();
-        squares[threadIdx.x] = elem * elem;
+        sum = reduce_smem<nthreads>(squares, elem * elem);

-        __syncthreads();
-        sum = reduce_smem<nthreads>(squares);
         scale = 1.0f / (::sqrtf(sum) + 1e-3f);

         if (threadIdx.x < block_hist_size)
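The math of the block normalization is unchanged by this hunk; only the staging of the squared values and the extra __syncthreads() calls move inside reduce_smem. A hypothetical serial sketch of the L2-Hys scheme the kernel computes per block histogram (assumed layout: hist holds block_hist_size bins, threshold is the clipping value used above; the final write-back is inferred from the surrounding code):

    // Hypothetical serial version of the per-block normalization done in parallel above.
    __device__ void normalize_block_hist_serial(float* hist, int block_hist_size, float threshold)
    {
        float sum = 0.f;
        for (int i = 0; i < block_hist_size; ++i)
            sum += hist[i] * hist[i];

        float scale = 1.f / (::sqrtf(sum) + 0.1f * block_hist_size);

        sum = 0.f;
        for (int i = 0; i < block_hist_size; ++i)
        {
            hist[i] = ::min(hist[i] * scale, threshold);   // clip after the first scaling
            sum += hist[i] * hist[i];
        }

        scale = 1.f / (::sqrtf(sum) + 1e-3f);
        for (int i = 0; i < block_hist_size; ++i)
            hist[i] *= scale;                              // final renormalization
    }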
@@ -355,40 +353,11 @@ namespace cv { namespace gpu { namespace device
         __shared__ float products[nthreads * nblocks];

         const int tid = threadIdx.z * nthreads + threadIdx.x;
-        products[tid] = product;

-        __syncthreads();
-
-        if (nthreads >= 512)
-        {
-            if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
-            __syncthreads();
-        }
-        if (nthreads >= 256)
-        {
-            if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
-            __syncthreads();
-        }
-        if (nthreads >= 128)
-        {
-            if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
-            __syncthreads();
-        }
-
-        if (threadIdx.x < 32)
-        {
-            volatile float* smem = products;
-            if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
-            if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
-            if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
-            if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
-            if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
-            if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
-        }
+        reduce<nthreads>(products, product, tid, plus<float>());

         if (threadIdx.x == 0)
-            confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x]
-                = (float)(product + free_coef);
+            confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;

     }

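This hunk and the next one make the same change: the duplicated inline block reduction over products is replaced by a single call to reduce<nthreads>. A minimal standalone sketch of that calling pattern, under the assumption (consistent with the code above) that reduce<> combines every thread's value and leaves the block-wide result in thread 0's argument; block_sum and the buffer layout are hypothetical:

    #include "opencv2/gpu/device/reduce.hpp"
    #include "opencv2/gpu/device/functional.hpp"

    using namespace cv::gpu::device;

    // Hypothetical kernel: each thread contributes one value, reduce<> sums them
    // across the block, and thread 0 writes the total -- the same pattern used
    // for 'products'/'product' above.
    template <int nthreads>
    __global__ void block_sum(const float* in, float* out)
    {
        __shared__ float smem[nthreads];   // scratch space for the reduction

        float val = in[blockIdx.x * nthreads + threadIdx.x];

        reduce<nthreads>(smem, val, threadIdx.x, plus<float>());

        if (threadIdx.x == 0)
            out[blockIdx.x] = val;         // only thread 0 holds the block-wide sum
    }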
@@ -446,36 +415,8 @@ namespace cv { namespace gpu { namespace device
         __shared__ float products[nthreads * nblocks];

         const int tid = threadIdx.z * nthreads + threadIdx.x;
-        products[tid] = product;

-        __syncthreads();
-
-        if (nthreads >= 512)
-        {
-            if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
-            __syncthreads();
-        }
-        if (nthreads >= 256)
-        {
-            if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128];
-            __syncthreads();
-        }
-        if (nthreads >= 128)
-        {
-            if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64];
-            __syncthreads();
-        }
-
-        if (threadIdx.x < 32)
-        {
-            volatile float* smem = products;
-            if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
-            if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
-            if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
-            if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
-            if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
-            if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
-        }
+        reduce<nthreads>(products, product, tid, plus<float>());

         if (threadIdx.x == 0)
             labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);