optimize hog bin computing

This commit is contained in:
marina.kolpakova 2012-11-28 15:40:00 +04:00
parent ee291a15da
commit 0e1005ca92
3 changed files with 181 additions and 44 deletions

View File

@ -1534,6 +1534,28 @@ class CV_EXPORTS SCascade : public Algorithm
{
public:
enum { GENERIC = 1, SEPARABLE = 2};
class CV_EXPORTS Preprocessor
{
public:
// Appends specified number of HOG first-order features integrals into given vector.
// Param frame is an input 3-channel bgr image.
// Param channels is a GPU matrix of integrals.
// Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution.
virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0;
// Creates a specific preprocessor implementation.
// Param shrinkage is a resizing factor. Resize is applied before the computing integral sum
// Param bins is a number of HOG-like channels.
// Param method is a channel computing method.
static cv::Ptr<Preprocessor> create(const int shrinkage, const int bins, const int method = GENERIC);
protected:
Preprocessor();
};
// Representation of detectors result.
struct CV_EXPORTS Detection
{
@ -1576,9 +1598,6 @@ public:
// Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution
virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
// Preprocesing only
virtual void preprocess(InputArray image, OutputArray channels, Stream& stream = Stream::Null()) const;
// Convert ROI matrix into the suitable for detect method.
// Param roi is an input matrix of the same size as the image.
// There non zero value mean that detector should be executed in this point.

View File

@ -99,6 +99,7 @@ namespace icf {
cudaSafeCall(cudaDeviceSynchronize());
}
template<bool isDefaultNum>
__device__ __forceinline__ int fast_angle_bin(const float& dx, const float& dy)
{
const float angle_quantum = M_PI / 6.f;
@ -110,36 +111,82 @@ namespace icf {
return static_cast<int>(angle * angle_scaling) % 6;
}
template<>
__device__ __forceinline__ int fast_angle_bin<true>(const float& dy, const float& dx)
{
int index = 0;
float max_dot = fabs(dx);
{
const float dot_product = fabs(dx * 0.8660254037844386f + dy * 0.5f);
if(dot_product > max_dot)
{
max_dot = dot_product;
index = 1;
}
}
{
const float dot_product = fabs(dy * 0.8660254037844386f + dx * 0.5f);
if(dot_product > max_dot)
{
max_dot = dot_product;
index = 2;
}
}
{
int i = 3;
float2 bin_vector_i;
bin_vector_i.x = ::cos(i * (M_PI / 6.f));
bin_vector_i.y = ::sin(i * (M_PI / 6.f));
const float dot_product = fabs(dx * bin_vector_i.x + dy * bin_vector_i.y);
if(dot_product > max_dot)
{
max_dot = dot_product;
index = i;
}
}
{
const float dot_product = fabs(dx * (-0.4999999999999998f) + dy * 0.8660254037844387f);
if(dot_product > max_dot)
{
max_dot = dot_product;
index = 4;
}
}
{
const float dot_product = fabs(dx * (-0.8660254037844387f) + dy * 0.49999999999999994f);
if(dot_product > max_dot)
{
max_dot = dot_product;
index = 5;
}
}
return index;
}
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tgray;
__global__ void magnitude_d(PtrStepSzb mag)
template<bool isDefaultNum>
__global__ void gray2hog(PtrStepSzb mag)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const float dx_a = tex2D(tgray, x + 1, y),
dx_b = tex2D(tgray, x - 1, y),
dx = dx_a - dx_b,
const float dx = tex2D(tgray, x + 1, y + 0) - tex2D(tgray, x - 1, y - 0);
const float dy = tex2D(tgray, x + 0, y + 1) - tex2D(tgray, x - 0, y - 1);
dy_a = tex2D(tgray, x, y + 1),
dy_b = tex2D(tgray, x, y - 1),
dy = dy_a - dy_b;
const float magnitude = sqrtf((dx * dx) + (dy * dy)) * (1.0f / sqrtf(2));
const uchar cmag = static_cast<uchar>(magnitude);
const float magnitude_scaling = 1.0f/ sqrtf(2);
const float magnitude = sqrtf((dx * dx) + (dy * dy)) * magnitude_scaling;
const uchar magnitude_u8 = static_cast<uchar>(magnitude);
mag( 480 * 6 + y, x) = magnitude_u8;
int angle_channel_index;
angle_channel_index = fast_angle_bin(dy, dx);
mag( 480 * angle_channel_index + y, x) = magnitude_u8;
mag( 480 * 6 + y, x) = cmag;
mag( 480 * fast_angle_bin<isDefaultNum>(dy, dx) + y, x) = cmag;
}
void magnitude(const PtrStepSzb& gray, PtrStepSzb mag)
void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins)
{
dim3 block(32, 8);
dim3 grid(gray.cols / 32, gray.rows / 8);
@ -147,7 +194,10 @@ namespace icf {
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
cudaSafeCall( cudaBindTexture2D(0, tgray, gray.data, desc, gray.cols, gray.rows, gray.step) );
magnitude_d<<<grid, block>>>(mag);
if (bins == 6)
gray2hog<true><<<grid, block>>>(mag);
else
gray2hog<false><<<grid, block>>>(mag);
cudaSafeCall(cudaDeviceSynchronize());
}

View File

@ -57,6 +57,11 @@ void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_n
void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
cv::gpu::SCascade::Preprocessor::Preprocessor() { throw_nogpu(); }
void cv::gpu::SCascade::Preprocessor::create(const int, const int, const int) { throw_nogpu(); }
#else
#include <icf.hpp>
@ -90,7 +95,7 @@ namespace icf {
PtrStepSzb suppressed, cudaStream_t stream);
void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv);
void magnitude(const PtrStepSzb& gray, PtrStepSzb mag);
void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins);
}
namespace imgproc {
@ -609,34 +614,97 @@ void cv::gpu::SCascade::read(const FileNode& fn)
Algorithm::read(fn);
}
// namespace {
// void bgr2Luv(const cv::gpu::GpuMat& input, cv::gpu::GpuMat& luv /*integral*/)
// {
// cv::gpu::GpuMat bgr;
// cv::gpu::GaussianBlur(input, bgr, cv::Size(3, 3), -1);
// cv::gpu::GpuMat gray, /*luv,*/ shrunk, buffer;
// luv.create(bgr.rows * 10, bgr.cols, CV_8UC1);
// luv.setTo(0);
// cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
// cv::gpu::device::icf::magnitude(gray, luv(cv::Rect(0, 0, bgr.cols, bgr.rows * 7)));
// cv::gpu::GpuMat __luv(luv, cv::Rect(0, bgr.rows * 7, bgr.cols, bgr.rows * 3));
// cv::gpu::device::icf::bgr2Luv(bgr, __luv);
// // cv::gpu::resize(luv, shrunk, cv::Size(), 0.25f, 0.25f, CV_INTER_AREA);
// // cv::gpu::integralBuffered(shrunk, integral, buffer);
// }
// }
namespace {
void bgr2Luv(const cv::gpu::GpuMat& input, cv::gpu::GpuMat& integral)
using cv::InputArray;
using cv::OutputArray;
using cv::gpu::Stream;
using cv::gpu::GpuMat;
struct GenricPreprocessor : public cv::gpu::SCascade::Preprocessor
{
cv::gpu::GpuMat bgr;
cv::gpu::GaussianBlur(input, bgr, cv::Size(3, 3), -1);
GenricPreprocessor(const int s, const int b) : cv::gpu::SCascade::Preprocessor(), shrinkage(s), bins(b) {}
cv::gpu::GpuMat gray, luv, shrunk, buffer;
luv.create(bgr.rows * 10, bgr.cols, CV_8UC1);
luv.setTo(0);
virtual void apply(InputArray /*frame*/, OutputArray /*channels*/, Stream& /*s*/ = Stream::Null())
{
cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
cv::gpu::device::icf::magnitude(gray, luv(cv::Rect(0, 0, bgr.cols, bgr.rows * 7)));
}
cv::gpu::GpuMat __luv(luv, cv::Rect(0, bgr.rows * 7, bgr.cols, bgr.rows * 3));
cv::gpu::device::icf::bgr2Luv(bgr, __luv);
private:
const int shrinkage;
const int bins;
};
cv::gpu::resize(luv, shrunk, cv::Size(), 0.25f, 0.25f, CV_INTER_AREA);
cv::gpu::integralBuffered(shrunk, integral, buffer);
}
}
void cv::gpu::SCascade::preprocess(InputArray _bgr, OutputArray _channels, Stream& stream) const
inline void setZero(cv::gpu::GpuMat& m, Stream& s)
{
CV_Assert(fields);
(void)stream;
const GpuMat bgr = _bgr.getGpuMat(), channels = _channels.getGpuMat();
if (s)
s.enqueueMemSet(m, 0);
else
m.setTo(0);
}
struct SeparablePreprocessor : public cv::gpu::SCascade::Preprocessor
{
SeparablePreprocessor(const int s, const int b) : cv::gpu::SCascade::Preprocessor(), shrinkage(s), bins(b) {}
virtual void apply(InputArray _frame, OutputArray _channels, Stream& s = Stream::Null())
{
const GpuMat frame = _frame.getGpuMat();
cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
_channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
GpuMat channels = _channels.getGpuMat();
setZero(channels, s);
cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
cv::gpu::device::icf::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3));
cv::gpu::device::icf::bgr2Luv(bgr, luv);
}
private:
const int shrinkage;
const int bins;
GpuMat bgr;
GpuMat gray;
};
}
cv::gpu::SCascade::Preprocessor::Preprocessor(){}
cv::Ptr<cv::gpu::SCascade::Preprocessor> cv::gpu::SCascade::Preprocessor::create(const int s, const int b, const int m)
{
CV_Assert(m == SEPARABLE || m == GENERIC);
if (m == GENERIC)
return cv::Ptr<cv::gpu::SCascade::Preprocessor>(new GenricPreprocessor(s, b));
return cv::Ptr<cv::gpu::SCascade::Preprocessor>(new SeparablePreprocessor(s, b));
}
#endif