optimize hog

This commit is contained in:
yao 2013-06-19 11:20:45 +08:00
parent 843094a07f
commit 26c246140a
2 changed files with 711 additions and 325 deletions

View File

@ -48,13 +48,107 @@ using namespace cv;
using namespace cv::ocl;
using namespace std;
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
#define NTHREADS 256
static oclMat gauss_w_lut;
static bool hog_device_cpu;
/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
static const float gaussian_interp_lut[] =
{
/* gaussian lut */
0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f,
0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f,
0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f,
0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f,
0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f,
0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f,
0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f,
0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f,
0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f,
0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f,
0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f,
0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f,
0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f,
0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f,
0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f,
0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f,
0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f,
0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f,
0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f,
0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f,
0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f,
0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f,
0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f,
0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f,
0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f,
0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f,
0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f,
0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f,
0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f,
0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f,
0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f,
/* interp_weight lut */
0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f,
0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f,
0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f,
0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f,
0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f,
0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f,
0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f,
0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f,
0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f,
0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f,
0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f,
0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f,
0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f,
0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f,
0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f,
0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f,
0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f,
0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f,
0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f,
0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f,
0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f,
0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f,
0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f,
0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f,
0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f,
0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f,
0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f,
0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f,
0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f,
0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f,
0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
};
namespace cv
{
namespace ocl
@ -78,38 +172,43 @@ namespace cv
int cnblocks_win_x;
int cnblocks_win_y;
int cblock_hist_size;
int cblock_hist_size_2up;
int cdescr_size;
int cdescr_width;
int cdescr_height;
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y);
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
int height, int width, const cv::ocl::oclMat &grad,
const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists);
int height, int width, float sigma, const cv::ocl::oclMat &grad,
const cv::ocl::oclMat &qangle,
const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, cv::ocl::oclMat &block_hists, float threshold);
int height, int width, cv::ocl::oclMat &block_hists,
float threshold);
void classify_hists(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x, int height,
int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, const cv::ocl::oclMat &block_hists,
const cv::ocl::oclMat &coefs, float free_coef,
float threshold, cv::ocl::oclMat &labels);
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, const cv::ocl::oclMat &block_hists,
cv::ocl::oclMat &descriptors);
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, const cv::ocl::oclMat &block_hists,
cv::ocl::oclMat &descriptors);
void compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
float angle_scale, cv::ocl::oclMat &grad,
cv::ocl::oclMat &qangle, bool correct_gamma);
void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
void resize( const oclMat &src, oclMat &dst, const Size sz);
float angle_scale, cv::ocl::oclMat &grad,
cv::ocl::oclMat &qangle, bool correct_gamma);
}
}
}
@ -117,8 +216,14 @@ namespace cv
using namespace ::cv::ocl::device;
cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
static inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_,
Size cell_size_, int nbins_, double win_sigma_,
double threshold_L2hys_, bool gamma_correction_, int nlevels_)
: win_size(win_size_),
block_size(block_size_),
block_stride(block_stride_),
@ -132,19 +237,27 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
CV_Assert((win_size.width - block_size.width ) % block_stride.width == 0 &&
(win_size.height - block_size.height) % block_stride.height == 0);
CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
CV_Assert(block_size.width % cell_size.width == 0 &&
block_size.height % cell_size.height == 0);
CV_Assert(block_stride == cell_size);
CV_Assert(cell_size == Size(8, 8));
Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
Size cells_per_block(block_size.width / cell_size.width,
block_size.height / cell_size.height);
CV_Assert(cells_per_block == Size(2, 2));
cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
hog::set_up_constants(nbins, block_stride.width, block_stride.height,
blocks_per_win.width, blocks_per_win.height);
effect_size = Size(0, 0);
if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
hog_device_cpu = true;
else
hog_device_cpu = false;
}
size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
@ -154,7 +267,8 @@ size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
size_t cv::ocl::HOGDescriptor::getBlockHistogramSize() const
{
Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
Size cells_per_block = Size(block_size.width / cell_size.width,
block_size.height / cell_size.height);
return (size_t)(nbins * cells_per_block.area());
}
@ -167,7 +281,8 @@ bool cv::ocl::HOGDescriptor::checkDetectorSize() const
{
size_t detector_size = detector.rows * detector.cols;
size_t descriptor_size = getDescriptorSize();
return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
return detector_size == 0 || detector_size == descriptor_size ||
detector_size == descriptor_size + 1;
}
void cv::ocl::HOGDescriptor::setSVMDetector(const vector<float> &_detector)
@ -207,10 +322,16 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
const size_t block_hist_size = getBlockHistogramSize();
const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
block_hists.create(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F);
block_hists.create(1,
static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
labels.create(1, wins_per_img.area(), CV_8U);
vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut +
sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
Mat m_lut(v_lut);
gauss_w_lut.upload(m_lut.reshape(1,1));
}
void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
@ -221,29 +342,34 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc
switch (img.type())
{
case CV_8UC1:
hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img,
angleScale, grad, qangle, gamma_correction);
break;
case CV_8UC4:
hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img,
angleScale, grad, qangle, gamma_correction);
break;
}
}
void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
{
computeGradient(img, grad, qangle);
computeGradient(img, this->grad, this->qangle);
hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
grad, qangle, (float)getWinSigma(), block_hists);
hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
block_hists, (float)threshold_L2hys);
hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
effect_size.width, block_hists, (float)threshold_L2hys);
}
void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, oclMat &descriptors, int descr_format)
void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
oclMat &descriptors, int descr_format)
{
CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
CV_Assert(win_stride.width % block_stride.width == 0 &&
win_stride.height % block_stride.height == 0);
init_buffer(img, win_stride);
@ -253,17 +379,20 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
Size wins_per_img = numPartsWithin(effect_size, win_size, win_stride);
descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
descriptors.create(wins_per_img.area(),
static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
switch (descr_format)
{
case DESCR_FORMAT_ROW_BY_ROW:
hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
hog::extract_descrs_by_rows(win_size.height, win_size.width,
block_stride.height, block_stride.width, win_stride.height, win_stride.width,
effect_size.height, effect_size.width, block_hists, descriptors);
break;
case DESCR_FORMAT_COL_BY_COL:
hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
hog::extract_descrs_by_cols(win_size.height, win_size.width,
block_stride.height, block_stride.width, win_stride.height, win_stride.width,
effect_size.height, effect_size.width, block_hists, descriptors);
break;
default:
CV_Error(CV_StsBadArg, "Unknown descriptor format");
@ -271,7 +400,8 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
}
void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, double hit_threshold, Size win_stride, Size padding)
void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits,
double hit_threshold, Size win_stride, Size padding)
{
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
CV_Assert(padding == Size(0, 0));
@ -283,14 +413,16 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
if (win_stride == Size())
win_stride = block_stride;
else
CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
CV_Assert(win_stride.width % block_stride.width == 0 &&
win_stride.height % block_stride.height == 0);
init_buffer(img, win_stride);
computeBlockHistograms(img);
hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists,
detector, (float)free_coef, (float)hit_threshold, labels);
hog::classify_hists(win_size.height, win_size.width, block_stride.height,
block_stride.width, win_stride.height, win_stride.width,
effect_size.height, effect_size.width, block_hists, detector,
(float)free_coef, (float)hit_threshold, labels);
labels.download(labels_host);
unsigned char *vec = labels_host.ptr();
@ -306,8 +438,9 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations, double hit_threshold,
Size win_stride, Size padding, double scale0, int group_threshold)
void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations,
double hit_threshold, Size win_stride, Size padding,
double scale0, int group_threshold)
{
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
CV_Assert(scale0 > 1);
@ -333,7 +466,8 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
if (win_stride == Size())
win_stride = block_stride;
else
CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
CV_Assert(win_stride.width % block_stride.width == 0 &&
win_stride.height % block_stride.height == 0);
init_buffer(img, win_stride);
image_scale.create(img.size(), img.type());
@ -347,16 +481,18 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
}
else
{
hog::resize( img, image_scale, effect_size);
resize(img, image_scale, effect_size);
detect(image_scale, locations, hit_threshold, win_stride, padding);
}
Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
Size scaled_win_size(cvRound(win_size.width * scale),
cvRound(win_size.height * scale));
for (size_t j = 0; j < locations.size(); j++)
all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, scaled_win_size));
all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale,
scaled_win_size));
}
found_locations.assign(all_candidates.begin(), all_candidates.end());
groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
groupRectangles(found_locations, group_threshold, 0.2);
}
int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
@ -364,9 +500,11 @@ int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
return (size - part_size + stride) / stride;
}
cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size,
cv::Size stride)
{
return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
return Size(numPartsWithin(size.width, part_size.width, stride.width),
numPartsWithin(size.height, part_size.height, stride.height));
}
std::vector<float> cv::ocl::HOGDescriptor::getDefaultPeopleDetector()
@ -1547,7 +1685,8 @@ static int power_2up(unsigned int n)
return -1; // Input is too big
}
void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int block_stride_y,
void cv::ocl::device::hog::set_up_constants(int nbins,
int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y)
{
cnbins = nbins;
@ -1559,53 +1698,32 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
cblock_hist_size = block_hist_size;
int block_hist_size_2up = power_2up(block_hist_size);
cblock_hist_size_2up = block_hist_size_2up;
int descr_width = nblocks_win_x * block_hist_size;
cdescr_width = descr_width;
cdescr_height = nblocks_win_y;
int descr_size = descr_width * nblocks_win_y;
cdescr_size = descr_size;
}
static inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args)
{
cl_kernel kernel = openCLGetKernelFromSource(clCxt, source, kernelName);
size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
openCLSafeCall(clReleaseKernel(kernel));
if (wave_size <= 16)
{
char build_options[64];
sprintf(build_options, (wave_size == 16) ? "-D WAVE_SIZE_16" : "-D WAVE_SIZE_1");
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
}
else
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1);
}
void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const cv::ocl::oclMat &grad,
const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists)
void cv::ocl::device::hog::compute_hists(int nbins,
int block_stride_x, int block_stride_y,
int height, int width, float sigma,
const cv::ocl::oclMat &grad,
const cv::ocl::oclMat &qangle,
const cv::ocl::oclMat &gauss_w_lut,
cv::ocl::oclMat &block_hists)
{
Context *clCxt = Context::getContext();
string kernelName = "compute_hists_kernel";
vector< pair<size_t, const void *> > args;
string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" :
"compute_hists_kernel";
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
/ block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
/ block_stride_y;
int blocks_total = img_block_width * img_block_height;
int blocks_in_group = 4;
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
size_t globalThreads[3] = { divUp(blocks_total, blocks_in_group) * localThreads[0], 2, 1 };
int grad_quadstep = grad.step >> 2;
int qangle_step = qangle.step;
@ -1613,6 +1731,11 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
// Precompute gaussian spatial window parameter
float scale = 1.f / (2.f * sigma * sigma);
int blocks_in_group = 4;
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
size_t globalThreads[3] = {
divUp(img_block_width * img_block_height, blocks_in_group) * localThreads[0], 2, 1 };
int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
int smem = (hists_size + final_hists_size) * blocks_in_group;
@ -1628,62 +1751,120 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
if (kernelName.compare("compute_hists_lut_kernel") == 0)
args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
else
args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( smem, (void *)NULL));
openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1);
}
void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, cv::ocl::oclMat &block_hists, float threshold)
void cv::ocl::device::hog::normalize_hists(int nbins,
int block_stride_x, int block_stride_y,
int height, int width,
cv::ocl::oclMat &block_hists,
float threshold)
{
Context *clCxt = Context::getContext();
string kernelName = "normalize_hists_kernel";
vector< pair<size_t, const void *> > args;
string kernelName;
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
int nthreads = power_2up(block_hist_size);
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
/ block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
/ block_stride_y;
int nthreads;
size_t globalThreads[3] = { 1, 1, 1 };
size_t localThreads[3] = { 1, 1, 1 };
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
size_t globalThreads[3] = { img_block_width * nthreads, img_block_height, 1 };
size_t localThreads[3] = { nthreads, 1, 1 };
if ( nbins == 9 )
{
/* optimized for the case of 9 bins */
kernelName = "normalize_hists_36_kernel";
int blocks_in_group = NTHREADS / block_hist_size;
nthreads = blocks_in_group * block_hist_size;
int num_groups = divUp( img_block_width * img_block_height, blocks_in_group);
globalThreads[0] = nthreads * num_groups;
localThreads[0] = nthreads;
}
else
{
kernelName = "normalize_hists_kernel";
nthreads = power_2up(block_hist_size);
globalThreads[0] = img_block_width * nthreads;
globalThreads[1] = img_block_height;
localThreads[0] = nthreads;
if ((nthreads < 32) || (nthreads > 512) )
cv::ocl::error("normalize_hists: histogram's size is too small or too big", __FILE__, __LINE__, "normalize_hists");
cv::ocl::error("normalize_hists: histogram's size is too small or too big",
__FILE__, __LINE__, "normalize_hists");
args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads));
args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
}
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL));
openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
if(hog_device_cpu)
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1, "-D CPU");
else
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1);
}
void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x, int height,
int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
float threshold, cv::ocl::oclMat &labels)
void cv::ocl::device::hog::classify_hists(int win_height, int win_width,
int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x,
int height, int width,
const cv::ocl::oclMat &block_hists,
const cv::ocl::oclMat &coefs,
float free_coef, float threshold,
cv::ocl::oclMat &labels)
{
Context *clCxt = Context::getContext();
string kernelName = "classify_hists_kernel";
vector< pair<size_t, const void *> > args;
int nthreads;
string kernelName;
switch (cdescr_width)
{
case 180:
nthreads = 180;
kernelName = "classify_hists_180_kernel";
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
break;
case 252:
nthreads = 256;
kernelName = "classify_hists_252_kernel";
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
break;
default:
nthreads = 256;
kernelName = "classify_hists_kernel";
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
}
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
size_t localThreads[3] = { NTHREADS, 1, 1 };
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
size_t localThreads[3] = { nthreads, 1, 1 };
args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_win_width));
args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
args.push_back( make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
@ -1694,12 +1875,20 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data));
openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
if(hog_device_cpu)
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1, "-D CPU");
else
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1);
}
void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width,
const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x,
int height, int width,
const cv::ocl::oclMat &block_hists,
cv::ocl::oclMat &descriptors)
{
Context *clCxt = Context::getContext();
string kernelName = "extract_descrs_by_rows_kernel";
@ -1709,7 +1898,8 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
int descriptors_quadstep = descriptors.step >> 2;
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
@ -1725,12 +1915,16 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1);
}
void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width,
const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x,
int height, int width,
const cv::ocl::oclMat &block_hists,
cv::ocl::oclMat &descriptors)
{
Context *clCxt = Context::getContext();
string kernelName = "extract_descrs_by_cols_kernel";
@ -1740,7 +1934,8 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
int descriptors_quadstep = descriptors.step >> 2;
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
@ -1757,11 +1952,16 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1);
}
void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width,
const cv::ocl::oclMat &img,
float angle_scale,
cv::ocl::oclMat &grad,
cv::ocl::oclMat &qangle,
bool correct_gamma)
{
Context *clCxt = Context::getContext();
string kernelName = "compute_gradients_8UC1_kernel";
@ -1786,11 +1986,16 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1);
}
void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width,
const cv::ocl::oclMat &img,
float angle_scale,
cv::ocl::oclMat &grad,
cv::ocl::oclMat &qangle,
bool correct_gamma)
{
Context *clCxt = Context::getContext();
string kernelName = "compute_gradients_8UC4_kernel";
@ -1816,39 +2021,6 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
{
CV_Assert( (src.channels() == dst.channels()) );
Context *clCxt = Context::getContext();
string kernelName = (src.type() == CV_8UC1) ? "resize_8UC1_kernel" : "resize_8UC4_kernel";
size_t blkSizeX = 16, blkSizeY = 16;
size_t glbSizeX = sz.width % blkSizeX == 0 ? sz.width : (sz.width / blkSizeX + 1) * blkSizeX;
size_t glbSizeY = sz.height % blkSizeY == 0 ? sz.height : (sz.height / blkSizeY + 1) * blkSizeY;
size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
float ifx = (float)src.cols / sz.width;
float ify = (float)src.rows / sz.height;
int src_step = static_cast<int>(src.step);
int dst_step = static_cast<int>(dst.step);
vector< pair<size_t, const void *> > args;
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width));
args.push_back( make_pair(sizeof(cl_int), (void *)&sz.height));
args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1);
}

View File

@ -43,7 +43,6 @@
//
//M*/
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
@ -51,6 +50,100 @@
#define NTHREADS 256
#define CV_PI_F 3.1415926535897932384626433832795f
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
__kernel void compute_hists_lut_kernel(
const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int blocks_in_group, const int blocks_total,
const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const uchar* qangle,
__global const float* gauss_w_lut,
__global float* block_hists, __local float* smem)
{
const int lx = get_local_id(0);
const int lp = lx / 24; /* local group id */
const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
const int gidY = gid / img_block_width;
const int gidX = gid - gidY * img_block_width;
const int lidX = lx - lp * 24;
const int lidY = get_local_id(1);
const int cell_x = lidX / 12;
const int cell_y = lidY;
const int cell_thread_x = lidX - cell_x * 12;
__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
__local float* final_hist = hists + cnbins *
(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
__global const float* grad_ptr = (gid < blocks_total) ?
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
__global const uchar* qangle_ptr = (gid < blocks_total) ?
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
cell_thread_x;
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
hist[bin_id * 48] = 0.f;
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
const int dist_y_begin = -4 - 4 * lidY;
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
grad_ptr += grad_quadstep;
qangle_ptr += qangle_step;
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
float gaussian = gauss_w_lut[idx];
idx = (dist_y + 8) * 16 + (dist_x + 8);
float interp_weight = gauss_w_lut[256+idx];
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* hist_ = hist;
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
{
if (cell_thread_x < 6)
hist_[0] += hist_[6];
barrier(CLK_LOCAL_MEM_FENCE);
if (cell_thread_x < 3)
hist_[0] += hist_[3];
#ifdef CPU
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (cell_thread_x == 0)
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
hist_[0] + hist_[1] + hist_[2];
}
barrier(CLK_LOCAL_MEM_FENCE);
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
if ((tid < cblock_hist_size) && (gid < blocks_total))
{
__global float* block_hist = block_hists +
(gidY * img_block_width + gidX) * cblock_hist_size;
block_hist[tid] = final_hist[tid];
}
}
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
@ -125,16 +218,14 @@ __kernel void compute_hists_kernel(
barrier(CLK_LOCAL_MEM_FENCE);
if (cell_thread_x < 3)
hist_[0] += hist_[3];
#ifdef WAVE_SIZE_1
#ifdef CPU
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (cell_thread_x == 0)
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
hist_[0] + hist_[1] + hist_[2];
}
#ifdef WAVE_SIZE_1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
if ((tid < cblock_hist_size) && (gid < blocks_total))
@ -145,6 +236,57 @@ __kernel void compute_hists_kernel(
}
}
//-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm
// optimized for the case of 9 bins
__kernel void normalize_hists_36_kernel(__global float* block_hists,
const float threshold, __local float *squares)
{
const int tid = get_local_id(0);
const int gid = get_global_id(0);
const int bid = tid / 36; /* block-hist id, (0 - 6) */
const int boffset = bid * 36; /* block-hist offset in the work-group */
const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
float elem = block_hists[gid];
squares[tid] = elem * elem;
barrier(CLK_LOCAL_MEM_FENCE);
__local float* smem = squares + boffset;
float sum = smem[hid];
if (hid < 18)
smem[hid] = sum = sum + smem[hid + 18];
barrier(CLK_LOCAL_MEM_FENCE);
if (hid < 9)
smem[hid] = sum = sum + smem[hid + 9];
barrier(CLK_LOCAL_MEM_FENCE);
if (hid < 4)
smem[hid] = sum + smem[hid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
elem = elem / (sqrt(sum) + 3.6f);
elem = min(elem, threshold);
barrier(CLK_LOCAL_MEM_FENCE);
squares[tid] = elem * elem;
barrier(CLK_LOCAL_MEM_FENCE);
sum = smem[hid];
if (hid < 18)
smem[hid] = sum = sum + smem[hid + 18];
barrier(CLK_LOCAL_MEM_FENCE);
if (hid < 9)
smem[hid] = sum = sum + smem[hid + 9];
barrier(CLK_LOCAL_MEM_FENCE);
if (hid < 4)
smem[hid] = sum + smem[hid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
}
//-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm
//
@ -153,76 +295,50 @@ float reduce_smem(volatile __local float* smem, int size)
unsigned int tid = get_local_id(0);
float sum = smem[tid];
if (size >= 512)
{
if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (size >= 256)
{
if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (size >= 128)
{
if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE); }
#ifdef CPU
if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE); }
if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
barrier(CLK_LOCAL_MEM_FENCE); }
#else
if (tid < 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
#endif
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
sum = smem[0];
#endif
return sum;
}
__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
__kernel void normalize_hists_kernel(
const int nthreads, const int block_hist_size, const int img_block_width,
__global float* block_hists, const float threshold, __local float *squares)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
__global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
__global float* hist = block_hists + (gidY * img_block_width + gidX) *
block_hist_size + tid;
float elem = 0.f;
if (tid < block_hist_size)
@ -249,8 +365,10 @@ __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_si
//---------------------------------------------------------------------
// Linear SVM based classification
//
__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
// 48x96 window, 9 bins and default parameters
// 180 threads, each thread corresponds to a bin in a row
__kernel void classify_hists_180_kernel(
const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float * block_hists, __global const float* coefs,
@ -260,14 +378,85 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
float product = 0.f;
for (int i = tid; i < cdescr_size; i += NTHREADS)
for (int i = 0; i < cdescr_height; i++)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
product += coefs[i * cdescr_width + tid] *
hist[i * img_block_width * cblock_hist_size + tid];
}
__local float products[180];
products[tid] = product;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 90) products[tid] = product = product + products[tid + 90];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 45) products[tid] = product = product + products[tid + 45];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if (tid < 13) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 13)
{
smem[tid] = product = product + smem[tid + 32];
}
if (tid < 16)
{
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
}
#endif
if (tid == 0){
product = product + smem[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
//---------------------------------------------------------------------
// Linear SVM based classification
// 64x128 window, 9 bins and default parameters
// 256 threads, 252 of them are used
__kernel void classify_hists_252_kernel(
const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float * block_hists, __global const float* coefs,
float free_coef, float threshold, __global uchar* labels)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
float product = 0.f;
if (tid < cdescr_width)
{
for (int i = 0; i < cdescr_height; i++)
product += coefs[i * cdescr_width + tid] *
hist[i * img_block_width * cblock_hist_size + tid];
}
__local float products[NTHREADS];
@ -283,55 +472,106 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if(tid<32) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 32)
{
smem[tid] = product = product + smem[tid + 32];
#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
#endif
smem[tid] = product = product + smem[tid + 16];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
#endif
smem[tid] = product = product + smem[tid + 8];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
#endif
smem[tid] = product = product + smem[tid + 4];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
#endif
smem[tid] = product = product + smem[tid + 2];
#ifdef WAVE_SIZE_1
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
#endif
smem[tid] = product = product + smem[tid + 1];
if (tid == 0){
product = product + smem[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
if (tid == 0)
//---------------------------------------------------------------------
// Linear SVM based classification
// 256 threads
__kernel void classify_hists_kernel(
const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float * block_hists, __global const float* coefs,
float free_coef, float threshold, __global uchar* labels)
{
const int tid = get_local_id(0);
const int gidX = get_group_id(0);
const int gidY = get_group_id(1);
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
float product = 0.f;
for (int i = tid; i < cdescr_size; i += NTHREADS)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] *
hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
__local float products[NTHREADS];
products[tid] = product;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 128) products[tid] = product = product + products[tid + 128];
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 64) products[tid] = product = product + products[tid + 64];
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* smem = products;
#ifdef CPU
if(tid<32) smem[tid] = product = product + smem[tid + 32];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<16) smem[tid] = product = product + smem[tid + 16];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<8) smem[tid] = product = product + smem[tid + 8];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<4) smem[tid] = product = product + smem[tid + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if(tid<2) smem[tid] = product = product + smem[tid + 2];
barrier(CLK_LOCAL_MEM_FENCE);
#else
if (tid < 32)
{
smem[tid] = product = product + smem[tid + 32];
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];
smem[tid] = product = product + smem[tid + 2];
}
#endif
if (tid == 0){
smem[tid] = product = product + smem[tid + 1];
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
}
}
//----------------------------------------------------------------------------
// Extract descriptors
__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
__kernel void extract_descrs_by_rows_kernel(
const int cblock_hist_size, const int descriptors_quadstep,
const int cdescr_size, const int cdescr_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float* block_hists, __global float* descriptors)
{
int tid = get_local_id(0);
@ -339,10 +579,12 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
int gidY = get_group_id(1);
// Get left top corner of the window in src
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
__global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
__global float* descriptor = descriptors +
(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
// Copy elements from src to dst
for (int i = tid; i < cdescr_size; i += NTHREADS)
@ -353,19 +595,23 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
}
}
__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
__kernel void extract_descrs_by_cols_kernel(
const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
__global const float* block_hists, __global float* descriptors)
{
int tid = get_local_id(0);
int gidX = get_group_id(0);
int gidY = get_group_id(1);
// Get left top corner of the window in src
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
__global const float* hist = block_hists + (gidY * win_block_stride_y *
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
__global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
__global float* descriptor = descriptors +
(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
// Copy elements from src to dst
for (int i = tid; i < cdescr_size; i += NTHREADS)
@ -376,14 +622,17 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
int y = block_idx / cnblocks_win_x;
int x = block_idx - y * cnblocks_win_x;
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =
hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
}
}
//----------------------------------------------------------------------------
// Gradients computation
__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
__kernel void compute_gradients_8UC4_kernel(
const int height, const int width,
const int img_step, const int grad_quadstep, const int qangle_step,
const __global uchar4 * img, __global float * grad, __global uchar * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
@ -426,8 +675,10 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
barrier(CLK_LOCAL_MEM_FENCE);
if (x < width)
{
float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
sh_row[tid + 2 * (NTHREADS + 2)]);
float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
float3 dx;
if (correct_gamma == 1)
@ -482,7 +733,9 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
}
}
__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
__kernel void compute_gradients_8UC1_kernel(
const int height, const int width,
const int img_step, const int grad_quadstep, const int qangle_step,
__global const uchar * img, __global float * grad, __global uchar * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
@ -540,42 +793,3 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;
}
}
//----------------------------------------------------------------------------
// Resize
__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
int dst_offset, int src_offset, int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int sx = (int)floor(dx*ifx+0.5f);
int sy = (int)floor(dy*ify+0.5f);
sx = min(sx, src_cols-1);
sy = min(sy, src_rows-1);
int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
if(dx<dst_cols && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
int dst_offset, int src_offset, int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int sx = (int)floor(dx*ifx+0.5f);
int sy = (int)floor(dy*ify+0.5f);
sx = min(sx, src_cols-1);
sy = min(sy, src_rows-1);
int dpos = dst_offset + dy * dst_step + dx;
int spos = src_offset + sy * src_step + sx;
if(dx<dst_cols && dy<dst_rows)
dst[dpos] = src[spos];
}