optimize hog
This commit is contained in:
parent
843094a07f
commit
26c246140a
@ -48,13 +48,107 @@ using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
using namespace std;
|
||||
|
||||
|
||||
#define CELL_WIDTH 8
|
||||
#define CELL_HEIGHT 8
|
||||
#define CELLS_PER_BLOCK_X 2
|
||||
#define CELLS_PER_BLOCK_Y 2
|
||||
#define NTHREADS 256
|
||||
|
||||
static oclMat gauss_w_lut;
|
||||
static bool hog_device_cpu;
|
||||
/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
|
||||
static const float gaussian_interp_lut[] =
|
||||
{
|
||||
/* gaussian lut */
|
||||
0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f,
|
||||
0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f,
|
||||
0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f,
|
||||
0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f,
|
||||
0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f,
|
||||
0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
|
||||
0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
|
||||
0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
|
||||
0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f,
|
||||
0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f,
|
||||
0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f,
|
||||
0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f,
|
||||
0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f,
|
||||
0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
|
||||
0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
|
||||
0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
|
||||
0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f,
|
||||
0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f,
|
||||
0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f,
|
||||
0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f,
|
||||
0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f,
|
||||
0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f,
|
||||
0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f,
|
||||
0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f,
|
||||
0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f,
|
||||
0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f,
|
||||
0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f,
|
||||
0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f,
|
||||
0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f,
|
||||
0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
|
||||
0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
|
||||
0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
|
||||
0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f,
|
||||
0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f,
|
||||
0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f,
|
||||
0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f,
|
||||
0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f,
|
||||
0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
|
||||
0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
|
||||
0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
|
||||
0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f,
|
||||
0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f,
|
||||
0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f,
|
||||
/* interp_weight lut */
|
||||
0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
|
||||
0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
|
||||
0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f,
|
||||
0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f,
|
||||
0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f,
|
||||
0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f,
|
||||
0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f,
|
||||
0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f,
|
||||
0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
|
||||
0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
|
||||
0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f,
|
||||
0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f,
|
||||
0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f,
|
||||
0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f,
|
||||
0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f,
|
||||
0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f,
|
||||
0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
|
||||
0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
|
||||
0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f,
|
||||
0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f,
|
||||
0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f,
|
||||
0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f,
|
||||
0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f,
|
||||
0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f,
|
||||
0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
|
||||
0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
|
||||
0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f,
|
||||
0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f,
|
||||
0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f,
|
||||
0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f,
|
||||
0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f,
|
||||
0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f,
|
||||
0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
|
||||
0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
|
||||
0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f,
|
||||
0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f,
|
||||
0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f,
|
||||
0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f,
|
||||
0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f,
|
||||
0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f,
|
||||
0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
|
||||
0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
|
||||
0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
|
||||
};
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
@ -78,38 +172,43 @@ namespace cv
|
||||
int cnblocks_win_x;
|
||||
int cnblocks_win_y;
|
||||
int cblock_hist_size;
|
||||
int cblock_hist_size_2up;
|
||||
int cdescr_size;
|
||||
int cdescr_width;
|
||||
int cdescr_height;
|
||||
|
||||
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
|
||||
int nblocks_win_x, int nblocks_win_y);
|
||||
|
||||
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
|
||||
int height, int width, const cv::ocl::oclMat &grad,
|
||||
const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists);
|
||||
int height, int width, float sigma, const cv::ocl::oclMat &grad,
|
||||
const cv::ocl::oclMat &qangle,
|
||||
const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
|
||||
|
||||
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
|
||||
int height, int width, cv::ocl::oclMat &block_hists, float threshold);
|
||||
int height, int width, cv::ocl::oclMat &block_hists,
|
||||
float threshold);
|
||||
|
||||
void classify_hists(int win_height, int win_width, int block_stride_y,
|
||||
int block_stride_x, int win_stride_y, int win_stride_x, int height,
|
||||
int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
|
||||
int block_stride_x, int win_stride_y, int win_stride_x,
|
||||
int height, int width, const cv::ocl::oclMat &block_hists,
|
||||
const cv::ocl::oclMat &coefs, float free_coef,
|
||||
float threshold, cv::ocl::oclMat &labels);
|
||||
|
||||
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
|
||||
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y,
|
||||
int block_stride_x, int win_stride_y, int win_stride_x,
|
||||
int height, int width, const cv::ocl::oclMat &block_hists,
|
||||
cv::ocl::oclMat &descriptors);
|
||||
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
|
||||
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y,
|
||||
int block_stride_x, int win_stride_y, int win_stride_x,
|
||||
int height, int width, const cv::ocl::oclMat &block_hists,
|
||||
cv::ocl::oclMat &descriptors);
|
||||
|
||||
void compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
|
||||
float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
|
||||
float angle_scale, cv::ocl::oclMat &grad,
|
||||
cv::ocl::oclMat &qangle, bool correct_gamma);
|
||||
void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
|
||||
float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
|
||||
|
||||
void resize( const oclMat &src, oclMat &dst, const Size sz);
|
||||
float angle_scale, cv::ocl::oclMat &grad,
|
||||
cv::ocl::oclMat &qangle, bool correct_gamma);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -117,8 +216,14 @@ namespace cv
|
||||
|
||||
using namespace ::cv::ocl::device;
|
||||
|
||||
cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
|
||||
int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
|
||||
static inline int divUp(int total, int grain)
|
||||
{
|
||||
return (total + grain - 1) / grain;
|
||||
}
|
||||
|
||||
cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_,
|
||||
Size cell_size_, int nbins_, double win_sigma_,
|
||||
double threshold_L2hys_, bool gamma_correction_, int nlevels_)
|
||||
: win_size(win_size_),
|
||||
block_size(block_size_),
|
||||
block_stride(block_stride_),
|
||||
@ -132,19 +237,27 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
|
||||
CV_Assert((win_size.width - block_size.width ) % block_stride.width == 0 &&
|
||||
(win_size.height - block_size.height) % block_stride.height == 0);
|
||||
|
||||
CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
|
||||
CV_Assert(block_size.width % cell_size.width == 0 &&
|
||||
block_size.height % cell_size.height == 0);
|
||||
|
||||
CV_Assert(block_stride == cell_size);
|
||||
|
||||
CV_Assert(cell_size == Size(8, 8));
|
||||
|
||||
Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
|
||||
Size cells_per_block(block_size.width / cell_size.width,
|
||||
block_size.height / cell_size.height);
|
||||
CV_Assert(cells_per_block == Size(2, 2));
|
||||
|
||||
cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
|
||||
hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
|
||||
hog::set_up_constants(nbins, block_stride.width, block_stride.height,
|
||||
blocks_per_win.width, blocks_per_win.height);
|
||||
|
||||
effect_size = Size(0, 0);
|
||||
|
||||
if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
|
||||
hog_device_cpu = true;
|
||||
else
|
||||
hog_device_cpu = false;
|
||||
}
|
||||
|
||||
size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
|
||||
@ -154,7 +267,8 @@ size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
|
||||
|
||||
size_t cv::ocl::HOGDescriptor::getBlockHistogramSize() const
|
||||
{
|
||||
Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
|
||||
Size cells_per_block = Size(block_size.width / cell_size.width,
|
||||
block_size.height / cell_size.height);
|
||||
return (size_t)(nbins * cells_per_block.area());
|
||||
}
|
||||
|
||||
@ -167,7 +281,8 @@ bool cv::ocl::HOGDescriptor::checkDetectorSize() const
|
||||
{
|
||||
size_t detector_size = detector.rows * detector.cols;
|
||||
size_t descriptor_size = getDescriptorSize();
|
||||
return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
|
||||
return detector_size == 0 || detector_size == descriptor_size ||
|
||||
detector_size == descriptor_size + 1;
|
||||
}
|
||||
|
||||
void cv::ocl::HOGDescriptor::setSVMDetector(const vector<float> &_detector)
|
||||
@ -207,10 +322,16 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
|
||||
|
||||
const size_t block_hist_size = getBlockHistogramSize();
|
||||
const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
|
||||
block_hists.create(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F);
|
||||
block_hists.create(1,
|
||||
static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
|
||||
|
||||
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
|
||||
labels.create(1, wins_per_img.area(), CV_8U);
|
||||
|
||||
vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut +
|
||||
sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
|
||||
Mat m_lut(v_lut);
|
||||
gauss_w_lut.upload(m_lut.reshape(1,1));
|
||||
}
|
||||
|
||||
void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
|
||||
@ -221,29 +342,34 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc
|
||||
switch (img.type())
|
||||
{
|
||||
case CV_8UC1:
|
||||
hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
|
||||
hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img,
|
||||
angleScale, grad, qangle, gamma_correction);
|
||||
break;
|
||||
case CV_8UC4:
|
||||
hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
|
||||
hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img,
|
||||
angleScale, grad, qangle, gamma_correction);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
|
||||
{
|
||||
computeGradient(img, grad, qangle);
|
||||
computeGradient(img, this->grad, this->qangle);
|
||||
|
||||
hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
|
||||
grad, qangle, (float)getWinSigma(), block_hists);
|
||||
hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
|
||||
effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
|
||||
|
||||
hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
|
||||
block_hists, (float)threshold_L2hys);
|
||||
hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
|
||||
effect_size.width, block_hists, (float)threshold_L2hys);
|
||||
}
|
||||
|
||||
|
||||
void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, oclMat &descriptors, int descr_format)
|
||||
void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
|
||||
oclMat &descriptors, int descr_format)
|
||||
{
|
||||
CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
|
||||
CV_Assert(win_stride.width % block_stride.width == 0 &&
|
||||
win_stride.height % block_stride.height == 0);
|
||||
|
||||
init_buffer(img, win_stride);
|
||||
|
||||
@ -253,17 +379,20 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
|
||||
Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
|
||||
Size wins_per_img = numPartsWithin(effect_size, win_size, win_stride);
|
||||
|
||||
descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
|
||||
descriptors.create(wins_per_img.area(),
|
||||
static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
|
||||
|
||||
switch (descr_format)
|
||||
{
|
||||
case DESCR_FORMAT_ROW_BY_ROW:
|
||||
hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
|
||||
win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
|
||||
hog::extract_descrs_by_rows(win_size.height, win_size.width,
|
||||
block_stride.height, block_stride.width, win_stride.height, win_stride.width,
|
||||
effect_size.height, effect_size.width, block_hists, descriptors);
|
||||
break;
|
||||
case DESCR_FORMAT_COL_BY_COL:
|
||||
hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
|
||||
win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
|
||||
hog::extract_descrs_by_cols(win_size.height, win_size.width,
|
||||
block_stride.height, block_stride.width, win_stride.height, win_stride.width,
|
||||
effect_size.height, effect_size.width, block_hists, descriptors);
|
||||
break;
|
||||
default:
|
||||
CV_Error(CV_StsBadArg, "Unknown descriptor format");
|
||||
@ -271,7 +400,8 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
|
||||
}
|
||||
|
||||
|
||||
void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, double hit_threshold, Size win_stride, Size padding)
|
||||
void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits,
|
||||
double hit_threshold, Size win_stride, Size padding)
|
||||
{
|
||||
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
|
||||
CV_Assert(padding == Size(0, 0));
|
||||
@ -283,14 +413,16 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
|
||||
if (win_stride == Size())
|
||||
win_stride = block_stride;
|
||||
else
|
||||
CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
|
||||
CV_Assert(win_stride.width % block_stride.width == 0 &&
|
||||
win_stride.height % block_stride.height == 0);
|
||||
init_buffer(img, win_stride);
|
||||
|
||||
computeBlockHistograms(img);
|
||||
|
||||
hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
|
||||
win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists,
|
||||
detector, (float)free_coef, (float)hit_threshold, labels);
|
||||
hog::classify_hists(win_size.height, win_size.width, block_stride.height,
|
||||
block_stride.width, win_stride.height, win_stride.width,
|
||||
effect_size.height, effect_size.width, block_hists, detector,
|
||||
(float)free_coef, (float)hit_threshold, labels);
|
||||
|
||||
labels.download(labels_host);
|
||||
unsigned char *vec = labels_host.ptr();
|
||||
@ -306,8 +438,9 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector<Point> &hits, doub
|
||||
|
||||
|
||||
|
||||
void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations, double hit_threshold,
|
||||
Size win_stride, Size padding, double scale0, int group_threshold)
|
||||
void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &found_locations,
|
||||
double hit_threshold, Size win_stride, Size padding,
|
||||
double scale0, int group_threshold)
|
||||
{
|
||||
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
|
||||
CV_Assert(scale0 > 1);
|
||||
@ -333,7 +466,8 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
|
||||
if (win_stride == Size())
|
||||
win_stride = block_stride;
|
||||
else
|
||||
CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
|
||||
CV_Assert(win_stride.width % block_stride.width == 0 &&
|
||||
win_stride.height % block_stride.height == 0);
|
||||
init_buffer(img, win_stride);
|
||||
image_scale.create(img.size(), img.type());
|
||||
|
||||
@ -347,16 +481,18 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector<Rect> &f
|
||||
}
|
||||
else
|
||||
{
|
||||
hog::resize( img, image_scale, effect_size);
|
||||
resize(img, image_scale, effect_size);
|
||||
detect(image_scale, locations, hit_threshold, win_stride, padding);
|
||||
}
|
||||
Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
|
||||
Size scaled_win_size(cvRound(win_size.width * scale),
|
||||
cvRound(win_size.height * scale));
|
||||
for (size_t j = 0; j < locations.size(); j++)
|
||||
all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, scaled_win_size));
|
||||
all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale,
|
||||
scaled_win_size));
|
||||
}
|
||||
|
||||
found_locations.assign(all_candidates.begin(), all_candidates.end());
|
||||
groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
|
||||
groupRectangles(found_locations, group_threshold, 0.2);
|
||||
}
|
||||
|
||||
int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
|
||||
@ -364,9 +500,11 @@ int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
|
||||
return (size - part_size + stride) / stride;
|
||||
}
|
||||
|
||||
cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
|
||||
cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size,
|
||||
cv::Size stride)
|
||||
{
|
||||
return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
|
||||
return Size(numPartsWithin(size.width, part_size.width, stride.width),
|
||||
numPartsWithin(size.height, part_size.height, stride.height));
|
||||
}
|
||||
|
||||
std::vector<float> cv::ocl::HOGDescriptor::getDefaultPeopleDetector()
|
||||
@ -1547,7 +1685,8 @@ static int power_2up(unsigned int n)
|
||||
return -1; // Input is too big
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int block_stride_y,
|
||||
void cv::ocl::device::hog::set_up_constants(int nbins,
|
||||
int block_stride_x, int block_stride_y,
|
||||
int nblocks_win_x, int nblocks_win_y)
|
||||
{
|
||||
cnbins = nbins;
|
||||
@ -1559,53 +1698,32 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b
|
||||
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
|
||||
cblock_hist_size = block_hist_size;
|
||||
|
||||
int block_hist_size_2up = power_2up(block_hist_size);
|
||||
cblock_hist_size_2up = block_hist_size_2up;
|
||||
|
||||
int descr_width = nblocks_win_x * block_hist_size;
|
||||
cdescr_width = descr_width;
|
||||
cdescr_height = nblocks_win_y;
|
||||
|
||||
int descr_size = descr_width * nblocks_win_y;
|
||||
cdescr_size = descr_size;
|
||||
}
|
||||
|
||||
static inline int divUp(int total, int grain)
|
||||
{
|
||||
return (total + grain - 1) / grain;
|
||||
}
|
||||
|
||||
static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string kernelName,
|
||||
size_t globalThreads[3], size_t localThreads[3],
|
||||
vector< pair<size_t, const void *> > &args)
|
||||
{
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, source, kernelName);
|
||||
size_t wave_size = queryDeviceInfo<WAVEFRONT_SIZE, size_t>(kernel);
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
if (wave_size <= 16)
|
||||
{
|
||||
char build_options[64];
|
||||
sprintf(build_options, (wave_size == 16) ? "-D WAVE_SIZE_16" : "-D WAVE_SIZE_1");
|
||||
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
|
||||
}
|
||||
else
|
||||
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y,
|
||||
int height, int width, const cv::ocl::oclMat &grad,
|
||||
const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists)
|
||||
void cv::ocl::device::hog::compute_hists(int nbins,
|
||||
int block_stride_x, int block_stride_y,
|
||||
int height, int width, float sigma,
|
||||
const cv::ocl::oclMat &grad,
|
||||
const cv::ocl::oclMat &qangle,
|
||||
const cv::ocl::oclMat &gauss_w_lut,
|
||||
cv::ocl::oclMat &block_hists)
|
||||
{
|
||||
Context *clCxt = Context::getContext();
|
||||
string kernelName = "compute_hists_kernel";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" :
|
||||
"compute_hists_kernel";
|
||||
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
|
||||
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
|
||||
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
|
||||
/ block_stride_x;
|
||||
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
|
||||
/ block_stride_y;
|
||||
int blocks_total = img_block_width * img_block_height;
|
||||
int blocks_in_group = 4;
|
||||
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
|
||||
size_t globalThreads[3] = { divUp(blocks_total, blocks_in_group) * localThreads[0], 2, 1 };
|
||||
|
||||
int grad_quadstep = grad.step >> 2;
|
||||
int qangle_step = qangle.step;
|
||||
@ -1613,6 +1731,11 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
|
||||
// Precompute gaussian spatial window parameter
|
||||
float scale = 1.f / (2.f * sigma * sigma);
|
||||
|
||||
int blocks_in_group = 4;
|
||||
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
|
||||
size_t globalThreads[3] = {
|
||||
divUp(img_block_width * img_block_height, blocks_in_group) * localThreads[0], 2, 1 };
|
||||
|
||||
int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
|
||||
int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
|
||||
int smem = (hists_size + final_hists_size) * blocks_in_group;
|
||||
@ -1628,62 +1751,120 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
|
||||
if (kernelName.compare("compute_hists_lut_kernel") == 0)
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
|
||||
else
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
|
||||
args.push_back( make_pair( smem, (void *)NULL));
|
||||
|
||||
openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y,
|
||||
int height, int width, cv::ocl::oclMat &block_hists, float threshold)
|
||||
void cv::ocl::device::hog::normalize_hists(int nbins,
|
||||
int block_stride_x, int block_stride_y,
|
||||
int height, int width,
|
||||
cv::ocl::oclMat &block_hists,
|
||||
float threshold)
|
||||
{
|
||||
Context *clCxt = Context::getContext();
|
||||
string kernelName = "normalize_hists_kernel";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
string kernelName;
|
||||
|
||||
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
|
||||
int nthreads = power_2up(block_hist_size);
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
|
||||
/ block_stride_x;
|
||||
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
|
||||
/ block_stride_y;
|
||||
int nthreads;
|
||||
size_t globalThreads[3] = { 1, 1, 1 };
|
||||
size_t localThreads[3] = { 1, 1, 1 };
|
||||
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
|
||||
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
|
||||
size_t globalThreads[3] = { img_block_width * nthreads, img_block_height, 1 };
|
||||
size_t localThreads[3] = { nthreads, 1, 1 };
|
||||
if ( nbins == 9 )
|
||||
{
|
||||
/* optimized for the case of 9 bins */
|
||||
kernelName = "normalize_hists_36_kernel";
|
||||
int blocks_in_group = NTHREADS / block_hist_size;
|
||||
nthreads = blocks_in_group * block_hist_size;
|
||||
int num_groups = divUp( img_block_width * img_block_height, blocks_in_group);
|
||||
globalThreads[0] = nthreads * num_groups;
|
||||
localThreads[0] = nthreads;
|
||||
}
|
||||
else
|
||||
{
|
||||
kernelName = "normalize_hists_kernel";
|
||||
nthreads = power_2up(block_hist_size);
|
||||
globalThreads[0] = img_block_width * nthreads;
|
||||
globalThreads[1] = img_block_height;
|
||||
localThreads[0] = nthreads;
|
||||
|
||||
if ((nthreads < 32) || (nthreads > 512) )
|
||||
cv::ocl::error("normalize_hists: histogram's size is too small or too big", __FILE__, __LINE__, "normalize_hists");
|
||||
cv::ocl::error("normalize_hists: histogram's size is too small or too big",
|
||||
__FILE__, __LINE__, "normalize_hists");
|
||||
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
|
||||
}
|
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
|
||||
args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL));
|
||||
|
||||
openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
|
||||
if(hog_device_cpu)
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1, "-D CPU");
|
||||
else
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y,
|
||||
int block_stride_x, int win_stride_y, int win_stride_x, int height,
|
||||
int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
|
||||
float threshold, cv::ocl::oclMat &labels)
|
||||
void cv::ocl::device::hog::classify_hists(int win_height, int win_width,
|
||||
int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x,
|
||||
int height, int width,
|
||||
const cv::ocl::oclMat &block_hists,
|
||||
const cv::ocl::oclMat &coefs,
|
||||
float free_coef, float threshold,
|
||||
cv::ocl::oclMat &labels)
|
||||
{
|
||||
Context *clCxt = Context::getContext();
|
||||
string kernelName = "classify_hists_kernel";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
int nthreads;
|
||||
string kernelName;
|
||||
switch (cdescr_width)
|
||||
{
|
||||
case 180:
|
||||
nthreads = 180;
|
||||
kernelName = "classify_hists_180_kernel";
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
|
||||
break;
|
||||
case 252:
|
||||
nthreads = 256;
|
||||
kernelName = "classify_hists_252_kernel";
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height));
|
||||
break;
|
||||
default:
|
||||
nthreads = 256;
|
||||
kernelName = "classify_hists_kernel";
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
|
||||
}
|
||||
|
||||
int win_block_stride_x = win_stride_x / block_stride_x;
|
||||
int win_block_stride_y = win_stride_y / block_stride_y;
|
||||
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
|
||||
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
|
||||
|
||||
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
|
||||
size_t localThreads[3] = { NTHREADS, 1, 1 };
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
|
||||
block_stride_x;
|
||||
|
||||
size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
|
||||
size_t localThreads[3] = { nthreads, 1, 1 };
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_win_width));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
|
||||
@ -1694,12 +1875,20 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data));
|
||||
|
||||
openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args);
|
||||
if(hog_device_cpu)
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1, "-D CPU");
|
||||
else
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width,
|
||||
const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
|
||||
void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
|
||||
int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x,
|
||||
int height, int width,
|
||||
const cv::ocl::oclMat &block_hists,
|
||||
cv::ocl::oclMat &descriptors)
|
||||
{
|
||||
Context *clCxt = Context::getContext();
|
||||
string kernelName = "extract_descrs_by_rows_kernel";
|
||||
@ -1709,7 +1898,8 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
|
||||
int win_block_stride_y = win_stride_y / block_stride_y;
|
||||
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
|
||||
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
|
||||
block_stride_x;
|
||||
int descriptors_quadstep = descriptors.step >> 2;
|
||||
|
||||
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
|
||||
@ -1725,12 +1915,16 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
|
||||
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width,
|
||||
const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
|
||||
void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
|
||||
int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x,
|
||||
int height, int width,
|
||||
const cv::ocl::oclMat &block_hists,
|
||||
cv::ocl::oclMat &descriptors)
|
||||
{
|
||||
Context *clCxt = Context::getContext();
|
||||
string kernelName = "extract_descrs_by_cols_kernel";
|
||||
@ -1740,7 +1934,8 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
|
||||
int win_block_stride_y = win_stride_y / block_stride_y;
|
||||
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
|
||||
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
|
||||
block_stride_x;
|
||||
int descriptors_quadstep = descriptors.step >> 2;
|
||||
|
||||
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
|
||||
@ -1757,11 +1952,16 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
|
||||
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
|
||||
float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
|
||||
void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width,
|
||||
const cv::ocl::oclMat &img,
|
||||
float angle_scale,
|
||||
cv::ocl::oclMat &grad,
|
||||
cv::ocl::oclMat &qangle,
|
||||
bool correct_gamma)
|
||||
{
|
||||
Context *clCxt = Context::getContext();
|
||||
string kernelName = "compute_gradients_8UC1_kernel";
|
||||
@ -1786,11 +1986,16 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
|
||||
args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
|
||||
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
|
||||
float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
|
||||
void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width,
|
||||
const cv::ocl::oclMat &img,
|
||||
float angle_scale,
|
||||
cv::ocl::oclMat &grad,
|
||||
cv::ocl::oclMat &qangle,
|
||||
bool correct_gamma)
|
||||
{
|
||||
Context *clCxt = Context::getContext();
|
||||
string kernelName = "compute_gradients_8UC4_kernel";
|
||||
@ -1816,39 +2021,6 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
|
||||
args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
|
||||
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
|
||||
{
|
||||
CV_Assert( (src.channels() == dst.channels()) );
|
||||
Context *clCxt = Context::getContext();
|
||||
|
||||
string kernelName = (src.type() == CV_8UC1) ? "resize_8UC1_kernel" : "resize_8UC4_kernel";
|
||||
size_t blkSizeX = 16, blkSizeY = 16;
|
||||
size_t glbSizeX = sz.width % blkSizeX == 0 ? sz.width : (sz.width / blkSizeX + 1) * blkSizeX;
|
||||
size_t glbSizeY = sz.height % blkSizeY == 0 ? sz.height : (sz.height / blkSizeY + 1) * blkSizeY;
|
||||
size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
|
||||
size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
|
||||
|
||||
float ifx = (float)src.cols / sz.width;
|
||||
float ify = (float)src.rows / sz.height;
|
||||
int src_step = static_cast<int>(src.step);
|
||||
int dst_step = static_cast<int>(dst.step);
|
||||
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
|
||||
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&sz.height));
|
||||
args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
|
||||
args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
|
||||
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||
localThreads, args, -1, -1);
|
||||
}
|
@ -43,7 +43,6 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
|
||||
#define CELL_WIDTH 8
|
||||
#define CELL_HEIGHT 8
|
||||
#define CELLS_PER_BLOCK_X 2
|
||||
@ -51,6 +50,100 @@
|
||||
#define NTHREADS 256
|
||||
#define CV_PI_F 3.1415926535897932384626433832795f
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Histogram computation
|
||||
// 12 threads for a cell, 12x4 threads per block
|
||||
// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
|
||||
__kernel void compute_hists_lut_kernel(
|
||||
const int cblock_stride_x, const int cblock_stride_y,
|
||||
const int cnbins, const int cblock_hist_size, const int img_block_width,
|
||||
const int blocks_in_group, const int blocks_total,
|
||||
const int grad_quadstep, const int qangle_step,
|
||||
__global const float* grad, __global const uchar* qangle,
|
||||
__global const float* gauss_w_lut,
|
||||
__global float* block_hists, __local float* smem)
|
||||
{
|
||||
const int lx = get_local_id(0);
|
||||
const int lp = lx / 24; /* local group id */
|
||||
const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
|
||||
const int gidY = gid / img_block_width;
|
||||
const int gidX = gid - gidY * img_block_width;
|
||||
|
||||
const int lidX = lx - lp * 24;
|
||||
const int lidY = get_local_id(1);
|
||||
|
||||
const int cell_x = lidX / 12;
|
||||
const int cell_y = lidY;
|
||||
const int cell_thread_x = lidX - cell_x * 12;
|
||||
|
||||
__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
|
||||
CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
|
||||
__local float* final_hist = hists + cnbins *
|
||||
(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
|
||||
|
||||
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
|
||||
const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
|
||||
|
||||
__global const float* grad_ptr = (gid < blocks_total) ?
|
||||
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
|
||||
__global const uchar* qangle_ptr = (gid < blocks_total) ?
|
||||
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
|
||||
|
||||
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
|
||||
cell_thread_x;
|
||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
|
||||
hist[bin_id * 48] = 0.f;
|
||||
|
||||
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
|
||||
const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
|
||||
|
||||
const int dist_y_begin = -4 - 4 * lidY;
|
||||
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
|
||||
{
|
||||
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
|
||||
uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
|
||||
|
||||
grad_ptr += grad_quadstep;
|
||||
qangle_ptr += qangle_step;
|
||||
|
||||
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
|
||||
|
||||
int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
|
||||
float gaussian = gauss_w_lut[idx];
|
||||
idx = (dist_y + 8) * 16 + (dist_x + 8);
|
||||
float interp_weight = gauss_w_lut[256+idx];
|
||||
|
||||
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
|
||||
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __local float* hist_ = hist;
|
||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
|
||||
{
|
||||
if (cell_thread_x < 6)
|
||||
hist_[0] += hist_[6];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (cell_thread_x < 3)
|
||||
hist_[0] += hist_[3];
|
||||
#ifdef CPU
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
if (cell_thread_x == 0)
|
||||
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
|
||||
hist_[0] + hist_[1] + hist_[2];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
|
||||
if ((tid < cblock_hist_size) && (gid < blocks_total))
|
||||
{
|
||||
__global float* block_hist = block_hists +
|
||||
(gidY * img_block_width + gidX) * cblock_hist_size;
|
||||
block_hist[tid] = final_hist[tid];
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Histogram computation
|
||||
// 12 threads for a cell, 12x4 threads per block
|
||||
@ -125,16 +218,14 @@ __kernel void compute_hists_kernel(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (cell_thread_x < 3)
|
||||
hist_[0] += hist_[3];
|
||||
#ifdef WAVE_SIZE_1
|
||||
#ifdef CPU
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
if (cell_thread_x == 0)
|
||||
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
|
||||
hist_[0] + hist_[1] + hist_[2];
|
||||
}
|
||||
#ifdef WAVE_SIZE_1
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
|
||||
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
|
||||
if ((tid < cblock_hist_size) && (gid < blocks_total))
|
||||
@ -145,6 +236,57 @@ __kernel void compute_hists_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------
|
||||
// Normalization of histograms via L2Hys_norm
|
||||
// optimized for the case of 9 bins
|
||||
__kernel void normalize_hists_36_kernel(__global float* block_hists,
|
||||
const float threshold, __local float *squares)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gid = get_global_id(0);
|
||||
const int bid = tid / 36; /* block-hist id, (0 - 6) */
|
||||
const int boffset = bid * 36; /* block-hist offset in the work-group */
|
||||
const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
|
||||
|
||||
float elem = block_hists[gid];
|
||||
squares[tid] = elem * elem;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float* smem = squares + boffset;
|
||||
float sum = smem[hid];
|
||||
if (hid < 18)
|
||||
smem[hid] = sum = sum + smem[hid + 18];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (hid < 9)
|
||||
smem[hid] = sum = sum + smem[hid + 9];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (hid < 4)
|
||||
smem[hid] = sum + smem[hid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
|
||||
|
||||
elem = elem / (sqrt(sum) + 3.6f);
|
||||
elem = min(elem, threshold);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
squares[tid] = elem * elem;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
sum = smem[hid];
|
||||
if (hid < 18)
|
||||
smem[hid] = sum = sum + smem[hid + 18];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (hid < 9)
|
||||
smem[hid] = sum = sum + smem[hid + 9];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (hid < 4)
|
||||
smem[hid] = sum + smem[hid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
|
||||
|
||||
block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------
|
||||
// Normalization of histograms via L2Hys_norm
|
||||
//
|
||||
@ -153,76 +295,50 @@ float reduce_smem(volatile __local float* smem, int size)
|
||||
unsigned int tid = get_local_id(0);
|
||||
float sum = smem[tid];
|
||||
|
||||
if (size >= 512)
|
||||
{
|
||||
if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (size >= 256)
|
||||
{
|
||||
if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (size >= 128)
|
||||
{
|
||||
if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
#ifdef CPU
|
||||
if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
#else
|
||||
if (tid < 32)
|
||||
{
|
||||
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
|
||||
#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 16)
|
||||
{
|
||||
#endif
|
||||
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
|
||||
#ifdef WAVE_SIZE_1
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 8)
|
||||
{
|
||||
#endif
|
||||
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
|
||||
#ifdef WAVE_SIZE_1
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 4)
|
||||
{
|
||||
#endif
|
||||
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
|
||||
#ifdef WAVE_SIZE_1
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 2)
|
||||
{
|
||||
#endif
|
||||
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
|
||||
#ifdef WAVE_SIZE_1
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 1)
|
||||
{
|
||||
#endif
|
||||
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
sum = smem[0];
|
||||
#endif
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
|
||||
__kernel void normalize_hists_kernel(
|
||||
const int nthreads, const int block_hist_size, const int img_block_width,
|
||||
__global float* block_hists, const float threshold, __local float *squares)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
|
||||
__global float* hist = block_hists + (gidY * img_block_width + gidX) *
|
||||
block_hist_size + tid;
|
||||
|
||||
float elem = 0.f;
|
||||
if (tid < block_hist_size)
|
||||
@ -249,8 +365,10 @@ __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_si
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Linear SVM based classification
|
||||
//
|
||||
__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
|
||||
// 48x96 window, 9 bins and default parameters
|
||||
// 180 threads, each thread corresponds to a bin in a row
|
||||
__kernel void classify_hists_180_kernel(
|
||||
const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
|
||||
const int img_win_width, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float * block_hists, __global const float* coefs,
|
||||
@ -260,14 +378,85 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
float product = 0.f;
|
||||
for (int i = tid; i < cdescr_size; i += NTHREADS)
|
||||
|
||||
for (int i = 0; i < cdescr_height; i++)
|
||||
{
|
||||
int offset_y = i / cdescr_width;
|
||||
int offset_x = i - offset_y * cdescr_width;
|
||||
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
|
||||
product += coefs[i * cdescr_width + tid] *
|
||||
hist[i * img_block_width * cblock_hist_size + tid];
|
||||
}
|
||||
|
||||
__local float products[180];
|
||||
|
||||
products[tid] = product;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 90) products[tid] = product = product + products[tid + 90];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 45) products[tid] = product = product + products[tid + 45];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __local float* smem = products;
|
||||
#ifdef CPU
|
||||
if (tid < 13) smem[tid] = product = product + smem[tid + 32];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 16) smem[tid] = product = product + smem[tid + 16];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<8) smem[tid] = product = product + smem[tid + 8];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<4) smem[tid] = product = product + smem[tid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<2) smem[tid] = product = product + smem[tid + 2];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#else
|
||||
if (tid < 13)
|
||||
{
|
||||
smem[tid] = product = product + smem[tid + 32];
|
||||
}
|
||||
if (tid < 16)
|
||||
{
|
||||
smem[tid] = product = product + smem[tid + 16];
|
||||
smem[tid] = product = product + smem[tid + 8];
|
||||
smem[tid] = product = product + smem[tid + 4];
|
||||
smem[tid] = product = product + smem[tid + 2];
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tid == 0){
|
||||
product = product + smem[tid + 1];
|
||||
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Linear SVM based classification
|
||||
// 64x128 window, 9 bins and default parameters
|
||||
// 256 threads, 252 of them are used
|
||||
__kernel void classify_hists_252_kernel(
|
||||
const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
|
||||
const int img_win_width, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float * block_hists, __global const float* coefs,
|
||||
float free_coef, float threshold, __global uchar* labels)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
float product = 0.f;
|
||||
if (tid < cdescr_width)
|
||||
{
|
||||
for (int i = 0; i < cdescr_height; i++)
|
||||
product += coefs[i * cdescr_width + tid] *
|
||||
hist[i * img_block_width * cblock_hist_size + tid];
|
||||
}
|
||||
|
||||
__local float products[NTHREADS];
|
||||
@ -283,55 +472,106 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __local float* smem = products;
|
||||
#ifdef CPU
|
||||
if(tid<32) smem[tid] = product = product + smem[tid + 32];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<16) smem[tid] = product = product + smem[tid + 16];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<8) smem[tid] = product = product + smem[tid + 8];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<4) smem[tid] = product = product + smem[tid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<2) smem[tid] = product = product + smem[tid + 2];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#else
|
||||
if (tid < 32)
|
||||
{
|
||||
smem[tid] = product = product + smem[tid + 32];
|
||||
#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 16)
|
||||
{
|
||||
#endif
|
||||
smem[tid] = product = product + smem[tid + 16];
|
||||
#ifdef WAVE_SIZE_1
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 8)
|
||||
{
|
||||
#endif
|
||||
smem[tid] = product = product + smem[tid + 8];
|
||||
#ifdef WAVE_SIZE_1
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 4)
|
||||
{
|
||||
#endif
|
||||
smem[tid] = product = product + smem[tid + 4];
|
||||
#ifdef WAVE_SIZE_1
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 2)
|
||||
{
|
||||
#endif
|
||||
smem[tid] = product = product + smem[tid + 2];
|
||||
#ifdef WAVE_SIZE_1
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 1)
|
||||
{
|
||||
#endif
|
||||
smem[tid] = product = product + smem[tid + 1];
|
||||
if (tid == 0){
|
||||
product = product + smem[tid + 1];
|
||||
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
|
||||
}
|
||||
}
|
||||
|
||||
if (tid == 0)
|
||||
//---------------------------------------------------------------------
|
||||
// Linear SVM based classification
|
||||
// 256 threads
|
||||
__kernel void classify_hists_kernel(
|
||||
const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
|
||||
const int img_win_width, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float * block_hists, __global const float* coefs,
|
||||
float free_coef, float threshold, __global uchar* labels)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
float product = 0.f;
|
||||
for (int i = tid; i < cdescr_size; i += NTHREADS)
|
||||
{
|
||||
int offset_y = i / cdescr_width;
|
||||
int offset_x = i - offset_y * cdescr_width;
|
||||
product += coefs[i] *
|
||||
hist[offset_y * img_block_width * cblock_hist_size + offset_x];
|
||||
}
|
||||
|
||||
__local float products[NTHREADS];
|
||||
|
||||
products[tid] = product;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 128) products[tid] = product = product + products[tid + 128];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 64) products[tid] = product = product + products[tid + 64];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __local float* smem = products;
|
||||
#ifdef CPU
|
||||
if(tid<32) smem[tid] = product = product + smem[tid + 32];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<16) smem[tid] = product = product + smem[tid + 16];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<8) smem[tid] = product = product + smem[tid + 8];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<4) smem[tid] = product = product + smem[tid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<2) smem[tid] = product = product + smem[tid + 2];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#else
|
||||
if (tid < 32)
|
||||
{
|
||||
smem[tid] = product = product + smem[tid + 32];
|
||||
smem[tid] = product = product + smem[tid + 16];
|
||||
smem[tid] = product = product + smem[tid + 8];
|
||||
smem[tid] = product = product + smem[tid + 4];
|
||||
smem[tid] = product = product + smem[tid + 2];
|
||||
}
|
||||
#endif
|
||||
if (tid == 0){
|
||||
smem[tid] = product = product + smem[tid + 1];
|
||||
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Extract descriptors
|
||||
|
||||
__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
|
||||
const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
|
||||
__kernel void extract_descrs_by_rows_kernel(
|
||||
const int cblock_hist_size, const int descriptors_quadstep,
|
||||
const int cdescr_size, const int cdescr_width, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float* block_hists, __global float* descriptors)
|
||||
{
|
||||
int tid = get_local_id(0);
|
||||
@ -339,10 +579,12 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
|
||||
int gidY = get_group_id(1);
|
||||
|
||||
// Get left top corner of the window in src
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
// Get left top corner of the window in dst
|
||||
__global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
|
||||
__global float* descriptor = descriptors +
|
||||
(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
|
||||
|
||||
// Copy elements from src to dst
|
||||
for (int i = tid; i < cdescr_size; i += NTHREADS)
|
||||
@ -353,19 +595,23 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
|
||||
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
|
||||
const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
|
||||
__kernel void extract_descrs_by_cols_kernel(
|
||||
const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
|
||||
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float* block_hists, __global float* descriptors)
|
||||
{
|
||||
int tid = get_local_id(0);
|
||||
int gidX = get_group_id(0);
|
||||
int gidY = get_group_id(1);
|
||||
|
||||
// Get left top corner of the window in src
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
// Get left top corner of the window in dst
|
||||
__global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
|
||||
__global float* descriptor = descriptors +
|
||||
(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
|
||||
|
||||
// Copy elements from src to dst
|
||||
for (int i = tid; i < cdescr_size; i += NTHREADS)
|
||||
@ -376,14 +622,17 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
|
||||
int y = block_idx / cnblocks_win_x;
|
||||
int x = block_idx - y * cnblocks_win_x;
|
||||
|
||||
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
|
||||
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =
|
||||
hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Gradients computation
|
||||
|
||||
__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
|
||||
__kernel void compute_gradients_8UC4_kernel(
|
||||
const int height, const int width,
|
||||
const int img_step, const int grad_quadstep, const int qangle_step,
|
||||
const __global uchar4 * img, __global float * grad, __global uchar * qangle,
|
||||
const float angle_scale, const char correct_gamma, const int cnbins)
|
||||
{
|
||||
@ -426,8 +675,10 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (x < width)
|
||||
{
|
||||
float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
|
||||
float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
|
||||
float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
|
||||
sh_row[tid + 2 * (NTHREADS + 2)]);
|
||||
float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
|
||||
sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
|
||||
|
||||
float3 dx;
|
||||
if (correct_gamma == 1)
|
||||
@ -482,7 +733,9 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
|
||||
__kernel void compute_gradients_8UC1_kernel(
|
||||
const int height, const int width,
|
||||
const int img_step, const int grad_quadstep, const int qangle_step,
|
||||
__global const uchar * img, __global float * grad, __global uchar * qangle,
|
||||
const float angle_scale, const char correct_gamma, const int cnbins)
|
||||
{
|
||||
@ -540,42 +793,3 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
|
||||
grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Resize
|
||||
|
||||
__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
|
||||
int dst_offset, int src_offset, int dst_step, int src_step,
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
|
||||
int sx = (int)floor(dx*ifx+0.5f);
|
||||
int sy = (int)floor(dy*ify+0.5f);
|
||||
sx = min(sx, src_cols-1);
|
||||
sy = min(sy, src_rows-1);
|
||||
int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
|
||||
int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
|
||||
|
||||
if(dx<dst_cols && dy<dst_rows)
|
||||
dst[dpos] = src[spos];
|
||||
}
|
||||
|
||||
__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
|
||||
int dst_offset, int src_offset, int dst_step, int src_step,
|
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
|
||||
{
|
||||
int dx = get_global_id(0);
|
||||
int dy = get_global_id(1);
|
||||
|
||||
int sx = (int)floor(dx*ifx+0.5f);
|
||||
int sy = (int)floor(dy*ify+0.5f);
|
||||
sx = min(sx, src_cols-1);
|
||||
sy = min(sy, src_rows-1);
|
||||
int dpos = dst_offset + dy * dst_step + dx;
|
||||
int spos = src_offset + sy * src_step + sx;
|
||||
|
||||
if(dx<dst_cols && dy<dst_rows)
|
||||
dst[dpos] = src[spos];
|
||||
}
|
Loading…
Reference in New Issue
Block a user