Merge pull request #1123 from bitwangyaoyao:2.4_fix
This commit is contained in:
commit
bc78e87a61
@ -48,8 +48,8 @@
|
|||||||
///////////// PyrLKOpticalFlow ////////////////////////
|
///////////// PyrLKOpticalFlow ////////////////////////
|
||||||
PERFTEST(PyrLKOpticalFlow)
|
PERFTEST(PyrLKOpticalFlow)
|
||||||
{
|
{
|
||||||
std::string images1[] = {"rubberwhale1.png", "basketball1.png"};
|
std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
|
||||||
std::string images2[] = {"rubberwhale2.png", "basketball2.png"};
|
std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
|
for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
|
||||||
{
|
{
|
||||||
|
@ -56,98 +56,6 @@ using namespace std;
|
|||||||
|
|
||||||
static oclMat gauss_w_lut;
|
static oclMat gauss_w_lut;
|
||||||
static bool hog_device_cpu;
|
static bool hog_device_cpu;
|
||||||
/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
|
|
||||||
static const float gaussian_interp_lut[] =
|
|
||||||
{
|
|
||||||
/* gaussian lut */
|
|
||||||
0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f,
|
|
||||||
0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f,
|
|
||||||
0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f,
|
|
||||||
0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f,
|
|
||||||
0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f,
|
|
||||||
0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
|
|
||||||
0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
|
|
||||||
0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
|
|
||||||
0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f,
|
|
||||||
0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f,
|
|
||||||
0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f,
|
|
||||||
0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f,
|
|
||||||
0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f,
|
|
||||||
0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
|
|
||||||
0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
|
|
||||||
0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
|
|
||||||
0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f,
|
|
||||||
0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f,
|
|
||||||
0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f,
|
|
||||||
0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f,
|
|
||||||
0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f,
|
|
||||||
0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f,
|
|
||||||
0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f,
|
|
||||||
0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f,
|
|
||||||
0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f,
|
|
||||||
0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f,
|
|
||||||
0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f,
|
|
||||||
0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f,
|
|
||||||
0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f,
|
|
||||||
0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
|
|
||||||
0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
|
|
||||||
0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
|
|
||||||
0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f,
|
|
||||||
0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f,
|
|
||||||
0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f,
|
|
||||||
0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f,
|
|
||||||
0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f,
|
|
||||||
0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
|
|
||||||
0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
|
|
||||||
0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
|
|
||||||
0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f,
|
|
||||||
0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f,
|
|
||||||
0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f,
|
|
||||||
/* interp_weight lut */
|
|
||||||
0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
|
|
||||||
0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
|
|
||||||
0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f,
|
|
||||||
0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f,
|
|
||||||
0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f,
|
|
||||||
0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f,
|
|
||||||
0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f,
|
|
||||||
0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f,
|
|
||||||
0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
|
|
||||||
0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
|
|
||||||
0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f,
|
|
||||||
0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f,
|
|
||||||
0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f,
|
|
||||||
0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f,
|
|
||||||
0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f,
|
|
||||||
0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f,
|
|
||||||
0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
|
|
||||||
0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
|
|
||||||
0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f,
|
|
||||||
0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f,
|
|
||||||
0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f,
|
|
||||||
0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f,
|
|
||||||
0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f,
|
|
||||||
0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f,
|
|
||||||
0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
|
|
||||||
0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
|
|
||||||
0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f,
|
|
||||||
0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f,
|
|
||||||
0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f,
|
|
||||||
0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f,
|
|
||||||
0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f,
|
|
||||||
0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f,
|
|
||||||
0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
|
|
||||||
0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
|
|
||||||
0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f,
|
|
||||||
0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f,
|
|
||||||
0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f,
|
|
||||||
0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f,
|
|
||||||
0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f,
|
|
||||||
0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f,
|
|
||||||
0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
|
|
||||||
0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
|
|
||||||
0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
|
|
||||||
};
|
|
||||||
|
|
||||||
namespace cv
|
namespace cv
|
||||||
{
|
{
|
||||||
@ -180,7 +88,7 @@ namespace cv
|
|||||||
int nblocks_win_x, int nblocks_win_y);
|
int nblocks_win_x, int nblocks_win_y);
|
||||||
|
|
||||||
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
|
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
|
||||||
int height, int width, float sigma, const cv::ocl::oclMat &grad,
|
int height, int width, const cv::ocl::oclMat &grad,
|
||||||
const cv::ocl::oclMat &qangle,
|
const cv::ocl::oclMat &qangle,
|
||||||
const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
|
const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
|
||||||
|
|
||||||
@ -254,7 +162,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
|
|||||||
|
|
||||||
effect_size = Size(0, 0);
|
effect_size = Size(0, 0);
|
||||||
|
|
||||||
if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
|
if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
|
||||||
hog_device_cpu = true;
|
hog_device_cpu = true;
|
||||||
else
|
else
|
||||||
hog_device_cpu = false;
|
hog_device_cpu = false;
|
||||||
@ -328,10 +236,18 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
|
|||||||
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
|
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
|
||||||
labels.create(1, wins_per_img.area(), CV_8U);
|
labels.create(1, wins_per_img.area(), CV_8U);
|
||||||
|
|
||||||
vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut +
|
float sigma = getWinSigma();
|
||||||
sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
|
float scale = 1.f / (2.f * sigma * sigma);
|
||||||
Mat m_lut(v_lut);
|
Mat gaussian_lut(1, 512, CV_32FC1);
|
||||||
gauss_w_lut.upload(m_lut.reshape(1,1));
|
int idx = 0;
|
||||||
|
for(int i=-8; i<8; i++)
|
||||||
|
for(int j=-8; j<8; j++)
|
||||||
|
gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
|
||||||
|
for(int i=-8; i<8; i++)
|
||||||
|
for(int j=-8; j<8; j++)
|
||||||
|
gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
|
||||||
|
|
||||||
|
gauss_w_lut.upload(gaussian_lut);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
|
void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
|
||||||
@ -358,7 +274,7 @@ void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
|
|||||||
computeGradient(img, this->grad, this->qangle);
|
computeGradient(img, this->grad, this->qangle);
|
||||||
|
|
||||||
hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
|
hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
|
||||||
effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
|
effect_size.width, grad, qangle, gauss_w_lut, block_hists);
|
||||||
|
|
||||||
hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
|
hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
|
||||||
effect_size.width, block_hists, (float)threshold_L2hys);
|
effect_size.width, block_hists, (float)threshold_L2hys);
|
||||||
@ -1708,7 +1624,7 @@ void cv::ocl::device::hog::set_up_constants(int nbins,
|
|||||||
|
|
||||||
void cv::ocl::device::hog::compute_hists(int nbins,
|
void cv::ocl::device::hog::compute_hists(int nbins,
|
||||||
int block_stride_x, int block_stride_y,
|
int block_stride_x, int block_stride_y,
|
||||||
int height, int width, float sigma,
|
int height, int width,
|
||||||
const cv::ocl::oclMat &grad,
|
const cv::ocl::oclMat &grad,
|
||||||
const cv::ocl::oclMat &qangle,
|
const cv::ocl::oclMat &qangle,
|
||||||
const cv::ocl::oclMat &gauss_w_lut,
|
const cv::ocl::oclMat &gauss_w_lut,
|
||||||
@ -1716,8 +1632,7 @@ void cv::ocl::device::hog::compute_hists(int nbins,
|
|||||||
{
|
{
|
||||||
Context *clCxt = Context::getContext();
|
Context *clCxt = Context::getContext();
|
||||||
vector< pair<size_t, const void *> > args;
|
vector< pair<size_t, const void *> > args;
|
||||||
string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" :
|
string kernelName = "compute_hists_lut_kernel";
|
||||||
"compute_hists_kernel";
|
|
||||||
|
|
||||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
|
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
|
||||||
/ block_stride_x;
|
/ block_stride_x;
|
||||||
@ -1728,9 +1643,6 @@ void cv::ocl::device::hog::compute_hists(int nbins,
|
|||||||
int grad_quadstep = grad.step >> 2;
|
int grad_quadstep = grad.step >> 2;
|
||||||
int qangle_step = qangle.step;
|
int qangle_step = qangle.step;
|
||||||
|
|
||||||
// Precompute gaussian spatial window parameter
|
|
||||||
float scale = 1.f / (2.f * sigma * sigma);
|
|
||||||
|
|
||||||
int blocks_in_group = 4;
|
int blocks_in_group = 4;
|
||||||
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
|
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
|
||||||
size_t globalThreads[3] = {
|
size_t globalThreads[3] = {
|
||||||
@ -1751,15 +1663,23 @@ void cv::ocl::device::hog::compute_hists(int nbins,
|
|||||||
args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
|
args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
|
||||||
if (kernelName.compare("compute_hists_lut_kernel") == 0)
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
|
|
||||||
else
|
|
||||||
args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
|
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
|
||||||
args.push_back( make_pair( smem, (void *)NULL));
|
args.push_back( make_pair( smem, (void *)NULL));
|
||||||
|
|
||||||
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
if(hog_device_cpu)
|
||||||
localThreads, args, -1, -1);
|
{
|
||||||
|
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||||
|
localThreads, args, -1, -1, "-D CPU");
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
|
||||||
|
int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
|
||||||
|
char opt[32] = {0};
|
||||||
|
sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
|
||||||
|
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
|
||||||
|
localThreads, args, -1, -1, opt);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cv::ocl::device::hog::normalize_hists(int nbins,
|
void cv::ocl::device::hog::normalize_hists(int nbins,
|
||||||
|
@ -53,7 +53,7 @@
|
|||||||
//----------------------------------------------------------------------------
|
//----------------------------------------------------------------------------
|
||||||
// Histogram computation
|
// Histogram computation
|
||||||
// 12 threads for a cell, 12x4 threads per block
|
// 12 threads for a cell, 12x4 threads per block
|
||||||
// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
|
// Use pre-computed gaussian and interp_weight lookup tables
|
||||||
__kernel void compute_hists_lut_kernel(
|
__kernel void compute_hists_lut_kernel(
|
||||||
const int cblock_stride_x, const int cblock_stride_y,
|
const int cblock_stride_x, const int cblock_stride_y,
|
||||||
const int cnbins, const int cblock_hist_size, const int img_block_width,
|
const int cnbins, const int cblock_hist_size, const int img_block_width,
|
||||||
@ -146,99 +146,6 @@ __kernel void compute_hists_lut_kernel(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//----------------------------------------------------------------------------
|
|
||||||
// Histogram computation
|
|
||||||
// 12 threads for a cell, 12x4 threads per block
|
|
||||||
__kernel void compute_hists_kernel(
|
|
||||||
const int cblock_stride_x, const int cblock_stride_y,
|
|
||||||
const int cnbins, const int cblock_hist_size, const int img_block_width,
|
|
||||||
const int blocks_in_group, const int blocks_total,
|
|
||||||
const int grad_quadstep, const int qangle_step,
|
|
||||||
__global const float* grad, __global const uchar* qangle,
|
|
||||||
const float scale, __global float* block_hists, __local float* smem)
|
|
||||||
{
|
|
||||||
const int lx = get_local_id(0);
|
|
||||||
const int lp = lx / 24; /* local group id */
|
|
||||||
const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
|
|
||||||
const int gidY = gid / img_block_width;
|
|
||||||
const int gidX = gid - gidY * img_block_width;
|
|
||||||
|
|
||||||
const int lidX = lx - lp * 24;
|
|
||||||
const int lidY = get_local_id(1);
|
|
||||||
|
|
||||||
const int cell_x = lidX / 12;
|
|
||||||
const int cell_y = lidY;
|
|
||||||
const int cell_thread_x = lidX - cell_x * 12;
|
|
||||||
|
|
||||||
__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
|
|
||||||
CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
|
|
||||||
__local float* final_hist = hists + cnbins *
|
|
||||||
(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
|
|
||||||
|
|
||||||
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
|
|
||||||
const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
|
|
||||||
|
|
||||||
__global const float* grad_ptr = (gid < blocks_total) ?
|
|
||||||
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
|
|
||||||
__global const uchar* qangle_ptr = (gid < blocks_total) ?
|
|
||||||
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
|
|
||||||
|
|
||||||
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
|
|
||||||
cell_thread_x;
|
|
||||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
|
|
||||||
hist[bin_id * 48] = 0.f;
|
|
||||||
|
|
||||||
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
|
|
||||||
const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
|
|
||||||
|
|
||||||
const int dist_y_begin = -4 - 4 * lidY;
|
|
||||||
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
|
|
||||||
{
|
|
||||||
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
|
|
||||||
uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
|
|
||||||
|
|
||||||
grad_ptr += grad_quadstep;
|
|
||||||
qangle_ptr += qangle_step;
|
|
||||||
|
|
||||||
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
|
|
||||||
|
|
||||||
float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x *
|
|
||||||
dist_center_x) * scale);
|
|
||||||
float interp_weight = (8.f - fabs(dist_y + 0.5f)) *
|
|
||||||
(8.f - fabs(dist_x + 0.5f)) / 64.f;
|
|
||||||
|
|
||||||
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
|
|
||||||
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
volatile __local float* hist_ = hist;
|
|
||||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
|
|
||||||
{
|
|
||||||
if (cell_thread_x < 6)
|
|
||||||
hist_[0] += hist_[6];
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
if (cell_thread_x < 3)
|
|
||||||
hist_[0] += hist_[3];
|
|
||||||
#ifdef CPU
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
#endif
|
|
||||||
if (cell_thread_x == 0)
|
|
||||||
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
|
|
||||||
hist_[0] + hist_[1] + hist_[2];
|
|
||||||
}
|
|
||||||
#ifdef CPU
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
#endif
|
|
||||||
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
|
|
||||||
if ((tid < cblock_hist_size) && (gid < blocks_total))
|
|
||||||
{
|
|
||||||
__global float* block_hist = block_hists +
|
|
||||||
(gidY * img_block_width + gidX) * cblock_hist_size;
|
|
||||||
block_hist[tid] = final_hist[tid];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//-------------------------------------------------------------
|
//-------------------------------------------------------------
|
||||||
// Normalization of histograms via L2Hys_norm
|
// Normalization of histograms via L2Hys_norm
|
||||||
// optimized for the case of 9 bins
|
// optimized for the case of 9 bins
|
||||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user