From 270b2c7918466a9470748cf5002507ca723298fc Mon Sep 17 00:00:00 2001 From: yao Date: Fri, 26 Jul 2013 11:17:27 +0800 Subject: [PATCH] generating the lut table instead of hard coding one --- modules/ocl/src/hog.cpp | 126 ++++-------------------- modules/ocl/src/opencl/objdetect_hog.cl | 95 +----------------- 2 files changed, 18 insertions(+), 203 deletions(-) diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index 2e2b3a992..c7ac4098f 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -56,98 +56,6 @@ using namespace std; static oclMat gauss_w_lut; static bool hog_device_cpu; -/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */ -static const float gaussian_interp_lut[] = -{ - /* gaussian lut */ - 0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f, - 0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f, - 0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f, - 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f, - 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f, - 0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, - 0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, - 0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, - 0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, - 0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, - 0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f, - 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f, - 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f, - 0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, - 0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, - 0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, - 0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, - 0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, - 0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f, - 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f, - 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f, - 0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f, - 0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f, - 0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f, - 0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, - 0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, - 0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f, - 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f, - 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f, - 0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, - 0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, - 0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, - 0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, - 0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, - 0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f, - 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f, - 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f, - 0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, - 0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, - 0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, - 0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, - 0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, - 0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f, - /* interp_weight lut */ - 0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, - 0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, - 0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f, - 0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, - 0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, - 0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f, - 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f, - 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f, - 0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, - 0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, - 0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f, - 0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, - 0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, - 0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f, - 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f, - 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f, - 0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, - 0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, - 0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f, - 0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, - 0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, - 0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f, - 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f, - 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f, - 0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, - 0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, - 0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f, - 0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, - 0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, - 0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f, - 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f, - 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f, - 0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, - 0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, - 0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f, - 0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, - 0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, - 0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f, - 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f, - 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f, - 0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, - 0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, - 0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f -}; namespace cv { @@ -180,7 +88,7 @@ namespace cv int nblocks_win_x, int nblocks_win_y); void compute_hists(int nbins, int block_stride_x, int blovck_stride_y, - int height, int width, float sigma, const cv::ocl::oclMat &grad, + int height, int width, const cv::ocl::oclMat &grad, const cv::ocl::oclMat &qangle, const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists); @@ -328,10 +236,18 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride) Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride); labels.create(1, wins_per_img.area(), CV_8U); - vector v_lut = vector(gaussian_interp_lut, gaussian_interp_lut + - sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0])); - Mat m_lut(v_lut); - gauss_w_lut.upload(m_lut.reshape(1,1)); + float sigma = getWinSigma(); + float scale = 1.f / (2.f * sigma * sigma); + Mat gaussian_lut(1, 512, CV_32FC1); + int idx = 0; + for(int i=-8; i<8; i++) + for(int j=-8; j<8; j++) + gaussian_lut.at(idx++) = std::exp(-(j * j + i * i) * scale); + for(int i=-8; i<8; i++) + for(int j=-8; j<8; j++) + gaussian_lut.at(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f; + + gauss_w_lut.upload(gaussian_lut); } void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle) @@ -358,7 +274,7 @@ void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img) computeGradient(img, this->grad, this->qangle); hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, - effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists); + effect_size.width, grad, qangle, gauss_w_lut, block_hists); hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width, block_hists, (float)threshold_L2hys); @@ -1708,7 +1624,7 @@ void cv::ocl::device::hog::set_up_constants(int nbins, void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float sigma, + int height, int width, const cv::ocl::oclMat &grad, const cv::ocl::oclMat &qangle, const cv::ocl::oclMat &gauss_w_lut, @@ -1716,8 +1632,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, { Context *clCxt = Context::getContext(); vector< pair > args; - string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" : - "compute_hists_kernel"; + string kernelName = "compute_hists_lut_kernel"; int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; @@ -1728,9 +1643,6 @@ void cv::ocl::device::hog::compute_hists(int nbins, int grad_quadstep = grad.step >> 2; int qangle_step = qangle.step; - // Precompute gaussian spatial window parameter - float scale = 1.f / (2.f * sigma * sigma); - int blocks_in_group = 4; size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; size_t globalThreads[3] = { @@ -1751,14 +1663,10 @@ void cv::ocl::device::hog::compute_hists(int nbins, args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step)); args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data)); - if (kernelName.compare("compute_hists_lut_kernel") == 0) - args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data)); - else - args.push_back( make_pair( sizeof(cl_float), (void *)&scale)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( smem, (void *)NULL)); - if(hog_device_cpu) { openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 8ca12704e..036322760 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -53,7 +53,7 @@ //---------------------------------------------------------------------------- // Histogram computation // 12 threads for a cell, 12x4 threads per block -// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f +// Use pre-computed gaussian and interp_weight lookup tables __kernel void compute_hists_lut_kernel( const int cblock_stride_x, const int cblock_stride_y, const int cnbins, const int cblock_hist_size, const int img_block_width, @@ -146,99 +146,6 @@ __kernel void compute_hists_lut_kernel( } } -//---------------------------------------------------------------------------- -// Histogram computation -// 12 threads for a cell, 12x4 threads per block -__kernel void compute_hists_kernel( - const int cblock_stride_x, const int cblock_stride_y, - const int cnbins, const int cblock_hist_size, const int img_block_width, - const int blocks_in_group, const int blocks_total, - const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, - const float scale, __global float* block_hists, __local float* smem) -{ - const int lx = get_local_id(0); - const int lp = lx / 24; /* local group id */ - const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */ - const int gidY = gid / img_block_width; - const int gidX = gid - gidY * img_block_width; - - const int lidX = lx - lp * 24; - const int lidY = get_local_id(1); - - const int cell_x = lidX / 12; - const int cell_y = lidY; - const int cell_thread_x = lidX - cell_x * 12; - - __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * - CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y); - __local float* final_hist = hists + cnbins * - (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12); - - const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; - const int offset_y = gidY * cblock_stride_y + (cell_y << 2); - - __global const float* grad_ptr = (gid < blocks_total) ? - grad + offset_y * grad_quadstep + (offset_x << 1) : grad; - __global const uchar* qangle_ptr = (gid < blocks_total) ? - qangle + offset_y * qangle_step + (offset_x << 1) : qangle; - - __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + - cell_thread_x; - for (int bin_id = 0; bin_id < cnbins; ++bin_id) - hist[bin_id * 48] = 0.f; - - const int dist_x = -4 + cell_thread_x - 4 * cell_x; - const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); - - const int dist_y_begin = -4 - 4 * lidY; - for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) - { - float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); - - grad_ptr += grad_quadstep; - qangle_ptr += qangle_step; - - int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - - float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * - dist_center_x) * scale); - float interp_weight = (8.f - fabs(dist_y + 0.5f)) * - (8.f - fabs(dist_x + 0.5f)) / 64.f; - - hist[bin.x * 48] += gaussian * interp_weight * vote.x; - hist[bin.y * 48] += gaussian * interp_weight * vote.y; - } - barrier(CLK_LOCAL_MEM_FENCE); - - volatile __local float* hist_ = hist; - for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) - { - if (cell_thread_x < 6) - hist_[0] += hist_[6]; - barrier(CLK_LOCAL_MEM_FENCE); - if (cell_thread_x < 3) - hist_[0] += hist_[3]; -#ifdef CPU - barrier(CLK_LOCAL_MEM_FENCE); -#endif - if (cell_thread_x == 0) - final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = - hist_[0] + hist_[1] + hist_[2]; - } -#ifdef CPU - barrier(CLK_LOCAL_MEM_FENCE); -#endif - int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; - if ((tid < cblock_hist_size) && (gid < blocks_total)) - { - __global float* block_hist = block_hists + - (gidY * img_block_width + gidX) * cblock_hist_size; - block_hist[tid] = final_hist[tid]; - } -} - //------------------------------------------------------------- // Normalization of histograms via L2Hys_norm // optimized for the case of 9 bins