diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index a3514586f..3533cce69 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -15,7 +15,7 @@ // Third party copyrights are property of their respective owners. // // @Authors -// Wenju He, wenju@multicorewareinc.com +// Wenju He, wenju@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -48,13 +48,107 @@ using namespace cv; using namespace cv::ocl; using namespace std; - #define CELL_WIDTH 8 #define CELL_HEIGHT 8 #define CELLS_PER_BLOCK_X 2 #define CELLS_PER_BLOCK_Y 2 #define NTHREADS 256 +static oclMat gauss_w_lut; +static bool hog_device_cpu; +/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */ +static const float gaussian_interp_lut[] = +{ + /* gaussian lut */ + 0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f, + 0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f, + 0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f, + 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f, + 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f, + 0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, + 0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, + 0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, + 0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, + 0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, + 0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f, + 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f, + 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f, + 0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, + 0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, + 0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, + 0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, + 0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, + 0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f, + 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f, + 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f, + 0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f, + 0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f, + 0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f, + 0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, + 0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, + 0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f, + 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f, + 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f, + 0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, + 0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, + 0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, + 0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, + 0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, + 0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f, + 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f, + 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f, + 0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, + 0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, + 0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, + 0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, + 0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, + 0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f, + /* interp_weight lut */ + 0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, + 0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, + 0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f, + 0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, + 0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, + 0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f, + 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f, + 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f, + 0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, + 0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, + 0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f, + 0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, + 0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, + 0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f, + 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f, + 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f, + 0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, + 0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, + 0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f, + 0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, + 0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, + 0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f, + 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f, + 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f, + 0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, + 0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, + 0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f, + 0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, + 0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, + 0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f, + 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f, + 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f, + 0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, + 0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, + 0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f, + 0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, + 0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, + 0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f, + 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f, + 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f, + 0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, + 0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, + 0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f +}; + namespace cv { namespace ocl @@ -78,38 +172,43 @@ namespace cv int cnblocks_win_x; int cnblocks_win_y; int cblock_hist_size; - int cblock_hist_size_2up; int cdescr_size; int cdescr_width; + int cdescr_height; void set_up_constants(int nbins, int block_stride_x, int block_stride_y, int nblocks_win_x, int nblocks_win_y); void compute_hists(int nbins, int block_stride_x, int blovck_stride_y, - int height, int width, const cv::ocl::oclMat &grad, - const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists); + int height, int width, float sigma, const cv::ocl::oclMat &grad, + const cv::ocl::oclMat &qangle, + const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists); void normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, cv::ocl::oclMat &block_hists, float threshold); + int height, int width, cv::ocl::oclMat &block_hists, + float threshold); void classify_hists(int win_height, int win_width, int block_stride_y, - int block_stride_x, int win_stride_y, int win_stride_x, int height, - int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef, + int block_stride_x, int win_stride_y, int win_stride_x, + int height, int width, const cv::ocl::oclMat &block_hists, + const cv::ocl::oclMat &coefs, float free_coef, float threshold, cv::ocl::oclMat &labels); - void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists, + void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, + int block_stride_x, int win_stride_y, int win_stride_x, + int height, int width, const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors); - void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists, + void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, + int block_stride_x, int win_stride_y, int win_stride_x, + int height, int width, const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors); void compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img, - float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma); + float angle_scale, cv::ocl::oclMat &grad, + cv::ocl::oclMat &qangle, bool correct_gamma); void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img, - float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma); - - void resize( const oclMat &src, oclMat &dst, const Size sz); + float angle_scale, cv::ocl::oclMat &grad, + cv::ocl::oclMat &qangle, bool correct_gamma); } } } @@ -117,8 +216,14 @@ namespace cv using namespace ::cv::ocl::device; -cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_, - int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_) +static inline int divUp(int total, int grain) +{ + return (total + grain - 1) / grain; +} + +cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, + Size cell_size_, int nbins_, double win_sigma_, + double threshold_L2hys_, bool gamma_correction_, int nlevels_) : win_size(win_size_), block_size(block_size_), block_stride(block_stride_), @@ -132,19 +237,27 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo CV_Assert((win_size.width - block_size.width ) % block_stride.width == 0 && (win_size.height - block_size.height) % block_stride.height == 0); - CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0); + CV_Assert(block_size.width % cell_size.width == 0 && + block_size.height % cell_size.height == 0); CV_Assert(block_stride == cell_size); CV_Assert(cell_size == Size(8, 8)); - Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height); + Size cells_per_block(block_size.width / cell_size.width, + block_size.height / cell_size.height); CV_Assert(cells_per_block == Size(2, 2)); cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride); - hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height); + hog::set_up_constants(nbins, block_stride.width, block_stride.height, + blocks_per_win.width, blocks_per_win.height); effect_size = Size(0, 0); + + if (queryDeviceInfo()) + hog_device_cpu = true; + else + hog_device_cpu = false; } size_t cv::ocl::HOGDescriptor::getDescriptorSize() const @@ -154,7 +267,8 @@ size_t cv::ocl::HOGDescriptor::getDescriptorSize() const size_t cv::ocl::HOGDescriptor::getBlockHistogramSize() const { - Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height); + Size cells_per_block = Size(block_size.width / cell_size.width, + block_size.height / cell_size.height); return (size_t)(nbins * cells_per_block.area()); } @@ -167,7 +281,8 @@ bool cv::ocl::HOGDescriptor::checkDetectorSize() const { size_t detector_size = detector.rows * detector.cols; size_t descriptor_size = getDescriptorSize(); - return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1; + return detector_size == 0 || detector_size == descriptor_size || + detector_size == descriptor_size + 1; } void cv::ocl::HOGDescriptor::setSVMDetector(const vector &_detector) @@ -207,10 +322,16 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride) const size_t block_hist_size = getBlockHistogramSize(); const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride); - block_hists.create(1, static_cast(block_hist_size * blocks_per_img.area()), CV_32F); + block_hists.create(1, + static_cast(block_hist_size * blocks_per_img.area()) + 256, CV_32F); Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride); labels.create(1, wins_per_img.area(), CV_8U); + + vector v_lut = vector(gaussian_interp_lut, gaussian_interp_lut + + sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0])); + Mat m_lut(v_lut); + gauss_w_lut.upload(m_lut.reshape(1,1)); } void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle) @@ -221,29 +342,34 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc switch (img.type()) { case CV_8UC1: - hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction); + hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, + angleScale, grad, qangle, gamma_correction); break; case CV_8UC4: - hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction); + hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, + angleScale, grad, qangle, gamma_correction); break; } } + void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img) { - computeGradient(img, grad, qangle); + computeGradient(img, this->grad, this->qangle); - hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width, - grad, qangle, (float)getWinSigma(), block_hists); + hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, + effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists); - hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width, - block_hists, (float)threshold_L2hys); + hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, + effect_size.width, block_hists, (float)threshold_L2hys); } -void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, oclMat &descriptors, int descr_format) +void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, + oclMat &descriptors, int descr_format) { - CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0); + CV_Assert(win_stride.width % block_stride.width == 0 && + win_stride.height % block_stride.height == 0); init_buffer(img, win_stride); @@ -253,17 +379,20 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride); Size wins_per_img = numPartsWithin(effect_size, win_size, win_stride); - descriptors.create(wins_per_img.area(), static_cast(blocks_per_win.area() * block_hist_size), CV_32F); + descriptors.create(wins_per_img.area(), + static_cast(blocks_per_win.area() * block_hist_size), CV_32F); switch (descr_format) { case DESCR_FORMAT_ROW_BY_ROW: - hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width, - win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors); + hog::extract_descrs_by_rows(win_size.height, win_size.width, + block_stride.height, block_stride.width, win_stride.height, win_stride.width, + effect_size.height, effect_size.width, block_hists, descriptors); break; case DESCR_FORMAT_COL_BY_COL: - hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width, - win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors); + hog::extract_descrs_by_cols(win_size.height, win_size.width, + block_stride.height, block_stride.width, win_stride.height, win_stride.width, + effect_size.height, effect_size.width, block_hists, descriptors); break; default: CV_Error(CV_StsBadArg, "Unknown descriptor format"); @@ -271,7 +400,8 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, } -void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector &hits, double hit_threshold, Size win_stride, Size padding) +void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector &hits, + double hit_threshold, Size win_stride, Size padding) { CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4); CV_Assert(padding == Size(0, 0)); @@ -283,14 +413,16 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector &hits, doub if (win_stride == Size()) win_stride = block_stride; else - CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0); + CV_Assert(win_stride.width % block_stride.width == 0 && + win_stride.height % block_stride.height == 0); init_buffer(img, win_stride); computeBlockHistograms(img); - hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width, - win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, - detector, (float)free_coef, (float)hit_threshold, labels); + hog::classify_hists(win_size.height, win_size.width, block_stride.height, + block_stride.width, win_stride.height, win_stride.width, + effect_size.height, effect_size.width, block_hists, detector, + (float)free_coef, (float)hit_threshold, labels); labels.download(labels_host); unsigned char *vec = labels_host.ptr(); @@ -306,8 +438,9 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, vector &hits, doub -void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector &found_locations, double hit_threshold, - Size win_stride, Size padding, double scale0, int group_threshold) +void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector &found_locations, + double hit_threshold, Size win_stride, Size padding, + double scale0, int group_threshold) { CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4); CV_Assert(scale0 > 1); @@ -333,7 +466,8 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector &f if (win_stride == Size()) win_stride = block_stride; else - CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0); + CV_Assert(win_stride.width % block_stride.width == 0 && + win_stride.height % block_stride.height == 0); init_buffer(img, win_stride); image_scale.create(img.size(), img.type()); @@ -347,16 +481,18 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, vector &f } else { - hog::resize( img, image_scale, effect_size); + resize(img, image_scale, effect_size); detect(image_scale, locations, hit_threshold, win_stride, padding); } - Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale)); + Size scaled_win_size(cvRound(win_size.width * scale), + cvRound(win_size.height * scale)); for (size_t j = 0; j < locations.size(); j++) - all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, scaled_win_size)); + all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, + scaled_win_size)); } found_locations.assign(all_candidates.begin(), all_candidates.end()); - groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/); + groupRectangles(found_locations, group_threshold, 0.2); } int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride) @@ -364,9 +500,11 @@ int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride) return (size - part_size + stride) / stride; } -cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride) +cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, + cv::Size stride) { - return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height)); + return Size(numPartsWithin(size.width, part_size.width, stride.width), + numPartsWithin(size.height, part_size.height, stride.height)); } std::vector cv::ocl::HOGDescriptor::getDefaultPeopleDetector() @@ -1547,8 +1685,9 @@ static int power_2up(unsigned int n) return -1; // Input is too big } -void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int block_stride_y, - int nblocks_win_x, int nblocks_win_y) +void cv::ocl::device::hog::set_up_constants(int nbins, + int block_stride_x, int block_stride_y, + int nblocks_win_x, int nblocks_win_y) { cnbins = nbins; cblock_stride_x = block_stride_x; @@ -1559,53 +1698,32 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; cblock_hist_size = block_hist_size; - int block_hist_size_2up = power_2up(block_hist_size); - cblock_hist_size_2up = block_hist_size_2up; - int descr_width = nblocks_win_x * block_hist_size; cdescr_width = descr_width; + cdescr_height = nblocks_win_y; int descr_size = descr_width * nblocks_win_y; cdescr_size = descr_size; } -static inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} - -static void openCLExecuteKernel_hog(Context *clCxt , const char **source, string kernelName, - size_t globalThreads[3], size_t localThreads[3], - vector< pair > &args) -{ - cl_kernel kernel = openCLGetKernelFromSource(clCxt, source, kernelName); - size_t wave_size = queryDeviceInfo(kernel); - openCLSafeCall(clReleaseKernel(kernel)); - if (wave_size <= 16) - { - char build_options[64]; - sprintf(build_options, (wave_size == 16) ? "-D WAVE_SIZE_16" : "-D WAVE_SIZE_1"); - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, build_options); - } - else - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1); -} - -void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, const cv::ocl::oclMat &grad, - const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists) +void cv::ocl::device::hog::compute_hists(int nbins, + int block_stride_x, int block_stride_y, + int height, int width, float sigma, + const cv::ocl::oclMat &grad, + const cv::ocl::oclMat &qangle, + const cv::ocl::oclMat &gauss_w_lut, + cv::ocl::oclMat &block_hists) { Context *clCxt = Context::getContext(); - string kernelName = "compute_hists_kernel"; vector< pair > args; + string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" : + "compute_hists_kernel"; - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; - int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y; - + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) + / block_stride_x; + int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) + / block_stride_y; int blocks_total = img_block_width * img_block_height; - int blocks_in_group = 4; - size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; - size_t globalThreads[3] = { divUp(blocks_total, blocks_in_group) * localThreads[0], 2, 1 }; int grad_quadstep = grad.step >> 2; int qangle_step = qangle.step; @@ -1613,6 +1731,11 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc // Precompute gaussian spatial window parameter float scale = 1.f / (2.f * sigma * sigma); + int blocks_in_group = 4; + size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; + size_t globalThreads[3] = { + divUp(img_block_width * img_block_height, blocks_in_group) * localThreads[0], 2, 1 }; + int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float); int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float); int smem = (hists_size + final_hists_size) * blocks_in_group; @@ -1628,62 +1751,120 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step)); args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data)); - args.push_back( make_pair( sizeof(cl_float), (void *)&scale)); + if (kernelName.compare("compute_hists_lut_kernel") == 0) + args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data)); + else + args.push_back( make_pair( sizeof(cl_float), (void *)&scale)); args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( smem, (void *)NULL)); - openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1); } -void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, cv::ocl::oclMat &block_hists, float threshold) +void cv::ocl::device::hog::normalize_hists(int nbins, + int block_stride_x, int block_stride_y, + int height, int width, + cv::ocl::oclMat &block_hists, + float threshold) { Context *clCxt = Context::getContext(); - string kernelName = "normalize_hists_kernel"; vector< pair > args; + string kernelName; int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; - int nthreads = power_2up(block_hist_size); + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) + / block_stride_x; + int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) + / block_stride_y; + int nthreads; + size_t globalThreads[3] = { 1, 1, 1 }; + size_t localThreads[3] = { 1, 1, 1 }; + + if ( nbins == 9 ) + { + /* optimized for the case of 9 bins */ + kernelName = "normalize_hists_36_kernel"; + int blocks_in_group = NTHREADS / block_hist_size; + nthreads = blocks_in_group * block_hist_size; + int num_groups = divUp( img_block_width * img_block_height, blocks_in_group); + globalThreads[0] = nthreads * num_groups; + localThreads[0] = nthreads; + } + else + { + kernelName = "normalize_hists_kernel"; + nthreads = power_2up(block_hist_size); + globalThreads[0] = img_block_width * nthreads; + globalThreads[1] = img_block_height; + localThreads[0] = nthreads; - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; - int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y; - size_t globalThreads[3] = { img_block_width * nthreads, img_block_height, 1 }; - size_t localThreads[3] = { nthreads, 1, 1 }; + if ((nthreads < 32) || (nthreads > 512) ) + cv::ocl::error("normalize_hists: histogram's size is too small or too big", + __FILE__, __LINE__, "normalize_hists"); - if ((nthreads < 32) || (nthreads > 512) ) - cv::ocl::error("normalize_hists: histogram's size is too small or too big", __FILE__, __LINE__, "normalize_hists"); + args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads)); + args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size)); + args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width)); + } - args.push_back( make_pair( sizeof(cl_int), (void *)&nthreads)); - args.push_back( make_pair( sizeof(cl_int), (void *)&block_hist_size)); - args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width)); args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( sizeof(cl_float), (void *)&threshold)); args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL)); - openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); + if(hog_device_cpu) + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1, "-D CPU"); + else + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1); } -void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y, - int block_stride_x, int win_stride_y, int win_stride_x, int height, - int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef, - float threshold, cv::ocl::oclMat &labels) +void cv::ocl::device::hog::classify_hists(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + const cv::ocl::oclMat &block_hists, + const cv::ocl::oclMat &coefs, + float free_coef, float threshold, + cv::ocl::oclMat &labels) { Context *clCxt = Context::getContext(); - string kernelName = "classify_hists_kernel"; vector< pair > args; + int nthreads; + string kernelName; + switch (cdescr_width) + { + case 180: + nthreads = 180; + kernelName = "classify_hists_180_kernel"; + args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width)); + args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height)); + break; + case 252: + nthreads = 256; + kernelName = "classify_hists_252_kernel"; + args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width)); + args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_height)); + break; + default: + nthreads = 256; + kernelName = "classify_hists_kernel"; + args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size)); + args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width)); + } + int win_block_stride_x = win_stride_x / block_stride_x; int win_block_stride_y = win_stride_y / block_stride_y; int img_win_width = (width - win_width + win_stride_x) / win_stride_x; int img_win_height = (height - win_height + win_stride_y) / win_stride_y; - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; - - size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 }; - size_t localThreads[3] = { NTHREADS, 1, 1 }; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + block_stride_x; + size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 }; + size_t localThreads[3] = { nthreads, 1, 1 }; args.push_back( make_pair( sizeof(cl_int), (void *)&cblock_hist_size)); - args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_size)); - args.push_back( make_pair( sizeof(cl_int), (void *)&cdescr_width)); args.push_back( make_pair( sizeof(cl_int), (void *)&img_win_width)); args.push_back( make_pair( sizeof(cl_int), (void *)&img_block_width)); args.push_back( make_pair( sizeof(cl_int), (void *)&win_block_stride_x)); @@ -1694,12 +1875,20 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo args.push_back( make_pair( sizeof(cl_float), (void *)&threshold)); args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data)); - openCLExecuteKernel_hog(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args); + if(hog_device_cpu) + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1, "-D CPU"); + else + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1); } -void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, - const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors) +void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + const cv::ocl::oclMat &block_hists, + cv::ocl::oclMat &descriptors) { Context *clCxt = Context::getContext(); string kernelName = "extract_descrs_by_rows_kernel"; @@ -1709,7 +1898,8 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int win_block_stride_y = win_stride_y / block_stride_y; int img_win_width = (width - win_width + win_stride_x) / win_stride_x; int img_win_height = (height - win_height + win_stride_y) / win_stride_y; - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + block_stride_x; int descriptors_quadstep = descriptors.step >> 2; size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 }; @@ -1725,12 +1915,16 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1); } -void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, - int win_stride_y, int win_stride_x, int height, int width, - const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors) +void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, + int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, + int height, int width, + const cv::ocl::oclMat &block_hists, + cv::ocl::oclMat &descriptors) { Context *clCxt = Context::getContext(); string kernelName = "extract_descrs_by_cols_kernel"; @@ -1740,7 +1934,8 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int win_block_stride_y = win_stride_y / block_stride_y; int img_win_width = (width - win_width + win_stride_x) / win_stride_x; int img_win_height = (height - win_height + win_stride_y) / win_stride_y; - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + block_stride_x; int descriptors_quadstep = descriptors.step >> 2; size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 }; @@ -1757,11 +1952,16 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data)); - openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1); } -void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img, - float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma) +void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, + const cv::ocl::oclMat &img, + float angle_scale, + cv::ocl::oclMat &grad, + cv::ocl::oclMat &qangle, + bool correct_gamma) { Context *clCxt = Context::getContext(); string kernelName = "compute_gradients_8UC1_kernel"; @@ -1786,11 +1986,16 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma)); args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins)); - openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1); } -void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img, - float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma) +void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, + const cv::ocl::oclMat &img, + float angle_scale, + cv::ocl::oclMat &grad, + cv::ocl::oclMat &qangle, + bool correct_gamma) { Context *clCxt = Context::getContext(); string kernelName = "compute_gradients_8UC4_kernel"; @@ -1816,39 +2021,6 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma)); args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins)); - openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); -} - -void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz) -{ - CV_Assert( (src.channels() == dst.channels()) ); - Context *clCxt = Context::getContext(); - - string kernelName = (src.type() == CV_8UC1) ? "resize_8UC1_kernel" : "resize_8UC4_kernel"; - size_t blkSizeX = 16, blkSizeY = 16; - size_t glbSizeX = sz.width % blkSizeX == 0 ? sz.width : (sz.width / blkSizeX + 1) * blkSizeX; - size_t glbSizeY = sz.height % blkSizeY == 0 ? sz.height : (sz.height / blkSizeY + 1) * blkSizeY; - size_t globalThreads[3] = {glbSizeX, glbSizeY, 1}; - size_t localThreads[3] = {blkSizeX, blkSizeY, 1}; - - float ifx = (float)src.cols / sz.width; - float ify = (float)src.rows / sz.height; - int src_step = static_cast(src.step); - int dst_step = static_cast(dst.step); - - vector< pair > args; - args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src_step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair(sizeof(cl_int), (void *)&sz.width)); - args.push_back( make_pair(sizeof(cl_int), (void *)&sz.height)); - args.push_back( make_pair(sizeof(cl_float), (void *)&ifx)); - args.push_back( make_pair(sizeof(cl_float), (void *)&ify)); - - openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1); -} + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1); +} \ No newline at end of file diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 8852facae..05d538330 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -43,7 +43,6 @@ // //M*/ - #define CELL_WIDTH 8 #define CELL_HEIGHT 8 #define CELLS_PER_BLOCK_X 2 @@ -51,6 +50,100 @@ #define NTHREADS 256 #define CV_PI_F 3.1415926535897932384626433832795f +//---------------------------------------------------------------------------- +// Histogram computation +// 12 threads for a cell, 12x4 threads per block +// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f +__kernel void compute_hists_lut_kernel( + const int cblock_stride_x, const int cblock_stride_y, + const int cnbins, const int cblock_hist_size, const int img_block_width, + const int blocks_in_group, const int blocks_total, + const int grad_quadstep, const int qangle_step, + __global const float* grad, __global const uchar* qangle, + __global const float* gauss_w_lut, + __global float* block_hists, __local float* smem) +{ + const int lx = get_local_id(0); + const int lp = lx / 24; /* local group id */ + const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */ + const int gidY = gid / img_block_width; + const int gidX = gid - gidY * img_block_width; + + const int lidX = lx - lp * 24; + const int lidY = get_local_id(1); + + const int cell_x = lidX / 12; + const int cell_y = lidY; + const int cell_thread_x = lidX - cell_x * 12; + + __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * + CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y); + __local float* final_hist = hists + cnbins * + (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12); + + const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; + const int offset_y = gidY * cblock_stride_y + (cell_y << 2); + + __global const float* grad_ptr = (gid < blocks_total) ? + grad + offset_y * grad_quadstep + (offset_x << 1) : grad; + __global const uchar* qangle_ptr = (gid < blocks_total) ? + qangle + offset_y * qangle_step + (offset_x << 1) : qangle; + + __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + + cell_thread_x; + for (int bin_id = 0; bin_id < cnbins; ++bin_id) + hist[bin_id * 48] = 0.f; + + const int dist_x = -4 + cell_thread_x - 4 * cell_x; + const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); + + const int dist_y_begin = -4 - 4 * lidY; + for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) + { + float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); + uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); + + grad_ptr += grad_quadstep; + qangle_ptr += qangle_step; + + int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); + + int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8); + float gaussian = gauss_w_lut[idx]; + idx = (dist_y + 8) * 16 + (dist_x + 8); + float interp_weight = gauss_w_lut[256+idx]; + + hist[bin.x * 48] += gaussian * interp_weight * vote.x; + hist[bin.y * 48] += gaussian * interp_weight * vote.y; + } + barrier(CLK_LOCAL_MEM_FENCE); + + volatile __local float* hist_ = hist; + for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) + { + if (cell_thread_x < 6) + hist_[0] += hist_[6]; + barrier(CLK_LOCAL_MEM_FENCE); + if (cell_thread_x < 3) + hist_[0] += hist_[3]; +#ifdef CPU + barrier(CLK_LOCAL_MEM_FENCE); +#endif + if (cell_thread_x == 0) + final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = + hist_[0] + hist_[1] + hist_[2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; + if ((tid < cblock_hist_size) && (gid < blocks_total)) + { + __global float* block_hist = block_hists + + (gidY * img_block_width + gidX) * cblock_hist_size; + block_hist[tid] = final_hist[tid]; + } +} + //---------------------------------------------------------------------------- // Histogram computation // 12 threads for a cell, 12x4 threads per block @@ -125,16 +218,14 @@ __kernel void compute_hists_kernel( barrier(CLK_LOCAL_MEM_FENCE); if (cell_thread_x < 3) hist_[0] += hist_[3]; -#ifdef WAVE_SIZE_1 +#ifdef CPU barrier(CLK_LOCAL_MEM_FENCE); #endif if (cell_thread_x == 0) final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2]; } -#ifdef WAVE_SIZE_1 barrier(CLK_LOCAL_MEM_FENCE); -#endif int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; if ((tid < cblock_hist_size) && (gid < blocks_total)) @@ -145,6 +236,57 @@ __kernel void compute_hists_kernel( } } +//------------------------------------------------------------- +// Normalization of histograms via L2Hys_norm +// optimized for the case of 9 bins +__kernel void normalize_hists_36_kernel(__global float* block_hists, + const float threshold, __local float *squares) +{ + const int tid = get_local_id(0); + const int gid = get_global_id(0); + const int bid = tid / 36; /* block-hist id, (0 - 6) */ + const int boffset = bid * 36; /* block-hist offset in the work-group */ + const int hid = tid - boffset; /* histogram bin id, (0 - 35) */ + + float elem = block_hists[gid]; + squares[tid] = elem * elem; + barrier(CLK_LOCAL_MEM_FENCE); + + __local float* smem = squares + boffset; + float sum = smem[hid]; + if (hid < 18) + smem[hid] = sum = sum + smem[hid + 18]; + barrier(CLK_LOCAL_MEM_FENCE); + if (hid < 9) + smem[hid] = sum = sum + smem[hid + 9]; + barrier(CLK_LOCAL_MEM_FENCE); + if (hid < 4) + smem[hid] = sum + smem[hid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8]; + + elem = elem / (sqrt(sum) + 3.6f); + elem = min(elem, threshold); + + barrier(CLK_LOCAL_MEM_FENCE); + squares[tid] = elem * elem; + barrier(CLK_LOCAL_MEM_FENCE); + + sum = smem[hid]; + if (hid < 18) + smem[hid] = sum = sum + smem[hid + 18]; + barrier(CLK_LOCAL_MEM_FENCE); + if (hid < 9) + smem[hid] = sum = sum + smem[hid + 9]; + barrier(CLK_LOCAL_MEM_FENCE); + if (hid < 4) + smem[hid] = sum + smem[hid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8]; + + block_hists[gid] = elem / (sqrt(sum) + 1e-3f); +} + //------------------------------------------------------------- // Normalization of histograms via L2Hys_norm // @@ -153,76 +295,50 @@ float reduce_smem(volatile __local float* smem, int size) unsigned int tid = get_local_id(0); float sum = smem[tid]; - if (size >= 512) - { - if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if (size >= 256) - { - if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if (size >= 128) - { - if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; - barrier(CLK_LOCAL_MEM_FENCE); - } - + if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); } +#ifdef CPU + if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; + barrier(CLK_LOCAL_MEM_FENCE); } +#else if (tid < 32) { if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; -#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1) - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) - { -#endif if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; -#ifdef WAVE_SIZE_1 - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) - { -#endif if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; -#ifdef WAVE_SIZE_1 - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 4) - { -#endif if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; -#ifdef WAVE_SIZE_1 - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 2) - { -#endif if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; -#ifdef WAVE_SIZE_1 - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 1) - { -#endif if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; } - - barrier(CLK_LOCAL_MEM_FENCE); - sum = smem[0]; +#endif return sum; } -__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width, - __global float* block_hists, const float threshold, __local float *squares) +__kernel void normalize_hists_kernel( + const int nthreads, const int block_hist_size, const int img_block_width, + __global float* block_hists, const float threshold, __local float *squares) { const int tid = get_local_id(0); const int gidX = get_group_id(0); const int gidY = get_group_id(1); - __global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid; + __global float* hist = block_hists + (gidY * img_block_width + gidX) * + block_hist_size + tid; float elem = 0.f; if (tid < block_hist_size) @@ -249,25 +365,98 @@ __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_si //--------------------------------------------------------------------- // Linear SVM based classification -// -__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width, - const int img_win_width, const int img_block_width, - const int win_block_stride_x, const int win_block_stride_y, - __global const float * block_hists, __global const float* coefs, - float free_coef, float threshold, __global uchar* labels) +// 48x96 window, 9 bins and default parameters +// 180 threads, each thread corresponds to a bin in a row +__kernel void classify_hists_180_kernel( + const int cdescr_width, const int cdescr_height, const int cblock_hist_size, + const int img_win_width, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float * block_hists, __global const float* coefs, + float free_coef, float threshold, __global uchar* labels) { const int tid = get_local_id(0); const int gidX = get_group_id(0); const int gidY = get_group_id(1); - __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; float product = 0.f; - for (int i = tid; i < cdescr_size; i += NTHREADS) + + for (int i = 0; i < cdescr_height; i++) { - int offset_y = i / cdescr_width; - int offset_x = i - offset_y * cdescr_width; - product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x]; + product += coefs[i * cdescr_width + tid] * + hist[i * img_block_width * cblock_hist_size + tid]; + } + + __local float products[180]; + + products[tid] = product; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 90) products[tid] = product = product + products[tid + 90]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 45) products[tid] = product = product + products[tid + 45]; + barrier(CLK_LOCAL_MEM_FENCE); + + volatile __local float* smem = products; +#ifdef CPU + if (tid < 13) smem[tid] = product = product + smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) smem[tid] = product = product + smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<8) smem[tid] = product = product + smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<4) smem[tid] = product = product + smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<2) smem[tid] = product = product + smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); +#else + if (tid < 13) + { + smem[tid] = product = product + smem[tid + 32]; + } + if (tid < 16) + { + smem[tid] = product = product + smem[tid + 16]; + smem[tid] = product = product + smem[tid + 8]; + smem[tid] = product = product + smem[tid + 4]; + smem[tid] = product = product + smem[tid + 2]; + } +#endif + + if (tid == 0){ + product = product + smem[tid + 1]; + labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); + } +} + +//--------------------------------------------------------------------- +// Linear SVM based classification +// 64x128 window, 9 bins and default parameters +// 256 threads, 252 of them are used +__kernel void classify_hists_252_kernel( + const int cdescr_width, const int cdescr_height, const int cblock_hist_size, + const int img_win_width, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float * block_hists, __global const float* coefs, + float free_coef, float threshold, __global uchar* labels) +{ + const int tid = get_local_id(0); + const int gidX = get_group_id(0); + const int gidY = get_group_id(1); + + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + + float product = 0.f; + if (tid < cdescr_width) + { + for (int i = 0; i < cdescr_height; i++) + product += coefs[i * cdescr_width + tid] * + hist[i * img_block_width * cblock_hist_size + tid]; } __local float products[NTHREADS]; @@ -282,67 +471,120 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr if (tid < 64) products[tid] = product = product + products[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); - volatile __local float* smem = products; + volatile __local float* smem = products; +#ifdef CPU + if(tid<32) smem[tid] = product = product + smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<16) smem[tid] = product = product + smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<8) smem[tid] = product = product + smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<4) smem[tid] = product = product + smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<2) smem[tid] = product = product + smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); +#else if (tid < 32) - { + { smem[tid] = product = product + smem[tid + 32]; -#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1) - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) - { -#endif smem[tid] = product = product + smem[tid + 16]; -#ifdef WAVE_SIZE_1 - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) - { -#endif smem[tid] = product = product + smem[tid + 8]; -#ifdef WAVE_SIZE_1 - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 4) - { -#endif smem[tid] = product = product + smem[tid + 4]; -#ifdef WAVE_SIZE_1 - } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 2) - { -#endif smem[tid] = product = product + smem[tid + 2]; -#ifdef WAVE_SIZE_1 } - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 1) - { #endif - smem[tid] = product = product + smem[tid + 1]; + if (tid == 0){ + product = product + smem[tid + 1]; + labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); + } +} + +//--------------------------------------------------------------------- +// Linear SVM based classification +// 256 threads +__kernel void classify_hists_kernel( + const int cdescr_size, const int cdescr_width, const int cblock_hist_size, + const int img_win_width, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float * block_hists, __global const float* coefs, + float free_coef, float threshold, __global uchar* labels) +{ + const int tid = get_local_id(0); + const int gidX = get_group_id(0); + const int gidY = get_group_id(1); + + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + + float product = 0.f; + for (int i = tid; i < cdescr_size; i += NTHREADS) + { + int offset_y = i / cdescr_width; + int offset_x = i - offset_y * cdescr_width; + product += coefs[i] * + hist[offset_y * img_block_width * cblock_hist_size + offset_x]; } - if (tid == 0) + __local float products[NTHREADS]; + + products[tid] = product; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) products[tid] = product = product + products[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) products[tid] = product = product + products[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + + volatile __local float* smem = products; +#ifdef CPU + if(tid<32) smem[tid] = product = product + smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<16) smem[tid] = product = product + smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<8) smem[tid] = product = product + smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<4) smem[tid] = product = product + smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<2) smem[tid] = product = product + smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); +#else + if (tid < 32) + { + smem[tid] = product = product + smem[tid + 32]; + smem[tid] = product = product + smem[tid + 16]; + smem[tid] = product = product + smem[tid + 8]; + smem[tid] = product = product + smem[tid + 4]; + smem[tid] = product = product + smem[tid + 2]; + } +#endif + if (tid == 0){ + smem[tid] = product = product + smem[tid + 1]; labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); + } } //---------------------------------------------------------------------------- // Extract descriptors -__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width, - const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, - __global const float* block_hists, __global float* descriptors) +__kernel void extract_descrs_by_rows_kernel( + const int cblock_hist_size, const int descriptors_quadstep, + const int cdescr_size, const int cdescr_width, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float* block_hists, __global float* descriptors) { int tid = get_local_id(0); int gidX = get_group_id(0); int gidY = get_group_id(1); // Get left top corner of the window in src - __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; // Get left top corner of the window in dst - __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; + __global float* descriptor = descriptors + + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; // Copy elements from src to dst for (int i = tid; i < cdescr_size; i += NTHREADS) @@ -353,19 +595,23 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in } } -__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, - const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x, - const int win_block_stride_y, __global const float* block_hists, __global float* descriptors) +__kernel void extract_descrs_by_cols_kernel( + const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, + const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float* block_hists, __global float* descriptors) { int tid = get_local_id(0); int gidX = get_group_id(0); int gidY = get_group_id(1); // Get left top corner of the window in src - __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; // Get left top corner of the window in dst - __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; + __global float* descriptor = descriptors + + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; // Copy elements from src to dst for (int i = tid; i < cdescr_size; i += NTHREADS) @@ -376,16 +622,19 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in int y = block_idx / cnblocks_win_x; int x = block_idx - y * cnblocks_win_x; - descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block]; + descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = + hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block]; } } //---------------------------------------------------------------------------- // Gradients computation -__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - const __global uchar4 * img, __global float * grad, __global uchar * qangle, - const float angle_scale, const char correct_gamma, const int cnbins) +__kernel void compute_gradients_8UC4_kernel( + const int height, const int width, + const int img_step, const int grad_quadstep, const int qangle_step, + const __global uchar4 * img, __global float * grad, __global uchar * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); const int tid = get_local_id(0); @@ -426,8 +675,10 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c barrier(CLK_LOCAL_MEM_FENCE); if (x < width) { - float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]); - float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]); + float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], + sh_row[tid + 2 * (NTHREADS + 2)]); + float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], + sh_row[tid + 2 + 2 * (NTHREADS + 2)]); float3 dx; if (correct_gamma == 1) @@ -482,9 +733,11 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c } } -__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - __global const uchar * img, __global float * grad, __global uchar * qangle, - const float angle_scale, const char correct_gamma, const int cnbins) +__kernel void compute_gradients_8UC1_kernel( + const int height, const int width, + const int img_step, const int grad_quadstep, const int qangle_step, + __global const uchar * img, __global float * grad, __global uchar * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); const int tid = get_local_id(0); @@ -539,43 +792,4 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang); grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang; } -} - -//---------------------------------------------------------------------------- -// Resize - -__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src, - int dst_offset, int src_offset, int dst_step, int src_step, - int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - int sx = (int)floor(dx*ifx+0.5f); - int sy = (int)floor(dy*ify+0.5f); - sx = min(sx, src_cols-1); - sy = min(sy, src_rows-1); - int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx; - int spos = (src_offset>>2) + sy * (src_step>>2) + sx; - - if(dx