Merge pull request #2234 from KonstantinMatskevich:ocl_tapi_hog
This commit is contained in:
commit
22146e4b18
@ -285,10 +285,11 @@ public:
|
||||
CV_WRAP virtual void save(const String& filename, const String& objname = String()) const;
|
||||
virtual void copyTo(HOGDescriptor& c) const;
|
||||
|
||||
CV_WRAP virtual void compute(const Mat& img,
|
||||
CV_WRAP virtual void compute(InputArray img,
|
||||
CV_OUT std::vector<float>& descriptors,
|
||||
Size winStride = Size(), Size padding = Size(),
|
||||
const std::vector<Point>& locations = std::vector<Point>()) const;
|
||||
|
||||
//with found weights output
|
||||
CV_WRAP virtual void detect(const Mat& img, CV_OUT std::vector<Point>& foundLocations,
|
||||
CV_OUT std::vector<double>& weights,
|
||||
@ -300,13 +301,14 @@ public:
|
||||
double hitThreshold = 0, Size winStride = Size(),
|
||||
Size padding = Size(),
|
||||
const std::vector<Point>& searchLocations=std::vector<Point>()) const;
|
||||
|
||||
//with result weights output
|
||||
CV_WRAP virtual void detectMultiScale(const Mat& img, CV_OUT std::vector<Rect>& foundLocations,
|
||||
CV_WRAP virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
|
||||
CV_OUT std::vector<double>& foundWeights, double hitThreshold = 0,
|
||||
Size winStride = Size(), Size padding = Size(), double scale = 1.05,
|
||||
double finalThreshold = 2.0,bool useMeanshiftGrouping = false) const;
|
||||
//without found weights output
|
||||
virtual void detectMultiScale(const Mat& img, CV_OUT std::vector<Rect>& foundLocations,
|
||||
virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
|
||||
double hitThreshold = 0, Size winStride = Size(),
|
||||
Size padding = Size(), double scale = 1.05,
|
||||
double finalThreshold = 2.0, bool useMeanshiftGrouping = false) const;
|
||||
@ -328,25 +330,27 @@ public:
|
||||
CV_PROP double L2HysThreshold;
|
||||
CV_PROP bool gammaCorrection;
|
||||
CV_PROP std::vector<float> svmDetector;
|
||||
UMat oclSvmDetector;
|
||||
float free_coef;
|
||||
CV_PROP int nlevels;
|
||||
|
||||
|
||||
// evaluate specified ROI and return confidence value for each location
|
||||
virtual void detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
|
||||
// evaluate specified ROI and return confidence value for each location
|
||||
virtual void detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
|
||||
CV_OUT std::vector<cv::Point>& foundLocations, CV_OUT std::vector<double>& confidences,
|
||||
double hitThreshold = 0, cv::Size winStride = Size(),
|
||||
cv::Size padding = Size()) const;
|
||||
|
||||
// evaluate specified ROI and return confidence value for each location in multiple scales
|
||||
virtual void detectMultiScaleROI(const cv::Mat& img,
|
||||
// evaluate specified ROI and return confidence value for each location in multiple scales
|
||||
virtual void detectMultiScaleROI(const cv::Mat& img,
|
||||
CV_OUT std::vector<cv::Rect>& foundLocations,
|
||||
std::vector<DetectionROI>& locations,
|
||||
double hitThreshold = 0,
|
||||
int groupThreshold = 0) const;
|
||||
|
||||
// read/parse Dalal's alt model file
|
||||
void readALTModel(String modelfile);
|
||||
void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
|
||||
// read/parse Dalal's alt model file
|
||||
void readALTModel(String modelfile);
|
||||
void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
|
||||
};
|
||||
|
||||
|
||||
|
94
modules/objdetect/perf/opencl/perf_hogdetect.cpp
Normal file
94
modules/objdetect/perf/opencl/perf_hogdetect.cpp
Normal file
@ -0,0 +1,94 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Fangfang Bai, fangfang@multicorewareinc.com
|
||||
// Jin Ma, jin@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "perf_precomp.hpp"
|
||||
#include "opencv2/ts/ocl_perf.hpp"
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
namespace cvtest {
|
||||
namespace ocl {
|
||||
///////////// HOG////////////////////////
|
||||
|
||||
struct RectLess :
|
||||
public std::binary_function<cv::Rect, cv::Rect, bool>
|
||||
{
|
||||
bool operator()(const cv::Rect& a,
|
||||
const cv::Rect& b) const
|
||||
{
|
||||
if (a.x != b.x)
|
||||
return a.x < b.x;
|
||||
else if (a.y != b.y)
|
||||
return a.y < b.y;
|
||||
else if (a.width != b.width)
|
||||
return a.width < b.width;
|
||||
else
|
||||
return a.height < b.height;
|
||||
}
|
||||
};
|
||||
|
||||
OCL_PERF_TEST(HOGFixture, HOG)
|
||||
{
|
||||
UMat src;
|
||||
imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE).copyTo(src);
|
||||
ASSERT_FALSE(src.empty());
|
||||
|
||||
vector<cv::Rect> found_locations;
|
||||
declare.in(src);
|
||||
|
||||
HOGDescriptor hog;
|
||||
hog.setSVMDetector(hog.getDefaultPeopleDetector());
|
||||
|
||||
OCL_TEST_CYCLE() hog.detectMultiScale(src, found_locations);
|
||||
|
||||
std::sort(found_locations.begin(), found_locations.end(), RectLess());
|
||||
SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@ -42,6 +42,7 @@
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "opencv2/core/core_c.h"
|
||||
#include "opencl_kernels.hpp"
|
||||
|
||||
#include <cstdio>
|
||||
#include <iterator>
|
||||
@ -58,6 +59,29 @@
|
||||
namespace cv
|
||||
{
|
||||
|
||||
#define NTHREADS 256
|
||||
|
||||
enum {DESCR_FORMAT_COL_BY_COL, DESCR_FORMAT_ROW_BY_ROW};
|
||||
|
||||
static int numPartsWithin(int size, int part_size, int stride)
|
||||
{
|
||||
return (size - part_size + stride) / stride;
|
||||
}
|
||||
|
||||
static Size numPartsWithin(cv::Size size, cv::Size part_size,
|
||||
cv::Size stride)
|
||||
{
|
||||
return Size(numPartsWithin(size.width, part_size.width, stride.width),
|
||||
numPartsWithin(size.height, part_size.height, stride.height));
|
||||
}
|
||||
|
||||
static size_t getBlockHistogramSize(Size block_size, Size cell_size, int nbins)
|
||||
{
|
||||
Size cells_per_block = Size(block_size.width / cell_size.width,
|
||||
block_size.height / cell_size.height);
|
||||
return (size_t)(nbins * cells_per_block.area());
|
||||
}
|
||||
|
||||
size_t HOGDescriptor::getDescriptorSize() const
|
||||
{
|
||||
CV_Assert(blockSize.width % cellSize.width == 0 &&
|
||||
@ -88,7 +112,24 @@ bool HOGDescriptor::checkDetectorSize() const
|
||||
void HOGDescriptor::setSVMDetector(InputArray _svmDetector)
|
||||
{
|
||||
_svmDetector.getMat().convertTo(svmDetector, CV_32F);
|
||||
CV_Assert( checkDetectorSize() );
|
||||
CV_Assert(checkDetectorSize());
|
||||
|
||||
Mat detector_reordered(1, (int)svmDetector.size(), CV_32FC1);
|
||||
|
||||
size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
|
||||
cv::Size blocks_per_img = numPartsWithin(winSize, blockSize, blockStride);
|
||||
|
||||
for (int i = 0; i < blocks_per_img.height; ++i)
|
||||
for (int j = 0; j < blocks_per_img.width; ++j)
|
||||
{
|
||||
const float *src = &svmDetector[0] + (j * blocks_per_img.height + i) * block_hist_size;
|
||||
float *dst = (float*)detector_reordered.data + (i * blocks_per_img.width + j) * block_hist_size;
|
||||
for (size_t k = 0; k < block_hist_size; ++k)
|
||||
dst[k] = src[k];
|
||||
}
|
||||
size_t descriptor_size = getDescriptorSize();
|
||||
free_coef = svmDetector.size() > descriptor_size ? svmDetector[descriptor_size] : 0;
|
||||
detector_reordered.copyTo(oclSvmDetector);
|
||||
}
|
||||
|
||||
#define CV_TYPE_NAME_HOG_DESCRIPTOR "opencv-object-detector-hog"
|
||||
@ -1029,7 +1070,318 @@ static inline int gcd(int a, int b)
|
||||
return a;
|
||||
}
|
||||
|
||||
void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
static bool ocl_compute_gradients_8UC1(int height, int width, InputArray _img, float angle_scale,
|
||||
UMat grad, UMat qangle, bool correct_gamma, int nbins)
|
||||
{
|
||||
ocl::Kernel k("compute_gradients_8UC1_kernel", ocl::objdetect::objdetect_hog_oclsrc);
|
||||
if(k.empty())
|
||||
return false;
|
||||
|
||||
UMat img = _img.getUMat();
|
||||
|
||||
size_t localThreads[3] = { NTHREADS, 1, 1 };
|
||||
size_t globalThreads[3] = { width, height, 1 };
|
||||
char correctGamma = (correct_gamma) ? 1 : 0;
|
||||
int grad_quadstep = (int)grad.step >> 3;
|
||||
int qangle_step_shift = 0;
|
||||
int qangle_step = (int)qangle.step >> (1 + qangle_step_shift);
|
||||
|
||||
int idx = 0;
|
||||
idx = k.set(idx, height);
|
||||
idx = k.set(idx, width);
|
||||
idx = k.set(idx, (int)img.step1());
|
||||
idx = k.set(idx, grad_quadstep);
|
||||
idx = k.set(idx, qangle_step);
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(img));
|
||||
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(grad));
|
||||
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(qangle));
|
||||
idx = k.set(idx, angle_scale);
|
||||
idx = k.set(idx, correctGamma);
|
||||
idx = k.set(idx, nbins);
|
||||
|
||||
return k.run(2, globalThreads, localThreads, false);
|
||||
}
|
||||
|
||||
static bool ocl_computeGradient(InputArray img, UMat grad, UMat qangle, int nbins, Size effect_size, bool gamma_correction)
|
||||
{
|
||||
float angleScale = (float)(nbins / CV_PI);
|
||||
|
||||
return ocl_compute_gradients_8UC1(effect_size.height, effect_size.width, img,
|
||||
angleScale, grad, qangle, gamma_correction, nbins);
|
||||
}
|
||||
|
||||
#define CELL_WIDTH 8
|
||||
#define CELL_HEIGHT 8
|
||||
#define CELLS_PER_BLOCK_X 2
|
||||
#define CELLS_PER_BLOCK_Y 2
|
||||
|
||||
static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width,
|
||||
UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size)
|
||||
{
|
||||
ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc);
|
||||
if(k.empty())
|
||||
return false;
|
||||
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
|
||||
cv::String opts;
|
||||
if(is_cpu)
|
||||
opts = "-D CPU ";
|
||||
else
|
||||
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
|
||||
k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
|
||||
if(k.empty())
|
||||
return false;
|
||||
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)/block_stride_x;
|
||||
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)/block_stride_y;
|
||||
int blocks_total = img_block_width * img_block_height;
|
||||
|
||||
int qangle_step_shift = 0;
|
||||
int grad_quadstep = (int)grad.step >> 2;
|
||||
int qangle_step = (int)qangle.step >> qangle_step_shift;
|
||||
|
||||
int blocks_in_group = 4;
|
||||
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
|
||||
size_t globalThreads[3] = {((img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group) * localThreads[0], 2, 1 };
|
||||
|
||||
int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
|
||||
int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
|
||||
|
||||
int smem = (hists_size + final_hists_size) * blocks_in_group;
|
||||
|
||||
int idx = 0;
|
||||
idx = k.set(idx, block_stride_x);
|
||||
idx = k.set(idx, block_stride_y);
|
||||
idx = k.set(idx, nbins);
|
||||
idx = k.set(idx, (int)block_hist_size);
|
||||
idx = k.set(idx, img_block_width);
|
||||
idx = k.set(idx, blocks_in_group);
|
||||
idx = k.set(idx, blocks_total);
|
||||
idx = k.set(idx, grad_quadstep);
|
||||
idx = k.set(idx, qangle_step);
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(grad));
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(qangle));
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(gauss_w_lut));
|
||||
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(block_hists));
|
||||
idx = k.set(idx, (void*)NULL, (size_t)smem);
|
||||
|
||||
return k.run(2, globalThreads, localThreads, false);
|
||||
}
|
||||
|
||||
static int power_2up(unsigned int n)
|
||||
{
|
||||
for(unsigned int i = 1; i<=1024; i<<=1)
|
||||
if(n < i)
|
||||
return i;
|
||||
return -1; // Input is too big
|
||||
}
|
||||
|
||||
static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_y,
|
||||
int height, int width, UMat block_hists, float threshold)
|
||||
{
|
||||
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
|
||||
/ block_stride_x;
|
||||
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
|
||||
/ block_stride_y;
|
||||
int nthreads;
|
||||
size_t globalThreads[3] = { 1, 1, 1 };
|
||||
size_t localThreads[3] = { 1, 1, 1 };
|
||||
|
||||
int idx = 0;
|
||||
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
|
||||
cv::String opts;
|
||||
ocl::Kernel k;
|
||||
if ( nbins == 9 )
|
||||
{
|
||||
k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
|
||||
if(k.empty())
|
||||
return false;
|
||||
if(is_cpu)
|
||||
opts = "-D CPU ";
|
||||
else
|
||||
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
|
||||
k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
|
||||
if(k.empty())
|
||||
return false;
|
||||
|
||||
int blocks_in_group = NTHREADS / block_hist_size;
|
||||
nthreads = blocks_in_group * block_hist_size;
|
||||
int num_groups = (img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group;
|
||||
globalThreads[0] = nthreads * num_groups;
|
||||
localThreads[0] = nthreads;
|
||||
}
|
||||
else
|
||||
{
|
||||
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
|
||||
if(k.empty())
|
||||
return false;
|
||||
if(is_cpu)
|
||||
opts = "-D CPU ";
|
||||
else
|
||||
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
|
||||
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
|
||||
if(k.empty())
|
||||
return false;
|
||||
|
||||
nthreads = power_2up(block_hist_size);
|
||||
globalThreads[0] = img_block_width * nthreads;
|
||||
globalThreads[1] = img_block_height;
|
||||
localThreads[0] = nthreads;
|
||||
|
||||
if ((nthreads < 32) || (nthreads > 512) )
|
||||
return false;
|
||||
|
||||
idx = k.set(idx, nthreads);
|
||||
idx = k.set(idx, block_hist_size);
|
||||
idx = k.set(idx, img_block_width);
|
||||
}
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadWrite(block_hists));
|
||||
idx = k.set(idx, threshold);
|
||||
idx = k.set(idx, (void*)NULL, nthreads * sizeof(float));
|
||||
|
||||
return k.run(2, globalThreads, localThreads, false);
|
||||
}
|
||||
|
||||
static bool ocl_extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
|
||||
int height, int width, UMat block_hists, UMat descriptors,
|
||||
int block_hist_size, int descr_size, int descr_width)
|
||||
{
|
||||
ocl::Kernel k("extract_descrs_by_rows_kernel", ocl::objdetect::objdetect_hog_oclsrc);
|
||||
if(k.empty())
|
||||
return false;
|
||||
|
||||
int win_block_stride_x = win_stride_x / block_stride_x;
|
||||
int win_block_stride_y = win_stride_y / block_stride_y;
|
||||
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
|
||||
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
|
||||
block_stride_x;
|
||||
|
||||
int descriptors_quadstep = (int)descriptors.step >> 2;
|
||||
|
||||
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
|
||||
size_t localThreads[3] = { NTHREADS, 1, 1 };
|
||||
|
||||
int idx = 0;
|
||||
idx = k.set(idx, block_hist_size);
|
||||
idx = k.set(idx, descriptors_quadstep);
|
||||
idx = k.set(idx, descr_size);
|
||||
idx = k.set(idx, descr_width);
|
||||
idx = k.set(idx, img_block_width);
|
||||
idx = k.set(idx, win_block_stride_x);
|
||||
idx = k.set(idx, win_block_stride_y);
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
|
||||
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));
|
||||
|
||||
return k.run(2, globalThreads, localThreads, false);
|
||||
}
|
||||
|
||||
static bool ocl_extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
|
||||
int height, int width, UMat block_hists, UMat descriptors,
|
||||
int block_hist_size, int descr_size, int nblocks_win_x, int nblocks_win_y)
|
||||
{
|
||||
ocl::Kernel k("extract_descrs_by_cols_kernel", ocl::objdetect::objdetect_hog_oclsrc);
|
||||
if(k.empty())
|
||||
return false;
|
||||
|
||||
int win_block_stride_x = win_stride_x / block_stride_x;
|
||||
int win_block_stride_y = win_stride_y / block_stride_y;
|
||||
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
|
||||
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
|
||||
block_stride_x;
|
||||
|
||||
int descriptors_quadstep = (int)descriptors.step >> 2;
|
||||
|
||||
size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
|
||||
size_t localThreads[3] = { NTHREADS, 1, 1 };
|
||||
|
||||
int idx = 0;
|
||||
idx = k.set(idx, block_hist_size);
|
||||
idx = k.set(idx, descriptors_quadstep);
|
||||
idx = k.set(idx, descr_size);
|
||||
idx = k.set(idx, nblocks_win_x);
|
||||
idx = k.set(idx, nblocks_win_y);
|
||||
idx = k.set(idx, img_block_width);
|
||||
idx = k.set(idx, win_block_stride_x);
|
||||
idx = k.set(idx, win_block_stride_y);
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
|
||||
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));
|
||||
|
||||
return k.run(2, globalThreads, localThreads, false);
|
||||
}
|
||||
|
||||
static bool ocl_compute(InputArray _img, Size win_stride, std::vector<float>& _descriptors, int descr_format, Size blockSize,
|
||||
Size cellSize, int nbins, Size blockStride, Size winSize, float sigma, bool gammaCorrection, double L2HysThreshold)
|
||||
{
|
||||
Size imgSize = _img.size();
|
||||
Size effect_size = imgSize;
|
||||
|
||||
UMat grad(imgSize, CV_32FC2);
|
||||
UMat qangle(imgSize, CV_8UC2);
|
||||
|
||||
const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
|
||||
const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
|
||||
UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
|
||||
|
||||
Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
|
||||
UMat labels(1, wins_per_img.area(), CV_8U);
|
||||
|
||||
float scale = 1.f / (2.f * sigma * sigma);
|
||||
Mat gaussian_lut(1, 512, CV_32FC1);
|
||||
int idx = 0;
|
||||
for(int i=-8; i<8; i++)
|
||||
for(int j=-8; j<8; j++)
|
||||
gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
|
||||
for(int i=-8; i<8; i++)
|
||||
for(int j=-8; j<8; j++)
|
||||
gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
|
||||
|
||||
if(!ocl_computeGradient(_img, grad, qangle, nbins, effect_size, gammaCorrection))
|
||||
return false;
|
||||
|
||||
UMat gauss_w_lut;
|
||||
gaussian_lut.copyTo(gauss_w_lut);
|
||||
if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
|
||||
effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
|
||||
return false;
|
||||
|
||||
if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
|
||||
effect_size.width, block_hists, (float)L2HysThreshold))
|
||||
return false;
|
||||
|
||||
Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
|
||||
wins_per_img = numPartsWithin(effect_size, winSize, win_stride);
|
||||
|
||||
int descr_size = blocks_per_win.area()*(int)block_hist_size;
|
||||
int descr_width = (int)block_hist_size*blocks_per_win.width;
|
||||
|
||||
UMat descriptors(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
|
||||
switch (descr_format)
|
||||
{
|
||||
case DESCR_FORMAT_ROW_BY_ROW:
|
||||
if(!ocl_extract_descrs_by_rows(winSize.height, winSize.width,
|
||||
blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height,
|
||||
effect_size.width, block_hists, descriptors, (int)block_hist_size, descr_size, descr_width))
|
||||
return false;
|
||||
break;
|
||||
case DESCR_FORMAT_COL_BY_COL:
|
||||
if(!ocl_extract_descrs_by_cols(winSize.height, winSize.width,
|
||||
blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height, effect_size.width,
|
||||
block_hists, descriptors, (int)block_hist_size, descr_size, blocks_per_win.width, blocks_per_win.height))
|
||||
return false;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
descriptors.reshape(1, (int)descriptors.total()).getMat(ACCESS_READ).copyTo(_descriptors);
|
||||
return true;
|
||||
}
|
||||
#endif //HAVE_OPENCL
|
||||
|
||||
void HOGDescriptor::compute(InputArray _img, std::vector<float>& descriptors,
|
||||
Size winStride, Size padding, const std::vector<Point>& locations) const
|
||||
{
|
||||
if( winStride == Size() )
|
||||
@ -1037,11 +1389,18 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
|
||||
Size cacheStride(gcd(winStride.width, blockStride.width),
|
||||
gcd(winStride.height, blockStride.height));
|
||||
|
||||
Size imgSize = _img.size();
|
||||
|
||||
size_t nwindows = locations.size();
|
||||
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
|
||||
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
|
||||
Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
|
||||
Size paddedImgSize(imgSize.width + padding.width*2, imgSize.height + padding.height*2);
|
||||
|
||||
CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && _img.isUMat(),
|
||||
ocl_compute(_img, winStride, descriptors, DESCR_FORMAT_COL_BY_COL, blockSize,
|
||||
cellSize, nbins, blockStride, winSize, (float)getWinSigma(), gammaCorrection, L2HysThreshold))
|
||||
|
||||
Mat img = _img.getMat();
|
||||
HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);
|
||||
|
||||
if( !nwindows )
|
||||
@ -1263,20 +1622,215 @@ private:
|
||||
Mutex* mtx;
|
||||
};
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
|
||||
int win_stride_y, int win_stride_x, int height, int width,
|
||||
const UMat& block_hists, UMat detector,
|
||||
float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size)
|
||||
{
|
||||
int nthreads;
|
||||
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
|
||||
cv::String opts;
|
||||
|
||||
ocl::Kernel k;
|
||||
int idx = 0;
|
||||
switch (descr_size.width)
|
||||
{
|
||||
case 180:
|
||||
nthreads = 180;
|
||||
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
|
||||
if(k.empty())
|
||||
return false;
|
||||
if(is_cpu)
|
||||
opts = "-D CPU ";
|
||||
else
|
||||
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
|
||||
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
|
||||
if(k.empty())
|
||||
return false;
|
||||
idx = k.set(idx, descr_size.width);
|
||||
idx = k.set(idx, descr_size.height);
|
||||
break;
|
||||
|
||||
case 252:
|
||||
nthreads = 256;
|
||||
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
|
||||
if(k.empty())
|
||||
return false;
|
||||
if(is_cpu)
|
||||
opts = "-D CPU ";
|
||||
else
|
||||
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
|
||||
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
|
||||
if(k.empty())
|
||||
return false;
|
||||
idx = k.set(idx, descr_size.width);
|
||||
idx = k.set(idx, descr_size.height);
|
||||
break;
|
||||
|
||||
default:
|
||||
nthreads = 256;
|
||||
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
|
||||
if(k.empty())
|
||||
return false;
|
||||
if(is_cpu)
|
||||
opts = "-D CPU ";
|
||||
else
|
||||
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
|
||||
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
|
||||
if(k.empty())
|
||||
return false;
|
||||
idx = k.set(idx, descr_size.area());
|
||||
idx = k.set(idx, descr_size.height);
|
||||
}
|
||||
|
||||
int win_block_stride_x = win_stride_x / block_stride_x;
|
||||
int win_block_stride_y = win_stride_y / block_stride_y;
|
||||
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
|
||||
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
|
||||
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
|
||||
block_stride_x;
|
||||
|
||||
size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
|
||||
size_t localThreads[3] = { nthreads, 1, 1 };
|
||||
|
||||
idx = k.set(idx, block_hist_size);
|
||||
idx = k.set(idx, img_win_width);
|
||||
idx = k.set(idx, img_block_width);
|
||||
idx = k.set(idx, win_block_stride_x);
|
||||
idx = k.set(idx, win_block_stride_y);
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
|
||||
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(detector));
|
||||
idx = k.set(idx, free_coef);
|
||||
idx = k.set(idx, threshold);
|
||||
idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(labels));
|
||||
|
||||
return k.run(2, globalThreads, localThreads, false);
|
||||
}
|
||||
|
||||
static bool ocl_detect(InputArray img, std::vector<Point> &hits, double hit_threshold, Size win_stride,
|
||||
const UMat& oclSvmDetector, Size blockSize, Size cellSize, int nbins, Size blockStride, Size winSize,
|
||||
bool gammaCorrection, double L2HysThreshold, float sigma, float free_coef)
|
||||
{
|
||||
hits.clear();
|
||||
if (oclSvmDetector.empty())
|
||||
return false;
|
||||
|
||||
Size imgSize = img.size();
|
||||
Size effect_size = imgSize;
|
||||
UMat grad(imgSize, CV_32FC2);
|
||||
UMat qangle(imgSize, CV_8UC2);
|
||||
|
||||
const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
|
||||
const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
|
||||
UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
|
||||
|
||||
Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
|
||||
UMat labels(1, wins_per_img.area(), CV_8U);
|
||||
|
||||
float scale = 1.f / (2.f * sigma * sigma);
|
||||
Mat gaussian_lut(1, 512, CV_32FC1);
|
||||
int idx = 0;
|
||||
for(int i=-8; i<8; i++)
|
||||
for(int j=-8; j<8; j++)
|
||||
gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
|
||||
for(int i=-8; i<8; i++)
|
||||
for(int j=-8; j<8; j++)
|
||||
gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
|
||||
|
||||
if(!ocl_computeGradient(img, grad, qangle, nbins, effect_size, gammaCorrection))
|
||||
return false;
|
||||
|
||||
UMat gauss_w_lut;
|
||||
gaussian_lut.copyTo(gauss_w_lut);
|
||||
if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
|
||||
effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
|
||||
return false;
|
||||
|
||||
if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
|
||||
effect_size.width, block_hists, (float)L2HysThreshold))
|
||||
return false;
|
||||
|
||||
Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
|
||||
|
||||
Size descr_size((int)block_hist_size*blocks_per_win.width, blocks_per_win.height);
|
||||
|
||||
if(!ocl_classify_hists(winSize.height, winSize.width, blockStride.height,
|
||||
blockStride.width, win_stride.height, win_stride.width,
|
||||
effect_size.height, effect_size.width, block_hists, oclSvmDetector,
|
||||
free_coef, (float)hit_threshold, labels, descr_size, (int)block_hist_size))
|
||||
return false;
|
||||
|
||||
Mat labels_host = labels.getMat(ACCESS_READ);
|
||||
unsigned char *vec = labels_host.ptr();
|
||||
for (int i = 0; i < wins_per_img.area(); i++)
|
||||
{
|
||||
int y = i / wins_per_img.width;
|
||||
int x = i - wins_per_img.width * y;
|
||||
if (vec[i])
|
||||
{
|
||||
hits.push_back(Point(x * win_stride.width, y * win_stride.height));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ocl_detectMultiScale(InputArray _img, std::vector<Rect> &found_locations, std::vector<double>& level_scale,
|
||||
double hit_threshold, Size win_stride, double group_threshold,
|
||||
const UMat& oclSvmDetector, Size blockSize, Size cellSize,
|
||||
int nbins, Size blockStride, Size winSize, bool gammaCorrection,
|
||||
double L2HysThreshold, float sigma, float free_coef)
|
||||
{
|
||||
std::vector<Rect> all_candidates;
|
||||
std::vector<Point> locations;
|
||||
UMat image_scale;
|
||||
Size imgSize = _img.size();
|
||||
image_scale.create(imgSize, _img.type());
|
||||
|
||||
for (size_t i = 0; i<level_scale.size() ; i++)
|
||||
{
|
||||
double scale = level_scale[i];
|
||||
Size effect_size = Size(cvRound(imgSize.width / scale), cvRound(imgSize.height / scale));
|
||||
if (effect_size == imgSize)
|
||||
{
|
||||
if(!ocl_detect(_img, locations, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
|
||||
blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef))
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
resize(_img, image_scale, effect_size);
|
||||
if(!ocl_detect(image_scale, locations, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
|
||||
blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef))
|
||||
return false;
|
||||
}
|
||||
Size scaled_win_size(cvRound(winSize.width * scale),
|
||||
cvRound(winSize.height * scale));
|
||||
for (size_t j = 0; j < locations.size(); j++)
|
||||
all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
|
||||
}
|
||||
found_locations.assign(all_candidates.begin(), all_candidates.end());
|
||||
cv::groupRectangles(found_locations, (int)group_threshold, 0.2);
|
||||
return true;
|
||||
}
|
||||
#endif //HAVE_OPENCL
|
||||
|
||||
void HOGDescriptor::detectMultiScale(
|
||||
const Mat& img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
|
||||
InputArray _img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
|
||||
double hitThreshold, Size winStride, Size padding,
|
||||
double scale0, double finalThreshold, bool useMeanshiftGrouping) const
|
||||
{
|
||||
double scale = 1.;
|
||||
int levels = 0;
|
||||
|
||||
Size imgSize = _img.size();
|
||||
std::vector<double> levelScale;
|
||||
for( levels = 0; levels < nlevels; levels++ )
|
||||
{
|
||||
levelScale.push_back(scale);
|
||||
if( cvRound(img.cols/scale) < winSize.width ||
|
||||
cvRound(img.rows/scale) < winSize.height ||
|
||||
if( cvRound(imgSize.width/scale) < winSize.width ||
|
||||
cvRound(imgSize.height/scale) < winSize.height ||
|
||||
scale0 <= 1 )
|
||||
break;
|
||||
scale *= scale0;
|
||||
@ -1284,12 +1838,21 @@ void HOGDescriptor::detectMultiScale(
|
||||
levels = std::max(levels, 1);
|
||||
levelScale.resize(levels);
|
||||
|
||||
if(winStride == Size())
|
||||
winStride = blockStride;
|
||||
|
||||
CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && scale0 > 1 && winStride.width % blockStride.width == 0 &&
|
||||
winStride.height % blockStride.height == 0 && padding == Size(0,0) && _img.isUMat(),
|
||||
ocl_detectMultiScale(_img, foundLocations, levelScale, hitThreshold, winStride, finalThreshold, oclSvmDetector,
|
||||
blockSize, cellSize, nbins, blockStride, winSize, gammaCorrection, L2HysThreshold, (float)getWinSigma(), free_coef));
|
||||
|
||||
std::vector<Rect> allCandidates;
|
||||
std::vector<double> tempScales;
|
||||
std::vector<double> tempWeights;
|
||||
std::vector<double> foundScales;
|
||||
Mutex mtx;
|
||||
|
||||
Mutex mtx;
|
||||
Mat img = _img.getMat();
|
||||
Range range(0, (int)levelScale.size());
|
||||
HOGInvoker invoker(this, img, hitThreshold, winStride, padding, &levelScale[0], &allCandidates, &mtx, &tempWeights, &tempScales);
|
||||
parallel_for_(range, invoker);
|
||||
@ -1306,7 +1869,7 @@ void HOGDescriptor::detectMultiScale(
|
||||
groupRectangles(foundLocations, foundWeights, (int)finalThreshold, 0.2);
|
||||
}
|
||||
|
||||
void HOGDescriptor::detectMultiScale(const Mat& img, std::vector<Rect>& foundLocations,
|
||||
void HOGDescriptor::detectMultiScale(InputArray img, std::vector<Rect>& foundLocations,
|
||||
double hitThreshold, Size winStride, Size padding,
|
||||
double scale0, double finalThreshold, bool useMeanshiftGrouping) const
|
||||
{
|
||||
|
726
modules/objdetect/src/opencl/objdetect_hog.cl
Normal file
726
modules/objdetect/src/opencl/objdetect_hog.cl
Normal file
@ -0,0 +1,726 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Wenju He, wenju@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define CELL_WIDTH 8
|
||||
#define CELL_HEIGHT 8
|
||||
#define CELLS_PER_BLOCK_X 2
|
||||
#define CELLS_PER_BLOCK_Y 2
|
||||
#define NTHREADS 256
|
||||
#define CV_PI_F 3.1415926535897932384626433832795f
|
||||
|
||||
#ifdef INTEL_DEVICE
|
||||
#define QANGLE_TYPE int
|
||||
#define QANGLE_TYPE2 int2
|
||||
#else
|
||||
#define QANGLE_TYPE uchar
|
||||
#define QANGLE_TYPE2 uchar2
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Histogram computation
|
||||
// 12 threads for a cell, 12x4 threads per block
|
||||
// Use pre-computed gaussian and interp_weight lookup tables
|
||||
__kernel void compute_hists_lut_kernel(
|
||||
const int cblock_stride_x, const int cblock_stride_y,
|
||||
const int cnbins, const int cblock_hist_size, const int img_block_width,
|
||||
const int blocks_in_group, const int blocks_total,
|
||||
const int grad_quadstep, const int qangle_step,
|
||||
__global const float* grad, __global const QANGLE_TYPE* qangle,
|
||||
__global const float* gauss_w_lut,
|
||||
__global float* block_hists, __local float* smem)
|
||||
{
|
||||
const int lx = get_local_id(0);
|
||||
const int lp = lx / 24; /* local group id */
|
||||
const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
|
||||
const int gidY = gid / img_block_width;
|
||||
const int gidX = gid - gidY * img_block_width;
|
||||
|
||||
const int lidX = lx - lp * 24;
|
||||
const int lidY = get_local_id(1);
|
||||
|
||||
const int cell_x = lidX / 12;
|
||||
const int cell_y = lidY;
|
||||
const int cell_thread_x = lidX - cell_x * 12;
|
||||
|
||||
__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
|
||||
CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
|
||||
__local float* final_hist = hists + cnbins *
|
||||
(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
|
||||
|
||||
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
|
||||
const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
|
||||
|
||||
__global const float* grad_ptr = (gid < blocks_total) ?
|
||||
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
|
||||
__global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
|
||||
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
|
||||
|
||||
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
|
||||
cell_thread_x;
|
||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
|
||||
hist[bin_id * 48] = 0.f;
|
||||
|
||||
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
|
||||
const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
|
||||
|
||||
const int dist_y_begin = -4 - 4 * lidY;
|
||||
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
|
||||
{
|
||||
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
|
||||
QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
|
||||
|
||||
grad_ptr += grad_quadstep;
|
||||
qangle_ptr += qangle_step;
|
||||
|
||||
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
|
||||
|
||||
int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
|
||||
float gaussian = gauss_w_lut[idx];
|
||||
idx = (dist_y + 8) * 16 + (dist_x + 8);
|
||||
float interp_weight = gauss_w_lut[256+idx];
|
||||
|
||||
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
|
||||
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __local float* hist_ = hist;
|
||||
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
|
||||
{
|
||||
if (cell_thread_x < 6)
|
||||
hist_[0] += hist_[6];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (cell_thread_x < 3)
|
||||
hist_[0] += hist_[3];
|
||||
#ifdef CPU
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
if (cell_thread_x == 0)
|
||||
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
|
||||
hist_[0] + hist_[1] + hist_[2];
|
||||
}
|
||||
#ifdef CPU
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
|
||||
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
|
||||
if ((tid < cblock_hist_size) && (gid < blocks_total))
|
||||
{
|
||||
__global float* block_hist = block_hists +
|
||||
(gidY * img_block_width + gidX) * cblock_hist_size;
|
||||
block_hist[tid] = final_hist[tid];
|
||||
}
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------
|
||||
// Normalization of histograms via L2Hys_norm
|
||||
// optimized for the case of 9 bins
|
||||
__kernel void normalize_hists_36_kernel(__global float* block_hists,
|
||||
const float threshold, __local float *squares)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gid = get_global_id(0);
|
||||
const int bid = tid / 36; /* block-hist id, (0 - 6) */
|
||||
const int boffset = bid * 36; /* block-hist offset in the work-group */
|
||||
const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
|
||||
|
||||
float elem = block_hists[gid];
|
||||
squares[tid] = elem * elem;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float* smem = squares + boffset;
|
||||
float sum = smem[hid];
|
||||
if (hid < 18)
|
||||
smem[hid] = sum = sum + smem[hid + 18];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (hid < 9)
|
||||
smem[hid] = sum = sum + smem[hid + 9];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (hid < 4)
|
||||
smem[hid] = sum + smem[hid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
|
||||
|
||||
elem = elem / (sqrt(sum) + 3.6f);
|
||||
elem = min(elem, threshold);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
squares[tid] = elem * elem;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
sum = smem[hid];
|
||||
if (hid < 18)
|
||||
smem[hid] = sum = sum + smem[hid + 18];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (hid < 9)
|
||||
smem[hid] = sum = sum + smem[hid + 9];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (hid < 4)
|
||||
smem[hid] = sum + smem[hid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
|
||||
|
||||
block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------
|
||||
// Normalization of histograms via L2Hys_norm
|
||||
//
|
||||
inline float reduce_smem(volatile __local float* smem, int size)
|
||||
{
|
||||
unsigned int tid = get_local_id(0);
|
||||
float sum = smem[tid];
|
||||
|
||||
if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
#ifdef CPU
|
||||
if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE); }
|
||||
#else
|
||||
if (tid < 32)
|
||||
{
|
||||
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
|
||||
#if WAVE_SIZE < 32
|
||||
} barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 16) {
|
||||
#endif
|
||||
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
|
||||
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
|
||||
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
|
||||
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
|
||||
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
|
||||
}
|
||||
#endif
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
__kernel void normalize_hists_kernel(
|
||||
const int nthreads, const int block_hist_size, const int img_block_width,
|
||||
__global float* block_hists, const float threshold, __local float *squares)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global float* hist = block_hists + (gidY * img_block_width + gidX) *
|
||||
block_hist_size + tid;
|
||||
|
||||
float elem = 0.f;
|
||||
if (tid < block_hist_size)
|
||||
elem = hist[0];
|
||||
|
||||
squares[tid] = elem * elem;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
float sum = reduce_smem(squares, nthreads);
|
||||
|
||||
float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);
|
||||
elem = min(elem * scale, threshold);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
squares[tid] = elem * elem;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
sum = reduce_smem(squares, nthreads);
|
||||
scale = 1.0f / (sqrt(sum) + 1e-3f);
|
||||
|
||||
if (tid < block_hist_size)
|
||||
hist[0] = elem * scale;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Linear SVM based classification
|
||||
// 48x96 window, 9 bins and default parameters
|
||||
// 180 threads, each thread corresponds to a bin in a row
|
||||
__kernel void classify_hists_180_kernel(
|
||||
const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
|
||||
const int img_win_width, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float * block_hists, __global const float* coefs,
|
||||
float free_coef, float threshold, __global uchar* labels)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
float product = 0.f;
|
||||
|
||||
for (int i = 0; i < cdescr_height; i++)
|
||||
{
|
||||
product += coefs[i * cdescr_width + tid] *
|
||||
hist[i * img_block_width * cblock_hist_size + tid];
|
||||
}
|
||||
|
||||
__local float products[180];
|
||||
|
||||
products[tid] = product;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 90) products[tid] = product = product + products[tid + 90];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 45) products[tid] = product = product + products[tid + 45];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __local float* smem = products;
|
||||
#ifdef CPU
|
||||
if (tid < 13) smem[tid] = product = product + smem[tid + 32];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 16) smem[tid] = product = product + smem[tid + 16];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<8) smem[tid] = product = product + smem[tid + 8];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<4) smem[tid] = product = product + smem[tid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<2) smem[tid] = product = product + smem[tid + 2];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#else
|
||||
if (tid < 13)
|
||||
{
|
||||
smem[tid] = product = product + smem[tid + 32];
|
||||
}
|
||||
#if WAVE_SIZE < 32
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
if (tid < 16)
|
||||
{
|
||||
smem[tid] = product = product + smem[tid + 16];
|
||||
smem[tid] = product = product + smem[tid + 8];
|
||||
smem[tid] = product = product + smem[tid + 4];
|
||||
smem[tid] = product = product + smem[tid + 2];
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tid == 0){
|
||||
product = product + smem[tid + 1];
|
||||
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Linear SVM based classification
|
||||
// 64x128 window, 9 bins and default parameters
|
||||
// 256 threads, 252 of them are used
|
||||
__kernel void classify_hists_252_kernel(
|
||||
const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
|
||||
const int img_win_width, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float * block_hists, __global const float* coefs,
|
||||
float free_coef, float threshold, __global uchar* labels)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
float product = 0.f;
|
||||
if (tid < cdescr_width)
|
||||
{
|
||||
for (int i = 0; i < cdescr_height; i++)
|
||||
product += coefs[i * cdescr_width + tid] *
|
||||
hist[i * img_block_width * cblock_hist_size + tid];
|
||||
}
|
||||
|
||||
__local float products[NTHREADS];
|
||||
|
||||
products[tid] = product;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 128) products[tid] = product = product + products[tid + 128];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 64) products[tid] = product = product + products[tid + 64];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __local float* smem = products;
|
||||
#ifdef CPU
|
||||
if(tid<32) smem[tid] = product = product + smem[tid + 32];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<16) smem[tid] = product = product + smem[tid + 16];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<8) smem[tid] = product = product + smem[tid + 8];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<4) smem[tid] = product = product + smem[tid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<2) smem[tid] = product = product + smem[tid + 2];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#else
|
||||
if (tid < 32)
|
||||
{
|
||||
smem[tid] = product = product + smem[tid + 32];
|
||||
#if WAVE_SIZE < 32
|
||||
} barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 16) {
|
||||
#endif
|
||||
smem[tid] = product = product + smem[tid + 16];
|
||||
smem[tid] = product = product + smem[tid + 8];
|
||||
smem[tid] = product = product + smem[tid + 4];
|
||||
smem[tid] = product = product + smem[tid + 2];
|
||||
}
|
||||
#endif
|
||||
if (tid == 0){
|
||||
product = product + smem[tid + 1];
|
||||
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Linear SVM based classification
|
||||
// 256 threads
|
||||
__kernel void classify_hists_kernel(
|
||||
const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
|
||||
const int img_win_width, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float * block_hists, __global const float* coefs,
|
||||
float free_coef, float threshold, __global uchar* labels)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
float product = 0.f;
|
||||
for (int i = tid; i < cdescr_size; i += NTHREADS)
|
||||
{
|
||||
int offset_y = i / cdescr_width;
|
||||
int offset_x = i - offset_y * cdescr_width;
|
||||
product += coefs[i] *
|
||||
hist[offset_y * img_block_width * cblock_hist_size + offset_x];
|
||||
}
|
||||
|
||||
__local float products[NTHREADS];
|
||||
|
||||
products[tid] = product;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 128) products[tid] = product = product + products[tid + 128];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 64) products[tid] = product = product + products[tid + 64];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
volatile __local float* smem = products;
|
||||
#ifdef CPU
|
||||
if(tid<32) smem[tid] = product = product + smem[tid + 32];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<16) smem[tid] = product = product + smem[tid + 16];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<8) smem[tid] = product = product + smem[tid + 8];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<4) smem[tid] = product = product + smem[tid + 4];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(tid<2) smem[tid] = product = product + smem[tid + 2];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#else
|
||||
if (tid < 32)
|
||||
{
|
||||
smem[tid] = product = product + smem[tid + 32];
|
||||
#if WAVE_SIZE < 32
|
||||
} barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 16) {
|
||||
#endif
|
||||
smem[tid] = product = product + smem[tid + 16];
|
||||
smem[tid] = product = product + smem[tid + 8];
|
||||
smem[tid] = product = product + smem[tid + 4];
|
||||
smem[tid] = product = product + smem[tid + 2];
|
||||
}
|
||||
#endif
|
||||
if (tid == 0){
|
||||
smem[tid] = product = product + smem[tid + 1];
|
||||
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Extract descriptors
|
||||
|
||||
__kernel void extract_descrs_by_rows_kernel(
|
||||
const int cblock_hist_size, const int descriptors_quadstep,
|
||||
const int cdescr_size, const int cdescr_width, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float* block_hists, __global float* descriptors)
|
||||
{
|
||||
int tid = get_local_id(0);
|
||||
int gidX = get_group_id(0);
|
||||
int gidY = get_group_id(1);
|
||||
|
||||
// Get left top corner of the window in src
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
// Get left top corner of the window in dst
|
||||
__global float* descriptor = descriptors +
|
||||
(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
|
||||
|
||||
// Copy elements from src to dst
|
||||
for (int i = tid; i < cdescr_size; i += NTHREADS)
|
||||
{
|
||||
int offset_y = i / cdescr_width;
|
||||
int offset_x = i - offset_y * cdescr_width;
|
||||
descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void extract_descrs_by_cols_kernel(
|
||||
const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
|
||||
const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
|
||||
const int win_block_stride_x, const int win_block_stride_y,
|
||||
__global const float* block_hists, __global float* descriptors)
|
||||
{
|
||||
int tid = get_local_id(0);
|
||||
int gidX = get_group_id(0);
|
||||
int gidY = get_group_id(1);
|
||||
|
||||
// Get left top corner of the window in src
|
||||
__global const float* hist = block_hists + (gidY * win_block_stride_y *
|
||||
img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
|
||||
|
||||
// Get left top corner of the window in dst
|
||||
__global float* descriptor = descriptors +
|
||||
(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
|
||||
|
||||
// Copy elements from src to dst
|
||||
for (int i = tid; i < cdescr_size; i += NTHREADS)
|
||||
{
|
||||
int block_idx = i / cblock_hist_size;
|
||||
int idx_in_block = i - block_idx * cblock_hist_size;
|
||||
|
||||
int y = block_idx / cnblocks_win_x;
|
||||
int x = block_idx - y * cnblocks_win_x;
|
||||
|
||||
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =
|
||||
hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Gradients computation
|
||||
|
||||
__kernel void compute_gradients_8UC4_kernel(
|
||||
const int height, const int width,
|
||||
const int img_step, const int grad_quadstep, const int qangle_step,
|
||||
const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
|
||||
const float angle_scale, const char correct_gamma, const int cnbins)
|
||||
{
|
||||
const int x = get_global_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int gSizeX = get_local_size(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const uchar4* row = img + gidY * img_step;
|
||||
|
||||
__local float sh_row[(NTHREADS + 2) * 3];
|
||||
|
||||
uchar4 val;
|
||||
if (x < width)
|
||||
val = row[x];
|
||||
else
|
||||
val = row[width - 2];
|
||||
|
||||
sh_row[tid + 1] = val.x;
|
||||
sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
|
||||
sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
|
||||
|
||||
if (tid == 0)
|
||||
{
|
||||
val = row[max(x - 1, 1)];
|
||||
sh_row[0] = val.x;
|
||||
sh_row[(NTHREADS + 2)] = val.y;
|
||||
sh_row[2 * (NTHREADS + 2)] = val.z;
|
||||
}
|
||||
|
||||
if (tid == gSizeX - 1)
|
||||
{
|
||||
val = row[min(x + 1, width - 2)];
|
||||
sh_row[gSizeX + 1] = val.x;
|
||||
sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
|
||||
sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (x < width)
|
||||
{
|
||||
float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
|
||||
sh_row[tid + 2 * (NTHREADS + 2)]);
|
||||
float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
|
||||
sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
|
||||
|
||||
float3 dx;
|
||||
if (correct_gamma == 1)
|
||||
dx = sqrt(b) - sqrt(a);
|
||||
else
|
||||
dx = b - a;
|
||||
|
||||
float3 dy = (float3) 0.f;
|
||||
|
||||
if (gidY > 0 && gidY < height - 1)
|
||||
{
|
||||
a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
|
||||
b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
|
||||
|
||||
if (correct_gamma == 1)
|
||||
dy = sqrt(b) - sqrt(a);
|
||||
else
|
||||
dy = b - a;
|
||||
}
|
||||
|
||||
float best_dx = dx.x;
|
||||
float best_dy = dy.x;
|
||||
|
||||
float mag0 = dx.x * dx.x + dy.x * dy.x;
|
||||
float mag1 = dx.y * dx.y + dy.y * dy.y;
|
||||
if (mag0 < mag1)
|
||||
{
|
||||
best_dx = dx.y;
|
||||
best_dy = dy.y;
|
||||
mag0 = mag1;
|
||||
}
|
||||
|
||||
mag1 = dx.z * dx.z + dy.z * dy.z;
|
||||
if (mag0 < mag1)
|
||||
{
|
||||
best_dx = dx.z;
|
||||
best_dy = dy.z;
|
||||
mag0 = mag1;
|
||||
}
|
||||
|
||||
mag0 = sqrt(mag0);
|
||||
|
||||
float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
|
||||
int hidx = (int)floor(ang);
|
||||
ang -= hidx;
|
||||
hidx = (hidx + cnbins) % cnbins;
|
||||
|
||||
qangle[(gidY * qangle_step + x) << 1] = hidx;
|
||||
qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
|
||||
grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
|
||||
grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void compute_gradients_8UC1_kernel(
|
||||
const int height, const int width,
|
||||
const int img_step, const int grad_quadstep, const int qangle_step,
|
||||
__global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
|
||||
const float angle_scale, const char correct_gamma, const int cnbins)
|
||||
{
|
||||
const int x = get_global_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int gSizeX = get_local_size(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const uchar* row = img + gidY * img_step;
|
||||
|
||||
__local float sh_row[NTHREADS + 2];
|
||||
|
||||
if (x < width)
|
||||
sh_row[tid + 1] = row[x];
|
||||
else
|
||||
sh_row[tid + 1] = row[width - 2];
|
||||
|
||||
if (tid == 0)
|
||||
sh_row[0] = row[max(x - 1, 1)];
|
||||
|
||||
if (tid == gSizeX - 1)
|
||||
sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (x < width)
|
||||
{
|
||||
float dx;
|
||||
|
||||
if (correct_gamma == 1)
|
||||
dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
|
||||
else
|
||||
dx = sh_row[tid + 2] - sh_row[tid];
|
||||
|
||||
float dy = 0.f;
|
||||
if (gidY > 0 && gidY < height - 1)
|
||||
{
|
||||
float a = (float) img[ (gidY + 1) * img_step + x ];
|
||||
float b = (float) img[ (gidY - 1) * img_step + x ];
|
||||
if (correct_gamma == 1)
|
||||
dy = sqrt(a) - sqrt(b);
|
||||
else
|
||||
dy = a - b;
|
||||
}
|
||||
float mag = sqrt(dx * dx + dy * dy);
|
||||
|
||||
float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
|
||||
int hidx = (int)floor(ang);
|
||||
ang -= hidx;
|
||||
hidx = (hidx + cnbins) % cnbins;
|
||||
|
||||
qangle[ (gidY * qangle_step + x) << 1 ] = hidx;
|
||||
qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
|
||||
grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang);
|
||||
grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;
|
||||
}
|
||||
}
|
121
modules/objdetect/test/opencl/test_hogdetector.cpp
Normal file
121
modules/objdetect/test/opencl/test_hogdetector.cpp
Normal file
@ -0,0 +1,121 @@
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Niko Li, newlife20080214@gmail.com
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Shengen Yan, yanshengen@gmail.com
|
||||
// Jiang Liyuan,jlyuan001.good@163.com
|
||||
// Rock Li, Rock.Li@amd.com
|
||||
// Zailong Wu, bullet@yeah.net
|
||||
// Yao Wang, bitwangyaoyao@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "test_precomp.hpp"
|
||||
#include "opencv2/ts/ocl_test.hpp"
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
namespace cvtest {
|
||||
namespace ocl {
|
||||
|
||||
///////////////////// HOG /////////////////////////////
|
||||
PARAM_TEST_CASE(HOG, Size, MatType)
|
||||
{
|
||||
Size winSize;
|
||||
int type;
|
||||
Mat img;
|
||||
UMat uimg;
|
||||
virtual void SetUp()
|
||||
{
|
||||
winSize = GET_PARAM(0);
|
||||
type = GET_PARAM(1);
|
||||
img = readImage("cascadeandhog/images/image_00000000_0.png", IMREAD_GRAYSCALE);
|
||||
ASSERT_FALSE(img.empty());
|
||||
img.copyTo(uimg);
|
||||
}
|
||||
};
|
||||
|
||||
OCL_TEST_P(HOG, GetDescriptors)
|
||||
{
|
||||
HOGDescriptor hog;
|
||||
hog.gammaCorrection = true;
|
||||
|
||||
hog.setSVMDetector(hog.getDefaultPeopleDetector());
|
||||
|
||||
std::vector<float> cpu_descriptors;
|
||||
std::vector<float> gpu_descriptors;
|
||||
|
||||
OCL_OFF(hog.compute(img, cpu_descriptors, hog.winSize));
|
||||
OCL_ON(hog.compute(uimg, gpu_descriptors, hog.winSize));
|
||||
|
||||
Mat cpu_desc(cpu_descriptors), gpu_desc(gpu_descriptors);
|
||||
|
||||
EXPECT_MAT_SIMILAR(cpu_desc, gpu_desc, 1e-1);
|
||||
}
|
||||
|
||||
OCL_TEST_P(HOG, Detect)
|
||||
{
|
||||
HOGDescriptor hog;
|
||||
hog.winSize = winSize;
|
||||
hog.gammaCorrection = true;
|
||||
|
||||
if (winSize.width == 48 && winSize.height == 96)
|
||||
hog.setSVMDetector(hog.getDaimlerPeopleDetector());
|
||||
else
|
||||
hog.setSVMDetector(hog.getDefaultPeopleDetector());
|
||||
|
||||
std::vector<Rect> cpu_found;
|
||||
std::vector<Rect> gpu_found;
|
||||
|
||||
OCL_OFF(hog.detectMultiScale(img, cpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
|
||||
OCL_ON(hog.detectMultiScale(uimg, gpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
|
||||
|
||||
EXPECT_LT(checkRectSimilarity(img.size(), cpu_found, gpu_found), 1.0);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
|
||||
testing::Values(Size(64, 128), Size(48, 96)),
|
||||
testing::Values( MatType(CV_8UC1) ) ) );
|
||||
|
||||
}}
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user