removed gpuobjdetect module

This commit is contained in:
Vladislav Vinogradov
2013-04-11 12:51:00 +04:00
parent 28b1caa730
commit eda124ec32
27 changed files with 172 additions and 606 deletions

View File

@@ -7,7 +7,7 @@ set(the_description "GPU-accelerated Computer Vision")
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
ocv_define_module(gpu opencv_gpuarithm opencv_gpufilters opencv_gpuwarping opencv_gpuimgproc
opencv_gpufeatures2d opencv_gpuvideo opencv_gpustereo opencv_gpuobjdetect)
opencv_gpufeatures2d opencv_gpuvideo opencv_gpustereo)
if(HAVE_CUDA)
add_subdirectory(perf4au)

View File

@@ -8,3 +8,4 @@ gpu. GPU-accelerated Computer Vision
introduction
initalization_and_information
data_structures
object_detection

View File

@@ -0,0 +1,317 @@
Object Detection
================
.. highlight:: cpp
gpu::HOGDescriptor
------------------
.. ocv:struct:: gpu::HOGDescriptor
The class implements Histogram of Oriented Gradients ([Dalal2005]_) object detector. ::
struct CV_EXPORTS HOGDescriptor
{
enum { DEFAULT_WIN_SIGMA = -1 };
enum { DEFAULT_NLEVELS = 64 };
enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
double threshold_L2hys=0.2, bool gamma_correction=true,
int nlevels=DEFAULT_NLEVELS);
size_t getDescriptorSize() const;
size_t getBlockHistogramSize() const;
void setSVMDetector(const vector<float>& detector);
static vector<float> getDefaultPeopleDetector();
static vector<float> getPeopleDetector48x96();
static vector<float> getPeopleDetector64x128();
void detect(const GpuMat& img, vector<Point>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size());
void detectMultiScale(const GpuMat& img, vector<Rect>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size(), double scale0=1.05,
int group_threshold=2);
void getDescriptors(const GpuMat& img, Size win_stride,
GpuMat& descriptors,
int descr_format=DESCR_FORMAT_COL_BY_COL);
Size win_size;
Size block_size;
Size block_stride;
Size cell_size;
int nbins;
double win_sigma;
double threshold_L2hys;
bool gamma_correction;
int nlevels;
private:
// Hidden
}
Interfaces of all methods are kept similar to the ``CPU HOG`` descriptor and detector analogues as much as possible.
gpu::HOGDescriptor::HOGDescriptor
-------------------------------------
Creates the ``HOG`` descriptor and detector.
.. ocv:function:: gpu::HOGDescriptor::HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16), Size block_stride=Size(8, 8), Size cell_size=Size(8, 8), int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA, double threshold_L2hys=0.2, bool gamma_correction=true, int nlevels=DEFAULT_NLEVELS)
:param win_size: Detection window size. Align to block size and block stride.
:param block_size: Block size in pixels. Align to cell size. Only (16,16) is supported for now.
:param block_stride: Block stride. It must be a multiple of cell size.
:param cell_size: Cell size. Only (8, 8) is supported for now.
:param nbins: Number of bins. Only 9 bins per cell are supported for now.
:param win_sigma: Gaussian smoothing window parameter.
:param threshold_L2hys: L2-Hys normalization method shrinkage.
:param gamma_correction: Flag to specify whether the gamma correction preprocessing is required or not.
:param nlevels: Maximum number of detection window increases.
gpu::HOGDescriptor::getDescriptorSize
-----------------------------------------
Returns the number of coefficients required for the classification.
.. ocv:function:: size_t gpu::HOGDescriptor::getDescriptorSize() const
gpu::HOGDescriptor::getBlockHistogramSize
---------------------------------------------
Returns the block histogram size.
.. ocv:function:: size_t gpu::HOGDescriptor::getBlockHistogramSize() const
gpu::HOGDescriptor::setSVMDetector
--------------------------------------
Sets coefficients for the linear SVM classifier.
.. ocv:function:: void gpu::HOGDescriptor::setSVMDetector(const vector<float>& detector)
gpu::HOGDescriptor::getDefaultPeopleDetector
------------------------------------------------
Returns coefficients of the classifier trained for people detection (for default window size).
.. ocv:function:: static vector<float> gpu::HOGDescriptor::getDefaultPeopleDetector()
gpu::HOGDescriptor::getPeopleDetector48x96
----------------------------------------------
Returns coefficients of the classifier trained for people detection (for 48x96 windows).
.. ocv:function:: static vector<float> gpu::HOGDescriptor::getPeopleDetector48x96()
gpu::HOGDescriptor::getPeopleDetector64x128
-----------------------------------------------
Returns coefficients of the classifier trained for people detection (for 64x128 windows).
.. ocv:function:: static vector<float> gpu::HOGDescriptor::getPeopleDetector64x128()
gpu::HOGDescriptor::detect
------------------------------
Performs object detection without a multi-scale window.
.. ocv:function:: void gpu::HOGDescriptor::detect(const GpuMat& img, vector<Point>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size())
:param img: Source image. ``CV_8UC1`` and ``CV_8UC4`` types are supported for now.
:param found_locations: Left-top corner points of detected objects boundaries.
:param hit_threshold: Threshold for the distance between features and SVM classifying plane. Usually it is 0 and should be specfied in the detector coefficients (as the last free coefficient). But if the free coefficient is omitted (which is allowed), you can specify it manually here.
:param win_stride: Window stride. It must be a multiple of block stride.
:param padding: Mock parameter to keep the CPU interface compatibility. It must be (0,0).
gpu::HOGDescriptor::detectMultiScale
----------------------------------------
Performs object detection with a multi-scale window.
.. ocv:function:: void gpu::HOGDescriptor::detectMultiScale(const GpuMat& img, vector<Rect>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size(), double scale0=1.05, int group_threshold=2)
:param img: Source image. See :ocv:func:`gpu::HOGDescriptor::detect` for type limitations.
:param found_locations: Detected objects boundaries.
:param hit_threshold: Threshold for the distance between features and SVM classifying plane. See :ocv:func:`gpu::HOGDescriptor::detect` for details.
:param win_stride: Window stride. It must be a multiple of block stride.
:param padding: Mock parameter to keep the CPU interface compatibility. It must be (0,0).
:param scale0: Coefficient of the detection window increase.
:param group_threshold: Coefficient to regulate the similarity threshold. When detected, some objects can be covered by many rectangles. 0 means not to perform grouping. See :ocv:func:`groupRectangles` .
gpu::HOGDescriptor::getDescriptors
--------------------------------------
Returns block descriptors computed for the whole image.
.. ocv:function:: void gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format=DESCR_FORMAT_COL_BY_COL)
:param img: Source image. See :ocv:func:`gpu::HOGDescriptor::detect` for type limitations.
:param win_stride: Window stride. It must be a multiple of block stride.
:param descriptors: 2D array of descriptors.
:param descr_format: Descriptor storage format:
* **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
* **DESCR_FORMAT_COL_BY_COL** - Column-major order.
The function is mainly used to learn the classifier.
gpu::CascadeClassifier_GPU
--------------------------
.. ocv:class:: gpu::CascadeClassifier_GPU
Cascade classifier class used for object detection. Supports HAAR and LBP cascades. ::
class CV_EXPORTS CascadeClassifier_GPU
{
public:
CascadeClassifier_GPU();
CascadeClassifier_GPU(const String& filename);
~CascadeClassifier_GPU();
bool empty() const;
bool load(const String& filename);
void release();
/* Returns number of detected objects */
int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());
int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
/* Finds only the largest object. Special mode if training is required.*/
bool findLargestObject;
/* Draws rectangles in input image */
bool visualizeInPlace;
Size getClassifierSize() const;
};
gpu::CascadeClassifier_GPU::CascadeClassifier_GPU
-----------------------------------------------------
Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
.. ocv:function:: gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String& filename)
:param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR and only new type of OpenCV XML cascade supported for LBP.
gpu::CascadeClassifier_GPU::empty
-------------------------------------
Checks whether the classifier is loaded or not.
.. ocv:function:: bool gpu::CascadeClassifier_GPU::empty() const
gpu::CascadeClassifier_GPU::load
------------------------------------
Loads the classifier from a file. The previous content is destroyed.
.. ocv:function:: bool gpu::CascadeClassifier_GPU::load(const String& filename)
:param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR and only new type of OpenCV XML cascade supported for LBP.
gpu::CascadeClassifier_GPU::release
---------------------------------------
Destroys the loaded classifier.
.. ocv:function:: void gpu::CascadeClassifier_GPU::release()
gpu::CascadeClassifier_GPU::detectMultiScale
------------------------------------------------
Detects objects of different sizes in the input image.
.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size())
.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4)
:param image: Matrix of type ``CV_8U`` containing an image where objects should be detected.
:param objectsBuf: Buffer to store detected objects (rectangles). If it is empty, it is allocated with the default size. If not empty, the function searches not more than N objects, where ``N = sizeof(objectsBufer's data)/sizeof(cv::Rect)``.
:param maxObjectSize: Maximum possible object size. Objects larger than that are ignored. Used for second signature and supported only for LBP cascades.
:param scaleFactor: Parameter specifying how much the image size is reduced at each image scale.
:param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.
:param minSize: Minimum possible object size. Objects smaller than that are ignored.
The detected objects are returned as a list of rectangles.
The function returns the number of detected objects, so you can retrieve them as in the following example: ::
gpu::CascadeClassifier_GPU cascade_gpu(...);
Mat image_cpu = imread(...)
GpuMat image_gpu(image_cpu);
GpuMat objbuf;
int detections_number = cascade_gpu.detectMultiScale( image_gpu,
objbuf, 1.2, minNeighbors);
Mat obj_host;
// download only detected number of rectangles
objbuf.colRange(0, detections_number).download(obj_host);
Rect* faces = obj_host.ptr<Rect>();
for(int i = 0; i < detections_num; ++i)
cv::rectangle(image_cpu, faces[i], Scalar(255));
imshow("Faces", image_cpu);
.. seealso:: :ocv:func:`CascadeClassifier::detectMultiScale`
.. [Dalal2005] Navneet Dalal and Bill Triggs. *Histogram of oriented gradients for human detection*. 2005.

View File

@@ -51,10 +51,129 @@
#include "opencv2/gpufeatures2d.hpp"
#include "opencv2/gpuvideo.hpp"
#include "opencv2/gpustereo.hpp"
#include "opencv2/gpuobjdetect.hpp"
namespace cv { namespace gpu {
//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
struct CV_EXPORTS HOGConfidence
{
double scale;
std::vector<Point> locations;
std::vector<double> confidences;
std::vector<double> part_scores[4];
};
struct CV_EXPORTS HOGDescriptor
{
enum { DEFAULT_WIN_SIGMA = -1 };
enum { DEFAULT_NLEVELS = 64 };
enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
double threshold_L2hys=0.2, bool gamma_correction=true,
int nlevels=DEFAULT_NLEVELS);
size_t getDescriptorSize() const;
size_t getBlockHistogramSize() const;
void setSVMDetector(const std::vector<float>& detector);
static std::vector<float> getDefaultPeopleDetector();
static std::vector<float> getPeopleDetector48x96();
static std::vector<float> getPeopleDetector64x128();
void detect(const GpuMat& img, std::vector<Point>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size());
void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size(), double scale0=1.05,
int group_threshold=2);
void computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences);
void computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold, Size win_stride, Size padding,
std::vector<HOGConfidence> &conf_out, int group_threshold);
void getDescriptors(const GpuMat& img, Size win_stride,
GpuMat& descriptors,
int descr_format=DESCR_FORMAT_COL_BY_COL);
Size win_size;
Size block_size;
Size block_stride;
Size cell_size;
int nbins;
double win_sigma;
double threshold_L2hys;
bool gamma_correction;
int nlevels;
protected:
void computeBlockHistograms(const GpuMat& img);
void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
double getWinSigma() const;
bool checkDetectorSize() const;
static int numPartsWithin(int size, int part_size, int stride);
static Size numPartsWithin(Size size, Size part_size, Size stride);
// Coefficients of the separating plane
float free_coef;
GpuMat detector;
// Results of the last classification step
GpuMat labels, labels_buf;
Mat labels_host;
// Results of the last histogram evaluation step
GpuMat block_hists, block_hists_buf;
// Gradients conputation results
GpuMat grad, qangle, grad_buf, qangle_buf;
// returns subbuffer with required size, reallocates buffer if nessesary.
static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);
std::vector<GpuMat> image_scales;
};
// The cascade classifier class for object detection: supports old haar and new lbp xlm formats and nvbin for haar cascades olny.
class CV_EXPORTS CascadeClassifier_GPU
{
public:
CascadeClassifier_GPU();
CascadeClassifier_GPU(const String& filename);
~CascadeClassifier_GPU();
bool empty() const;
bool load(const String& filename);
void release();
/* returns number of detected objects */
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
bool findLargestObject;
bool visualizeInPlace;
Size getClassifierSize() const;
private:
struct CascadeClassifierImpl;
CascadeClassifierImpl* impl;
struct HaarCascade;
struct LbpCascade;
friend class CascadeClassifier_GPU_LBP;
};
//!performs labeling via graph cuts of a 2D regular 4-connected graph.
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
GpuMat& buf, Stream& stream = Stream::Null());

View File

@@ -0,0 +1,173 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
///////////////////////////////////////////////////////////////
// HOG
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, ObjDetect_HOG,
Values<string>("gpu/hog/road.png",
"gpu/caltech/image_00000009_0.png",
"gpu/caltech/image_00000032_0.png",
"gpu/caltech/image_00000165_0.png",
"gpu/caltech/image_00000261_0.png",
"gpu/caltech/image_00000469_0.png",
"gpu/caltech/image_00000527_0.png",
"gpu/caltech/image_00000574_0.png"))
{
declare.time(300.0);
const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_img(img);
std::vector<cv::Rect> gpu_found_locations;
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
SANITY_CHECK(gpu_found_locations);
}
else
{
std::vector<cv::Rect> cpu_found_locations;
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);
SANITY_CHECK(cpu_found_locations);
}
}
///////////////////////////////////////////////////////////////
// HaarClassifier
typedef pair<string, string> pair_string;
DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
{
const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat objects_buffer;
int detections_num = 0;
TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
std::vector<cv::Rect> gpu_rects(detections_num);
cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
cv::groupRectangles(gpu_rects, 3, 0.2);
SANITY_CHECK(gpu_rects);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
std::vector<cv::Rect> cpu_rects;
TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);
SANITY_CHECK(cpu_rects);
}
}
///////////////////////////////////////////////////////////////
// LBP cascade
PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
{
const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat objects_buffer;
int detections_num = 0;
TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
std::vector<cv::Rect> gpu_rects(detections_num);
cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
cv::groupRectangles(gpu_rects, 3, 0.2);
SANITY_CHECK(gpu_rects);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
std::vector<cv::Rect> cpu_rects;
TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);
SANITY_CHECK(cpu_rects);
}
}

View File

@@ -56,6 +56,7 @@
#include "opencv2/gpu.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined

View File

@@ -0,0 +1,755 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencv2/objdetect/objdetect_c.h"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_no_cuda(); }
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String&) { throw_no_cuda(); }
cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { throw_no_cuda(); }
bool cv::gpu::CascadeClassifier_GPU::empty() const { throw_no_cuda(); return true; }
bool cv::gpu::CascadeClassifier_GPU::load(const String&) { throw_no_cuda(); return true; }
Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { throw_no_cuda(); return Size();}
void cv::gpu::CascadeClassifier_GPU::release() { throw_no_cuda(); }
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size) {throw_no_cuda(); return -1;}
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_no_cuda(); return -1;}
#else
struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
CascadeClassifierImpl(){}
virtual ~CascadeClassifierImpl(){}
virtual unsigned int process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize, cv::Size maxObjectSize) = 0;
virtual cv::Size getClassifierCvSize() const = 0;
virtual bool read(const String& classifierAsXml) = 0;
};
#ifndef HAVE_OPENCV_GPULEGACY
struct cv::gpu::CascadeClassifier_GPU::HaarCascade
{
public:
HaarCascade()
{
throw_no_cuda();
}
unsigned int process(const GpuMat&, GpuMat&, float, int, bool, bool, cv::Size, cv::Size)
{
throw_no_cuda();
return 0;
}
cv::Size getClassifierCvSize() const
{
throw_no_cuda();
return cv::Size();
}
bool read(const String&)
{
throw_no_cuda();
return false;
}
};
#else
struct cv::gpu::CascadeClassifier_GPU::HaarCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
HaarCascade() : lastAllocatedFrameSize(-1, -1)
{
ncvSetDebugOutputHandler(NCVDebugOutputHandler);
}
bool read(const String& filename)
{
ncvSafeCall( load(filename) );
return true;
}
NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize,
/*out*/unsigned int& numDetections)
{
calculateMemReqsAndAllocate(src.size());
NCVMemPtr src_beg;
src_beg.ptr = (void*)src.ptr<Ncv8u>();
src_beg.memtype = NCVMemoryTypeDevice;
NCVMemSegment src_seg;
src_seg.begin = src_beg;
src_seg.size = src.step * src.rows;
NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
CV_Assert(objects.rows == 1);
NCVMemPtr objects_beg;
objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
objects_beg.memtype = NCVMemoryTypeDevice;
NCVMemSegment objects_seg;
objects_seg.begin = objects_beg;
objects_seg.size = objects.step * objects.rows;
NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
Ncv32u flags = 0;
flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;
flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace : 0;
ncvStat = ncvDetectObjectsMultiScale_device(
d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures,
winMinSize,
minNeighbors,
scaleStep, 1,
flags,
*gpuAllocator, *cpuAllocator, devProp, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
{
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
const int defaultObjSearchNum = 100;
if (objectsBuf.empty())
{
objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);
}
cv::Size ncvMinSize = this->getClassifierCvSize();
if (ncvMinSize.width < minSize.width && ncvMinSize.height < minSize.height)
{
ncvMinSize.width = minSize.width;
ncvMinSize.height = minSize.height;
}
unsigned int numDetections;
ncvSafeCall(this->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections));
return numDetections;
}
cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }
private:
static void NCVDebugOutputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
NCVStatus load(const String& classifierFile)
{
int devId = cv::gpu::getDevice();
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
// Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, static_cast<int>(devProp.textureAlignment));
cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR);
h_haarStages = new NCVVectorAlloc<HaarStage64>(*cpuCascadeAllocator, haarNumStages);
h_haarNodes = new NCVVectorAlloc<HaarClassifierNode128>(*cpuCascadeAllocator, haarNumNodes);
h_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*cpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR);
d_haarStages = new NCVVectorAlloc<HaarStage64>(*gpuCascadeAllocator, haarNumStages);
d_haarNodes = new NCVVectorAlloc<HaarClassifierNode128>(*gpuCascadeAllocator, haarNumNodes);
d_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*gpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvStat = h_haarStages->copySolid(*d_haarStages, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)
{
if (lastAllocatedFrameSize == frameSize)
{
return NCV_SUCCESS;
}
// Calculate memory requirements and create real allocators
NCVMemStackAllocator gpuCounter(static_cast<int>(devProp.textureAlignment));
NCVMemStackAllocator cpuCounter(static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR);
NCVMatrixAlloc<Ncv8u> d_src(gpuCounter, frameSize.width, frameSize.height);
NCVMatrixAlloc<Ncv8u> h_src(cpuCounter, frameSize.width, frameSize.height);
ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<NcvRect32u> d_rects(gpuCounter, 100);
ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
Ncv32u numDetections;
ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR);
lastAllocatedFrameSize = frameSize;
return NCV_SUCCESS;
}
cudaDeviceProp devProp;
NCVStatus ncvStat;
Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
Ptr<NCVVectorAlloc<HaarStage64> > h_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > h_haarFeatures;
HaarClassifierCascadeDescriptor haar;
Ptr<NCVVectorAlloc<HaarStage64> > d_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > d_haarFeatures;
Size lastAllocatedFrameSize;
Ptr<NCVMemStackAllocator> gpuAllocator;
Ptr<NCVMemStackAllocator> cpuAllocator;
virtual ~HaarCascade(){}
};
#endif
cv::Size operator -(const cv::Size& a, const cv::Size& b)
{
return cv::Size(a.width - b.width, a.height - b.height);
}
cv::Size operator +(const cv::Size& a, const int& i)
{
return cv::Size(a.width + i, a.height + i);
}
cv::Size operator *(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
}
cv::Size operator /(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
}
bool operator <=(const cv::Size& a, const cv::Size& b)
{
return a.width <= b.width && a.height <= b.width;
}
struct PyrLavel
{
PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize)
{
do
{
order = _order;
scale = pow(_scale, order);
sFrame = frame / scale;
workArea = sFrame - window + 1;
sWindow = window * scale;
_order++;
} while (sWindow <= minObjectSize);
}
bool isFeasible(cv::Size maxObj)
{
return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
}
PyrLavel next(float factor, cv::Size frame, cv::Size window, cv::Size minObjectSize)
{
return PyrLavel(order + 1, factor, frame, window, minObjectSize);
}
int order;
float scale;
cv::Size sFrame;
cv::Size workArea;
cv::Size sWindow;
};
namespace cv { namespace gpu { namespace cudev
{
namespace lbp
{
void classifyPyramid(int frameW,
int frameH,
int windowW,
int windowH,
float initalScale,
float factor,
int total,
const PtrStepSzb& mstages,
const int nstages,
const PtrStepSzi& mnodes,
const PtrStepSzf& mleaves,
const PtrStepSzi& msubsets,
const PtrStepSzb& mfeatures,
const int subsetSize,
PtrStepSz<int4> objects,
unsigned int* classified,
PtrStepSzi integral);
void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
}
}}}
struct cv::gpu::CascadeClassifier_GPU::LbpCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
struct Stage
{
int first;
int ntrees;
float threshold;
};
LbpCascade(){}
virtual ~LbpCascade(){}
virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool /*findLargestObject*/,
bool /*visualizeInPlace*/, cv::Size minObjectSize, cv::Size maxObjectSize)
{
CV_Assert(scaleFactor > 1 && image.depth() == CV_8U);
// const int defaultObjSearchNum = 100;
const float grouping_eps = 0.2f;
if( !objects.empty() && objects.depth() == CV_32S)
objects.reshape(4, 1);
else
objects.create(1 , image.cols >> 4, CV_32SC4);
// used for debug
// candidates.setTo(cv::Scalar::all(0));
// objects.setTo(cv::Scalar::all(0));
if (maxObjectSize == cv::Size())
maxObjectSize = image.size();
allocateBuffers(image.size());
unsigned int classified = 0;
GpuMat dclassified(1, 1, CV_32S);
cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );
PyrLavel level(0, 1.0f, image.size(), NxM, minObjectSize);
while (level.isFeasible(maxObjectSize))
{
int acc = level.sFrame.width + 1;
float iniScale = level.scale;
cv::Size area = level.workArea;
int step = 1 + (level.scale <= 2.f);
int total = 0, prev = 0;
while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
{
// create sutable matrix headers
GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
GpuMat buff = integralBuffer;
// generate integral for scale
gpu::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
gpu::integralBuffered(src, sint, buff);
// calculate job
int totalWidth = level.workArea.width / step;
total += totalWidth * (level.workArea.height / step);
// go to next pyramide level
level = level.next(scaleFactor, image.size(), NxM, minObjectSize);
area = level.workArea;
step = (1 + (level.scale <= 2.f));
prev = acc;
acc += level.sFrame.width + 1;
}
cudev::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
}
if (groupThreshold <= 0 || objects.empty())
return 0;
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
cudev::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaDeviceSynchronize() );
return classified;
}
virtual cv::Size getClassifierCvSize() const { return NxM; }
bool read(const String& classifierAsXml)
{
FileStorage fs(classifierAsXml, FileStorage::READ);
return fs.isOpened() ? read(fs.getFirstTopLevelNode()) : false;
}
private:
void allocateBuffers(cv::Size frame)
{
if (frame == cv::Size())
return;
if (resuzeBuffer.empty() || frame.width > resuzeBuffer.cols || frame.height > resuzeBuffer.rows)
{
resuzeBuffer.create(frame, CV_8UC1);
integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
NcvSize32u roiSize;
roiSize.width = frame.width;
roiSize.height = frame.height;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
integralBuffer.create(1, bufSize, CV_8UC1);
candidates.create(1 , frame.width >> 1, CV_32SC4);
}
}
bool read(const FileNode &root)
{
const char *GPU_CC_STAGE_TYPE = "stageType";
const char *GPU_CC_FEATURE_TYPE = "featureType";
const char *GPU_CC_BOOST = "BOOST";
const char *GPU_CC_LBP = "LBP";
const char *GPU_CC_MAX_CAT_COUNT = "maxCatCount";
const char *GPU_CC_HEIGHT = "height";
const char *GPU_CC_WIDTH = "width";
const char *GPU_CC_STAGE_PARAMS = "stageParams";
const char *GPU_CC_MAX_DEPTH = "maxDepth";
const char *GPU_CC_FEATURE_PARAMS = "featureParams";
const char *GPU_CC_STAGES = "stages";
const char *GPU_CC_STAGE_THRESHOLD = "stageThreshold";
const float GPU_THRESHOLD_EPS = 1e-5f;
const char *GPU_CC_WEAK_CLASSIFIERS = "weakClassifiers";
const char *GPU_CC_INTERNAL_NODES = "internalNodes";
const char *GPU_CC_LEAF_VALUES = "leafValues";
const char *GPU_CC_FEATURES = "features";
const char *GPU_CC_RECT = "rect";
String stageTypeStr = (String)root[GPU_CC_STAGE_TYPE];
CV_Assert(stageTypeStr == GPU_CC_BOOST);
String featureTypeStr = (String)root[GPU_CC_FEATURE_TYPE];
CV_Assert(featureTypeStr == GPU_CC_LBP);
NxM.width = (int)root[GPU_CC_WIDTH];
NxM.height = (int)root[GPU_CC_HEIGHT];
CV_Assert( NxM.height > 0 && NxM.width > 0 );
isStumps = ((int)(root[GPU_CC_STAGE_PARAMS][GPU_CC_MAX_DEPTH]) == 1) ? true : false;
CV_Assert(isStumps);
FileNode fn = root[GPU_CC_FEATURE_PARAMS];
if (fn.empty())
return false;
ncategories = fn[GPU_CC_MAX_CAT_COUNT];
subsetSize = (ncategories + 31) / 32;
nodeStep = 3 + ( ncategories > 0 ? subsetSize : 1 );
fn = root[GPU_CC_STAGES];
if (fn.empty())
return false;
std::vector<Stage> stages;
stages.reserve(fn.size());
std::vector<int> cl_trees;
std::vector<int> cl_nodes;
std::vector<float> cl_leaves;
std::vector<int> subsets;
FileNodeIterator it = fn.begin(), it_end = fn.end();
for (size_t si = 0; it != it_end; si++, ++it )
{
FileNode fns = *it;
Stage st;
st.threshold = (float)fns[GPU_CC_STAGE_THRESHOLD] - GPU_THRESHOLD_EPS;
fns = fns[GPU_CC_WEAK_CLASSIFIERS];
if (fns.empty())
return false;
st.ntrees = (int)fns.size();
st.first = (int)cl_trees.size();
stages.push_back(st);// (int, int, float)
cl_trees.reserve(stages[si].first + stages[si].ntrees);
// weak trees
FileNodeIterator it1 = fns.begin(), it1_end = fns.end();
for ( ; it1 != it1_end; ++it1 )
{
FileNode fnw = *it1;
FileNode internalNodes = fnw[GPU_CC_INTERNAL_NODES];
FileNode leafValues = fnw[GPU_CC_LEAF_VALUES];
if ( internalNodes.empty() || leafValues.empty() )
return false;
int nodeCount = (int)internalNodes.size()/nodeStep;
cl_trees.push_back(nodeCount);
cl_nodes.reserve((cl_nodes.size() + nodeCount) * 3);
cl_leaves.reserve(cl_leaves.size() + leafValues.size());
if( subsetSize > 0 )
subsets.reserve(subsets.size() + nodeCount * subsetSize);
// nodes
FileNodeIterator iIt = internalNodes.begin(), iEnd = internalNodes.end();
for( ; iIt != iEnd; )
{
cl_nodes.push_back((int)*(iIt++));
cl_nodes.push_back((int)*(iIt++));
cl_nodes.push_back((int)*(iIt++));
if( subsetSize > 0 )
for( int j = 0; j < subsetSize; j++, ++iIt )
subsets.push_back((int)*iIt);
}
// leaves
iIt = leafValues.begin(), iEnd = leafValues.end();
for( ; iIt != iEnd; ++iIt )
cl_leaves.push_back((float)*iIt);
}
}
fn = root[GPU_CC_FEATURES];
if( fn.empty() )
return false;
std::vector<uchar> features;
features.reserve(fn.size() * 4);
FileNodeIterator f_it = fn.begin(), f_end = fn.end();
for (; f_it != f_end; ++f_it)
{
FileNode rect = (*f_it)[GPU_CC_RECT];
FileNodeIterator r_it = rect.begin();
features.push_back(saturate_cast<uchar>((int)*(r_it++)));
features.push_back(saturate_cast<uchar>((int)*(r_it++)));
features.push_back(saturate_cast<uchar>((int)*(r_it++)));
features.push_back(saturate_cast<uchar>((int)*(r_it++)));
}
// copy data structures on gpu
stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));
subsets_mat.upload(cv::Mat(subsets).reshape(1,1));
features_mat.upload(cv::Mat(features).reshape(4,1));
return true;
}
enum stage { BOOST = 0 };
enum feature { LBP = 1, HAAR = 2 };
static const stage stageType = BOOST;
static const feature featureType = LBP;
cv::Size NxM;
bool isStumps;
int ncategories;
int subsetSize;
int nodeStep;
// gpu representation of classifier
GpuMat stage_mat;
GpuMat trees_mat;
GpuMat nodes_mat;
GpuMat leaves_mat;
GpuMat subsets_mat;
GpuMat features_mat;
GpuMat integral;
GpuMat integralBuffer;
GpuMat resuzeBuffer;
GpuMat candidates;
static const int integralFactor = 4;
};
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU()
: findLargestObject(false), visualizeInPlace(false), impl(0) {}
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String& filename)
: findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }
cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); }
void cv::gpu::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } }
bool cv::gpu::CascadeClassifier_GPU::empty() const { return impl == 0; }
Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const
{
return this->empty() ? Size() : impl->getClassifierCvSize();
}
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
{
CV_Assert( !this->empty());
return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
}
int cv::gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
{
CV_Assert( !this->empty());
return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, maxObjectSize);
}
bool cv::gpu::CascadeClassifier_GPU::load(const String& filename)
{
release();
String fext = filename.substr(filename.find_last_of(".") + 1);
fext = fext.toLowerCase();
if (fext == "nvbin")
{
impl = new HaarCascade();
return impl->read(filename);
}
FileStorage fs(filename, FileStorage::READ);
if (!fs.isOpened())
{
impl = new HaarCascade();
return impl->read(filename);
}
const char *GPU_CC_LBP = "LBP";
String featureTypeStr = (String)fs.getFirstTopLevelNode()["featureType"];
if (featureTypeStr == GPU_CC_LBP)
impl = new LbpCascade();
else
impl = new HaarCascade();
impl->read(filename);
return !this->empty();
}
#endif

814
modules/gpu/src/cuda/hog.cu Normal file
View File

@@ -0,0 +1,814 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace cudev
{
// Other values are not supported
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
namespace hog
{
__constant__ int cnbins;
__constant__ int cblock_stride_x;
__constant__ int cblock_stride_y;
__constant__ int cnblocks_win_x;
__constant__ int cnblocks_win_y;
__constant__ int cblock_hist_size;
__constant__ int cblock_hist_size_2up;
__constant__ int cdescr_size;
__constant__ int cdescr_width;
/* Returns the nearest upper power of two, works only for
the typical GPU thread count (pert block) values */
int power_2up(unsigned int n)
{
if (n < 1) return 1;
else if (n < 2) return 2;
else if (n < 4) return 4;
else if (n < 8) return 8;
else if (n < 16) return 16;
else if (n < 32) return 32;
else if (n < 64) return 64;
else if (n < 128) return 128;
else if (n < 256) return 256;
else if (n < 512) return 512;
else if (n < 1024) return 1024;
return -1; // Input is too big
}
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y)
{
cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
int block_hist_size_2up = power_2up(block_hist_size);
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
int descr_width = nblocks_win_x * block_hist_size;
cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );
int descr_size = descr_width * nblocks_win_y;
cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
}
//----------------------------------------------------------------------------
// Histogram computation
template <int nblocks> // Number of histogram blocks processed by single GPU thread block
__global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad,
const PtrStepb qangle, float scale, float* block_hists)
{
const int block_x = threadIdx.z;
const int cell_x = threadIdx.x / 16;
const int cell_y = threadIdx.y;
const int cell_thread_x = threadIdx.x & 0xF;
if (blockIdx.x * blockDim.z + block_x >= img_block_width)
return;
extern __shared__ float smem[];
float* hists = smem;
float* final_hist = smem + cnbins * 48 * nblocks;
const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
4 * cell_x + cell_thread_x;
const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;
const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;
// 12 means that 12 pixels affect on block's cell (in one row)
if (cell_thread_x < 12)
{
float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y +
cell_x + block_x * CELLS_PER_BLOCK_X) +
cell_thread_x;
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
hist[bin_id * 48 * nblocks] = 0.f;
const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;
const int dist_y_begin = -4 - 4 * (int)threadIdx.y;
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{
float2 vote = *(const float2*)grad_ptr;
uchar2 bin = *(const uchar2*)qangle_ptr;
grad_ptr += grad.step/sizeof(float);
qangle_ptr += qangle.step;
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
float gaussian = ::expf(-(dist_center_y * dist_center_y +
dist_center_x * dist_center_x) * scale);
float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
(8.f - ::fabs(dist_x + 0.5f)) / 64.f;
hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
}
volatile float* hist_ = hist;
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)
{
if (cell_thread_x < 6) hist_[0] += hist_[6];
if (cell_thread_x < 3) hist_[0] += hist_[3];
if (cell_thread_x == 0)
final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id]
= hist_[0] + hist_[1] + hist_[2];
}
}
__syncthreads();
float* block_hist = block_hists + (blockIdx.y * img_block_width +
blockIdx.x * blockDim.z + block_x) *
cblock_hist_size;
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
if (tid < cblock_hist_size)
block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];
}
void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const PtrStepSzf& grad,
const PtrStepSzb& qangle, float sigma, float* block_hists)
{
const int nblocks = 1;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /
block_stride_y;
dim3 grid(divUp(img_block_width, nblocks), img_block_height);
dim3 threads(32, 2, nblocks);
cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,
cudaFuncCachePreferL1));
// Precompute gaussian spatial window parameter
float scale = 1.f / (2.f * sigma * sigma);
int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);
int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);
int smem = hists_size + final_hists_size;
compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
img_block_width, grad, qangle, scale, block_hists);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
//-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm
//
template<int size>
__device__ float reduce_smem(float* smem, float val)
{
unsigned int tid = threadIdx.x;
float sum = val;
reduce<size>(smem, sum, tid, plus<float>());
if (size == 32)
{
#if __CUDA_ARCH__ >= 300
return shfl(sum, 0);
#else
return smem[0];
#endif
}
else
{
#if __CUDA_ARCH__ >= 300
if (threadIdx.x == 0)
smem[0] = sum;
#endif
__syncthreads();
return smem[0];
}
}
template <int nthreads, // Number of threads which process one block historgam
int nblocks> // Number of block hisograms processed by one GPU thread block
__global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
const int img_block_width,
float* block_hists, float threshold)
{
if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)
return;
float* hist = block_hists + (blockIdx.y * img_block_width +
blockIdx.x * blockDim.z + threadIdx.z) *
block_hist_size + threadIdx.x;
__shared__ float sh_squares[nthreads * nblocks];
float* squares = sh_squares + threadIdx.z * nthreads;
float elem = 0.f;
if (threadIdx.x < block_hist_size)
elem = hist[0];
float sum = reduce_smem<nthreads>(squares, elem * elem);
float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = ::min(elem * scale, threshold);
sum = reduce_smem<nthreads>(squares, elem * elem);
scale = 1.0f / (::sqrtf(sum) + 1e-3f);
if (threadIdx.x < block_hist_size)
hist[0] = elem * scale;
}
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, float* block_hists, float threshold)
{
const int nblocks = 1;
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
int nthreads = power_2up(block_hist_size);
dim3 threads(nthreads, 1, nblocks);
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
dim3 grid(divUp(img_block_width, nblocks), img_block_height);
if (nthreads == 32)
normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else if (nthreads == 64)
normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else if (nthreads == 128)
normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else if (nthreads == 256)
normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else if (nthreads == 512)
normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else
CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
//---------------------------------------------------------------------
// Linear SVM based classification
//
// return confidence values not just positive location
template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram block processed by single GPU thread block
__global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
float free_coef, float threshold, float* confidences)
{
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;
float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
__shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x;
reduce<nthreads>(products, product, tid, plus<float>());
if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
}
void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
float* coefs, float free_coef, float threshold, float *confidences)
{
const int nthreads = 256;
const int nblocks = 1;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);
cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
cudaFuncCachePreferL1));
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, confidences);
cudaSafeCall(cudaThreadSynchronize());
}
template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram block processed by single GPU thread block
__global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
float free_coef, float threshold, unsigned char* labels)
{
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;
float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
__shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x;
reduce<nthreads>(products, product, tid, plus<float>());
if (threadIdx.x == 0)
labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
}
void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
float* coefs, float free_coef, float threshold, unsigned char* labels)
{
const int nthreads = 256;
const int nblocks = 1;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);
cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, labels);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
//----------------------------------------------------------------------------
// Extract descriptors
template <int nthreads>
__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, PtrStepf descriptors)
{
// Get left top corner of the window in src
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);
// Copy elements from src to dst
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
}
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, float* block_hists, PtrStepSzf descriptors)
{
const int nthreads = 256;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1);
dim3 grid(img_win_width, img_win_height);
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int nthreads>
__global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
const int win_block_stride_y, const float* block_hists,
PtrStepf descriptors)
{
// Get left top corner of the window in src
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);
// Copy elements from src to dst
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int block_idx = i / cblock_hist_size;
int idx_in_block = i - block_idx * cblock_hist_size;
int y = block_idx / cnblocks_win_x;
int x = block_idx - y * cnblocks_win_x;
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block]
= hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
}
}
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
PtrStepSzf descriptors)
{
const int nthreads = 256;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1);
dim3 grid(img_win_width, img_win_height);
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
//----------------------------------------------------------------------------
// Gradients computation
template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrStepb img,
float angle_scale, PtrStepf grad, PtrStepb qangle)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const uchar4* row = (const uchar4*)img.ptr(blockIdx.y);
__shared__ float sh_row[(nthreads + 2) * 3];
uchar4 val;
if (x < width)
val = row[x];
else
val = row[width - 2];
sh_row[threadIdx.x + 1] = val.x;
sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y;
sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z;
if (threadIdx.x == 0)
{
val = row[::max(x - 1, 1)];
sh_row[0] = val.x;
sh_row[(nthreads + 2)] = val.y;
sh_row[2 * (nthreads + 2)] = val.z;
}
if (threadIdx.x == blockDim.x - 1)
{
val = row[::min(x + 1, width - 2)];
sh_row[blockDim.x + 1] = val.x;
sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;
sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;
}
__syncthreads();
if (x < width)
{
float3 a, b;
b.x = sh_row[threadIdx.x + 2];
b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)];
b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)];
a.x = sh_row[threadIdx.x];
a.y = sh_row[threadIdx.x + (nthreads + 2)];
a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)];
float3 dx;
if (correct_gamma)
dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
else
dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
float3 dy = make_float3(0.f, 0.f, 0.f);
if (blockIdx.y > 0 && blockIdx.y < height - 1)
{
val = ((const uchar4*)img.ptr(blockIdx.y - 1))[x];
a = make_float3(val.x, val.y, val.z);
val = ((const uchar4*)img.ptr(blockIdx.y + 1))[x];
b = make_float3(val.x, val.y, val.z);
if (correct_gamma)
dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
else
dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
}
float best_dx = dx.x;
float best_dy = dy.x;
float mag0 = dx.x * dx.x + dy.x * dy.x;
float mag1 = dx.y * dx.y + dy.y * dy.y;
if (mag0 < mag1)
{
best_dx = dx.y;
best_dy = dy.y;
mag0 = mag1;
}
mag1 = dx.z * dx.z + dy.z * dy.z;
if (mag0 < mag1)
{
best_dx = dx.z;
best_dy = dy.z;
mag0 = mag1;
}
mag0 = ::sqrtf(mag0);
float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)::floorf(ang);
ang -= hidx;
hidx = (hidx + cnbins) % cnbins;
((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
((float2*)grad.ptr(blockIdx.y))[x] = make_float2(mag0 * (1.f - ang), mag0 * ang);
}
}
void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
{
(void)nbins;
const int nthreads = 256;
dim3 bdim(nthreads, 1);
dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
if (correct_gamma)
compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
else
compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrStepb img,
float angle_scale, PtrStepf grad, PtrStepb qangle)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned char* row = (const unsigned char*)img.ptr(blockIdx.y);
__shared__ float sh_row[nthreads + 2];
if (x < width)
sh_row[threadIdx.x + 1] = row[x];
else
sh_row[threadIdx.x + 1] = row[width - 2];
if (threadIdx.x == 0)
sh_row[0] = row[::max(x - 1, 1)];
if (threadIdx.x == blockDim.x - 1)
sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];
__syncthreads();
if (x < width)
{
float dx;
if (correct_gamma)
dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);
else
dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];
float dy = 0.f;
if (blockIdx.y > 0 && blockIdx.y < height - 1)
{
float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];
float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];
if (correct_gamma)
dy = ::sqrtf(a) - ::sqrtf(b);
else
dy = a - b;
}
float mag = ::sqrtf(dx * dx + dy * dy);
float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)::floorf(ang);
ang -= hidx;
hidx = (hidx + cnbins) % cnbins;
((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
((float2*) grad.ptr(blockIdx.y))[x] = make_float2(mag * (1.f - ang), mag * ang);
}
}
void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
{
(void)nbins;
const int nthreads = 256;
dim3 bdim(nthreads, 1);
dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
if (correct_gamma)
compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
else
compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
//-------------------------------------------------------------------
// Resize
texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
}
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
}
}
template<class T, class TEX>
static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
{
tex.filterMode = cudaFilterModeLinear;
size_t texOfs = 0;
int colOfs = 0;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
if (texOfs != 0)
{
colOfs = static_cast<int>( texOfs/sizeof(T) );
cudaSafeCall( cudaUnbindTexture(tex) );
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
}
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture(tex) );
}
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
} // namespace hog
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

303
modules/gpu/src/cuda/lbp.cu Normal file
View File

@@ -0,0 +1,303 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "lbp.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace lbp
{
struct LBP
{
__host__ __device__ __forceinline__ LBP() {}
__device__ __forceinline__ int operator() (const int* integral, int ty, int fh, int fw, int& shift) const
{
int anchors[9];
anchors[0] = integral[ty];
anchors[1] = integral[ty + fw];
anchors[0] -= anchors[1];
anchors[2] = integral[ty + fw * 2];
anchors[1] -= anchors[2];
anchors[2] -= integral[ty + fw * 3];
ty += fh;
anchors[3] = integral[ty];
anchors[4] = integral[ty + fw];
anchors[3] -= anchors[4];
anchors[5] = integral[ty + fw * 2];
anchors[4] -= anchors[5];
anchors[5] -= integral[ty + fw * 3];
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// 0 - 2 contains s0 - s2
ty += fh;
anchors[6] = integral[ty];
anchors[7] = integral[ty + fw];
anchors[6] -= anchors[7];
anchors[8] = integral[ty + fw * 2];
anchors[7] -= anchors[8];
anchors[8] -= integral[ty + fw * 3];
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// 3 - 5 contains s3 - s5
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
int response = (~(anchors[0] >> 31)) & 4;
response |= (~(anchors[1] >> 31)) & 2;;
response |= (~(anchors[2] >> 31)) & 1;
shift = (~(anchors[5] >> 31)) & 16;
shift |= (~(anchors[3] >> 31)) & 1;
ty += fh;
anchors[0] = integral[ty];
anchors[1] = integral[ty + fw];
anchors[0] -= anchors[1];
anchors[2] = integral[ty + fw * 2];
anchors[1] -= anchors[2];
anchors[2] -= integral[ty + fw * 3];
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// 0 -2 contains s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
shift |= (~(anchors[6] >> 31)) & 2;
shift |= (~(anchors[7] >> 31)) & 4;
shift |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
template<typename Pr>
__global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
unsigned int tid = threadIdx.x;
extern __shared__ int sbuff[];
int* labels = sbuff;
int* rrects = sbuff + n;
Pr predicate(grouping_eps);
partition(candidates, n, labels, predicate);
rrects[tid * 4 + 0] = 0;
rrects[tid * 4 + 1] = 0;
rrects[tid * 4 + 2] = 0;
rrects[tid * 4 + 3] = 0;
__syncthreads();
int cls = labels[tid];
Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
__syncthreads();
labels[tid] = 0;
__syncthreads();
Emulation::smem::atomicInc((unsigned int*)labels + cls, n);
__syncthreads();
*nclasses = 0;
int active = labels[tid];
if (active)
{
int* r1 = rrects + tid * 4;
float s = 1.f / active;
r1[0] = saturate_cast<int>(r1[0] * s);
r1[1] = saturate_cast<int>(r1[1] * s);
r1[2] = saturate_cast<int>(r1[2] * s);
r1[3] = saturate_cast<int>(r1[3] * s);
}
__syncthreads();
if (active && active >= groupThreshold)
{
int* r1 = rrects + tid * 4;
int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
int aidx = Emulation::smem::atomicInc(nclasses, n);
objects[aidx] = r_out;
}
}
void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
if (!ncandidates) return;
int block = ncandidates;
int smem = block * ( sizeof(int) + sizeof(int4) );
disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
cudaSafeCall( cudaGetLastError() );
}
struct Cascade
{
__host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
const int* _subsets, const uchar4* _features, int _subsetSize)
: stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}
__device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch) const
{
int current_node = 0;
int current_leave = 0;
for (int s = 0; s < nstages; ++s)
{
float sum = 0;
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
{
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
int shift;
int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
sum += leaves[idx];
current_node += 1;
current_leave += 2;
}
if (sum < stage.threshold)
return false;
}
return true;
}
const Stage* stages;
const int nstages;
const ClNode* nodes;
const float* leaves;
const int* subsets;
const uchar4* features;
const int subsetSize;
const LBP evaluator;
};
// stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k
__global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
const int total, int* integral, const int pitch, PtrStepSz<int4> objects, unsigned int* classified)
{
int ftid = blockIdx.x * blockDim.x + threadIdx.x;
if (ftid >= total) return;
int step = (scale <= 2.f);
int windowsForLine = (__float2int_rn( __fdividef(frameW, scale)) - windowW) >> step;
int stotal = windowsForLine * ( (__float2int_rn( __fdividef(frameH, scale)) - windowH) >> step);
int wshift = 0;
int scaleTid = ftid;
while (scaleTid >= stotal)
{
scaleTid -= stotal;
wshift += __float2int_rn(__fdividef(frameW, scale)) + 1;
scale *= factor;
step = (scale <= 2.f);
windowsForLine = ( ((__float2int_rn(__fdividef(frameW, scale)) - windowW) >> step));
stotal = windowsForLine * ( (__float2int_rn(__fdividef(frameH, scale)) - windowH) >> step);
}
int y = __fdividef(scaleTid, windowsForLine);
int x = scaleTid - y * windowsForLine;
x <<= step;
y <<= step;
if (cascade(y, x + wshift, integral, pitch))
{
if(x >= __float2int_rn(__fdividef(frameW, scale)) - windowW) return;
int4 rect;
rect.x = __float2int_rn(x * scale);
rect.y = __float2int_rn(y * scale);
rect.z = __float2int_rn(windowW * scale);
rect.w = __float2int_rn(windowH * scale);
int res = atomicInc(classified, (unsigned int)objects.cols);
objects(0, res) = rect;
}
}
void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
const PtrStepSzb& mstages, const int nstages, const PtrStepSzi& mnodes, const PtrStepSzf& mleaves, const PtrStepSzi& msubsets, const PtrStepSzb& mfeatures,
const int subsetSize, PtrStepSz<int4> objects, unsigned int* classified, PtrStepSzi integral)
{
const int block = 128;
int grid = divUp(workAmount, block);
cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,112 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_DEVICE_LBP_HPP_
#define __OPENCV_GPU_DEVICE_LBP_HPP_
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
namespace cv { namespace gpu { namespace cudev {
namespace lbp {
struct Stage
{
int first;
int ntrees;
float threshold;
};
struct ClNode
{
int left;
int right;
int featureIdx;
};
struct InSameComponint
{
public:
__device__ __forceinline__ InSameComponint(float _eps) : eps(_eps) {}
__device__ __forceinline__ InSameComponint(const InSameComponint& other) : eps(other.eps) {}
__device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
{
float delta = eps * (::min(r1.z, r2.z) + ::min(r1.w, r2.w)) * 0.5f;
return ::abs(r1.x - r2.x) <= delta && ::abs(r1.y - r2.y) <= delta
&& ::abs(r1.x + r1.z - r2.x - r2.z) <= delta && ::abs(r1.y + r1.w - r2.y - r2.w) <= delta;
}
float eps;
};
template<typename Pr>
__device__ __forceinline__ void partition(int4* vec, unsigned int n, int* labels, Pr predicate)
{
unsigned tid = threadIdx.x;
labels[tid] = tid;
__syncthreads();
for (unsigned int id = 0; id < n; id++)
{
if (tid != id && predicate(vec[tid], vec[id]))
{
int p = labels[tid];
int q = labels[id];
if (p < q)
{
Emulation::smem::atomicMin(labels + id, p);
}
else if (p > q)
{
Emulation::smem::atomicMin(labels + tid, q);
}
}
}
__syncthreads();
}
} // lbp
} } }// namespaces
#endif

1619
modules/gpu/src/hog.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -43,9 +43,20 @@
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include <vector>
#include <iostream>
#include "opencv2/gpu.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/core/gpu_private.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_GPULEGACY
# include "opencv2/gpulegacy.hpp"
# include "opencv2/gpulegacy/private.hpp"
#endif
#endif /* __OPENCV_PRECOMP_H__ */

View File

@@ -0,0 +1,427 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
//#define DUMP
struct HOG : testing::TestWithParam<cv::gpu::DeviceInfo>, cv::gpu::HOGDescriptor
{
cv::gpu::DeviceInfo devInfo;
#ifdef DUMP
std::ofstream f;
#else
std::ifstream f;
#endif
int wins_per_img_x;
int wins_per_img_y;
int blocks_per_win_x;
int blocks_per_win_y;
int block_hist_size;
virtual void SetUp()
{
devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
}
#ifdef DUMP
void dump(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
{
f.write((char*)&blockHists.rows, sizeof(blockHists.rows));
f.write((char*)&blockHists.cols, sizeof(blockHists.cols));
for (int i = 0; i < blockHists.rows; ++i)
{
for (int j = 0; j < blockHists.cols; ++j)
{
float val = blockHists.at<float>(i, j);
f.write((char*)&val, sizeof(val));
}
}
int nlocations = locations.size();
f.write((char*)&nlocations, sizeof(nlocations));
for (int i = 0; i < locations.size(); ++i)
f.write((char*)&locations[i], sizeof(locations[i]));
}
#else
void compare(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
{
int rows, cols;
f.read((char*)&rows, sizeof(rows));
f.read((char*)&cols, sizeof(cols));
ASSERT_EQ(rows, blockHists.rows);
ASSERT_EQ(cols, blockHists.cols);
for (int i = 0; i < blockHists.rows; ++i)
{
for (int j = 0; j < blockHists.cols; ++j)
{
float val;
f.read((char*)&val, sizeof(val));
ASSERT_NEAR(val, blockHists.at<float>(i, j), 1e-3);
}
}
int nlocations;
f.read((char*)&nlocations, sizeof(nlocations));
ASSERT_EQ(nlocations, static_cast<int>(locations.size()));
for (int i = 0; i < nlocations; ++i)
{
cv::Point location;
f.read((char*)&location, sizeof(location));
ASSERT_EQ(location, locations[i]);
}
}
#endif
void testDetect(const cv::Mat& img)
{
gamma_correction = false;
setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
std::vector<cv::Point> locations;
// Test detect
detect(loadMat(img), locations, 0);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
#else
compare(cv::Mat(block_hists), locations);
#endif
// Test detect on smaller image
cv::Mat img2;
cv::resize(img, img2, cv::Size(img.cols / 2, img.rows / 2));
detect(loadMat(img2), locations, 0);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
#else
compare(cv::Mat(block_hists), locations);
#endif
// Test detect on greater image
cv::resize(img, img2, cv::Size(img.cols * 2, img.rows * 2));
detect(loadMat(img2), locations, 0);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
#else
compare(cv::Mat(block_hists), locations);
#endif
}
// Does not compare border value, as interpolation leads to delta
void compare_inner_parts(cv::Mat d1, cv::Mat d2)
{
for (int i = 1; i < blocks_per_win_y - 1; ++i)
for (int j = 1; j < blocks_per_win_x - 1; ++j)
for (int k = 0; k < block_hist_size; ++k)
{
float a = d1.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
float b = d2.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
ASSERT_FLOAT_EQ(a, b);
}
}
};
// desabled while resize does not fixed
GPU_TEST_P(HOG, Detect)
{
cv::Mat img_rgb = readImage("hog/road.png");
ASSERT_FALSE(img_rgb.empty());
#ifdef DUMP
f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
ASSERT_TRUE(f.is_open());
#else
f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
ASSERT_TRUE(f.is_open());
#endif
// Test on color image
cv::Mat img;
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
testDetect(img);
// Test on gray image
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
testDetect(img);
f.close();
}
GPU_TEST_P(HOG, GetDescriptors)
{
// Load image (e.g. train data, composed from windows)
cv::Mat img_rgb = readImage("hog/train_data.png");
ASSERT_FALSE(img_rgb.empty());
// Convert to C4
cv::Mat img;
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
cv::gpu::GpuMat d_img(img);
// Convert train images into feature vectors (train table)
cv::gpu::GpuMat descriptors, descriptors_by_cols;
getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);
// Check size of the result train table
wins_per_img_x = 3;
wins_per_img_y = 2;
blocks_per_win_x = 7;
blocks_per_win_y = 15;
block_hist_size = 36;
cv::Size descr_size_expected = cv::Size(blocks_per_win_x * blocks_per_win_y * block_hist_size,
wins_per_img_x * wins_per_img_y);
ASSERT_EQ(descr_size_expected, descriptors.size());
// Check both formats of output descriptors are handled correctly
cv::Mat dr(descriptors);
cv::Mat dc(descriptors_by_cols);
for (int i = 0; i < wins_per_img_x * wins_per_img_y; ++i)
{
const float* l = dr.rowRange(i, i + 1).ptr<float>();
const float* r = dc.rowRange(i, i + 1).ptr<float>();
for (int y = 0; y < blocks_per_win_y; ++y)
for (int x = 0; x < blocks_per_win_x; ++x)
for (int k = 0; k < block_hist_size; ++k)
ASSERT_EQ(l[(y * blocks_per_win_x + x) * block_hist_size + k],
r[(x * blocks_per_win_y + y) * block_hist_size + k]);
}
/* Now we want to extract the same feature vectors, but from single images. NOTE: results will
be defferent, due to border values interpolation. Using of many small images is slower, however we
wont't call getDescriptors and will use computeBlockHistograms instead of. computeBlockHistograms
works good, it can be checked in the gpu_hog sample */
img_rgb = readImage("hog/positive1.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::gpu::GpuMat(img));
// Everything is fine with interpolation for left top subimage
ASSERT_EQ(0.0, cv::norm((cv::Mat)block_hists, (cv::Mat)descriptors.rowRange(0, 1)));
img_rgb = readImage("hog/positive2.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::gpu::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));
img_rgb = readImage("hog/negative1.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::gpu::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));
img_rgb = readImage("hog/negative2.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::gpu::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));
img_rgb = readImage("hog/positive3.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::gpu::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));
img_rgb = readImage("hog/negative3.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::gpu::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
}
INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, HOG, ALL_DEVICES);
//============== caltech hog tests =====================//
struct CalTech : public ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string> >
{
cv::gpu::DeviceInfo devInfo;
cv::Mat img;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
}
};
GPU_TEST_P(CalTech, HOG)
{
cv::gpu::GpuMat d_img(img);
cv::Mat markedImage(img.clone());
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
d_hog.nlevels = d_hog.nlevels + 32;
std::vector<cv::Rect> found_locations;
d_hog.detectMultiScale(d_img, found_locations);
#if defined (LOG_CASCADE_STATISTIC)
for (int i = 0; i < (int)found_locations.size(); i++)
{
cv::Rect r = found_locations[i];
std::cout << r.x << " " << r.y << " " << r.width << " " << r.height << std::endl;
cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
}
cv::imshow("Res", markedImage); cv::waitKey();
#endif
}
INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png",
"caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png",
"caltech/image_00000527_0.png", "caltech/image_00000574_0.png")));
//////////////////////////////////////////////////////////////////////////////////////////
/// LBP classifier
PARAM_TEST_CASE(LBP_Read_classifier, cv::gpu::DeviceInfo, int)
{
cv::gpu::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(LBP_Read_classifier, Accuracy)
{
cv::gpu::CascadeClassifier_GPU classifier;
std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
ASSERT_TRUE(classifier.load(classifierXmlPath));
}
INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_Read_classifier,
testing::Combine(ALL_DEVICES, testing::Values<int>(0)));
PARAM_TEST_CASE(LBP_classify, cv::gpu::DeviceInfo, int)
{
cv::gpu::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(LBP_classify, Accuracy)
{
std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
std::string imagePath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/er.png";
cv::CascadeClassifier cpuClassifier(classifierXmlPath);
ASSERT_FALSE(cpuClassifier.empty());
cv::Mat image = cv::imread(imagePath);
image = image.colRange(0, image.cols/2);
cv::Mat grey;
cvtColor(image, grey, cv::COLOR_BGR2GRAY);
ASSERT_FALSE(image.empty());
std::vector<cv::Rect> rects;
cpuClassifier.detectMultiScale(grey, rects);
cv::Mat markedImage = image.clone();
std::vector<cv::Rect>::iterator it = rects.begin();
for (; it != rects.end(); ++it)
cv::rectangle(markedImage, *it, cv::Scalar(255, 0, 0));
cv::gpu::CascadeClassifier_GPU gpuClassifier;
ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
cv::gpu::GpuMat gpu_rects;
cv::gpu::GpuMat tested(grey);
int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
#if defined (LOG_CASCADE_STATISTIC)
cv::Mat downloaded(gpu_rects);
const cv::Rect* faces = downloaded.ptr<cv::Rect>();
for (int i = 0; i < count; i++)
{
cv::Rect r = faces[i];
std::cout << r.x << " " << r.y << " " << r.width << " " << r.height << std::endl;
cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
}
#endif
#if defined (LOG_CASCADE_STATISTIC)
cv::imshow("Res", markedImage); cv::waitKey();
#endif
(void)count;
}
INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_classify,
testing::Combine(ALL_DEVICES, testing::Values<int>(0)));
#endif // HAVE_CUDA

View File

@@ -51,12 +51,15 @@
#ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__
#include <fstream>
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_test.hpp"
#include "opencv2/gpu.hpp"
#include "opencv2/core.hpp"
#include "opencv2/core/opengl.hpp"
#include "opencv2/gpu.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#endif