diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp index 07f1cb9f1..ff665738a 100644 --- a/modules/objdetect/include/opencv2/objdetect.hpp +++ b/modules/objdetect/include/opencv2/objdetect.hpp @@ -285,10 +285,11 @@ public: CV_WRAP virtual void save(const String& filename, const String& objname = String()) const; virtual void copyTo(HOGDescriptor& c) const; - CV_WRAP virtual void compute(const Mat& img, + CV_WRAP virtual void compute(InputArray img, CV_OUT std::vector& descriptors, Size winStride = Size(), Size padding = Size(), const std::vector& locations = std::vector()) const; + //with found weights output CV_WRAP virtual void detect(const Mat& img, CV_OUT std::vector& foundLocations, CV_OUT std::vector& weights, @@ -300,13 +301,14 @@ public: double hitThreshold = 0, Size winStride = Size(), Size padding = Size(), const std::vector& searchLocations=std::vector()) const; + //with result weights output - CV_WRAP virtual void detectMultiScale(const Mat& img, CV_OUT std::vector& foundLocations, + CV_WRAP virtual void detectMultiScale(InputArray img, CV_OUT std::vector& foundLocations, CV_OUT std::vector& foundWeights, double hitThreshold = 0, Size winStride = Size(), Size padding = Size(), double scale = 1.05, double finalThreshold = 2.0,bool useMeanshiftGrouping = false) const; //without found weights output - virtual void detectMultiScale(const Mat& img, CV_OUT std::vector& foundLocations, + virtual void detectMultiScale(InputArray img, CV_OUT std::vector& foundLocations, double hitThreshold = 0, Size winStride = Size(), Size padding = Size(), double scale = 1.05, double finalThreshold = 2.0, bool useMeanshiftGrouping = false) const; @@ -328,25 +330,27 @@ public: CV_PROP double L2HysThreshold; CV_PROP bool gammaCorrection; CV_PROP std::vector svmDetector; + UMat oclSvmDetector; + float free_coef; CV_PROP int nlevels; - // evaluate specified ROI and return confidence value for each location - virtual void detectROI(const cv::Mat& img, const std::vector &locations, + // evaluate specified ROI and return confidence value for each location + virtual void detectROI(const cv::Mat& img, const std::vector &locations, CV_OUT std::vector& foundLocations, CV_OUT std::vector& confidences, double hitThreshold = 0, cv::Size winStride = Size(), cv::Size padding = Size()) const; - // evaluate specified ROI and return confidence value for each location in multiple scales - virtual void detectMultiScaleROI(const cv::Mat& img, + // evaluate specified ROI and return confidence value for each location in multiple scales + virtual void detectMultiScaleROI(const cv::Mat& img, CV_OUT std::vector& foundLocations, std::vector& locations, double hitThreshold = 0, int groupThreshold = 0) const; - // read/parse Dalal's alt model file - void readALTModel(String modelfile); - void groupRectangles(std::vector& rectList, std::vector& weights, int groupThreshold, double eps) const; + // read/parse Dalal's alt model file + void readALTModel(String modelfile); + void groupRectangles(std::vector& rectList, std::vector& weights, int groupThreshold, double eps) const; }; diff --git a/modules/objdetect/perf/opencl/perf_hogdetect.cpp b/modules/objdetect/perf/opencl/perf_hogdetect.cpp new file mode 100644 index 000000000..1d107151a --- /dev/null +++ b/modules/objdetect/perf/opencl/perf_hogdetect.cpp @@ -0,0 +1,94 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Fangfang Bai, fangfang@multicorewareinc.com +// Jin Ma, jin@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "perf_precomp.hpp" +#include "opencv2/ts/ocl_perf.hpp" + +#ifdef HAVE_OPENCL + +namespace cvtest { +namespace ocl { +///////////// HOG//////////////////////// + +struct RectLess : + public std::binary_function +{ + bool operator()(const cv::Rect& a, + const cv::Rect& b) const + { + if (a.x != b.x) + return a.x < b.x; + else if (a.y != b.y) + return a.y < b.y; + else if (a.width != b.width) + return a.width < b.width; + else + return a.height < b.height; + } +}; + +OCL_PERF_TEST(HOGFixture, HOG) +{ + UMat src; + imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE).copyTo(src); + ASSERT_FALSE(src.empty()); + + vector found_locations; + declare.in(src); + + HOGDescriptor hog; + hog.setSVMDetector(hog.getDefaultPeopleDetector()); + + OCL_TEST_CYCLE() hog.detectMultiScale(src, found_locations); + + std::sort(found_locations.begin(), found_locations.end(), RectLess()); + SANITY_CHECK(found_locations, 1 + DBL_EPSILON); +} + +} +} + +#endif diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp index cef5355c5..18bb7afc2 100644 --- a/modules/objdetect/src/hog.cpp +++ b/modules/objdetect/src/hog.cpp @@ -42,6 +42,7 @@ #include "precomp.hpp" #include "opencv2/core/core_c.h" +#include "opencl_kernels.hpp" #include #include @@ -58,6 +59,29 @@ namespace cv { +#define NTHREADS 256 + +enum {DESCR_FORMAT_COL_BY_COL, DESCR_FORMAT_ROW_BY_ROW}; + +static int numPartsWithin(int size, int part_size, int stride) +{ + return (size - part_size + stride) / stride; +} + +static Size numPartsWithin(cv::Size size, cv::Size part_size, + cv::Size stride) +{ + return Size(numPartsWithin(size.width, part_size.width, stride.width), + numPartsWithin(size.height, part_size.height, stride.height)); +} + +static size_t getBlockHistogramSize(Size block_size, Size cell_size, int nbins) +{ + Size cells_per_block = Size(block_size.width / cell_size.width, + block_size.height / cell_size.height); + return (size_t)(nbins * cells_per_block.area()); +} + size_t HOGDescriptor::getDescriptorSize() const { CV_Assert(blockSize.width % cellSize.width == 0 && @@ -88,7 +112,24 @@ bool HOGDescriptor::checkDetectorSize() const void HOGDescriptor::setSVMDetector(InputArray _svmDetector) { _svmDetector.getMat().convertTo(svmDetector, CV_32F); - CV_Assert( checkDetectorSize() ); + CV_Assert(checkDetectorSize()); + + Mat detector_reordered(1, (int)svmDetector.size(), CV_32FC1); + + size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins); + cv::Size blocks_per_img = numPartsWithin(winSize, blockSize, blockStride); + + for (int i = 0; i < blocks_per_img.height; ++i) + for (int j = 0; j < blocks_per_img.width; ++j) + { + const float *src = &svmDetector[0] + (j * blocks_per_img.height + i) * block_hist_size; + float *dst = (float*)detector_reordered.data + (i * blocks_per_img.width + j) * block_hist_size; + for (size_t k = 0; k < block_hist_size; ++k) + dst[k] = src[k]; + } + size_t descriptor_size = getDescriptorSize(); + free_coef = svmDetector.size() > descriptor_size ? svmDetector[descriptor_size] : 0; + detector_reordered.copyTo(oclSvmDetector); } #define CV_TYPE_NAME_HOG_DESCRIPTOR "opencv-object-detector-hog" @@ -1029,7 +1070,318 @@ static inline int gcd(int a, int b) return a; } -void HOGDescriptor::compute(const Mat& img, std::vector& descriptors, +#ifdef HAVE_OPENCL + +static bool ocl_compute_gradients_8UC1(int height, int width, InputArray _img, float angle_scale, + UMat grad, UMat qangle, bool correct_gamma, int nbins) +{ + ocl::Kernel k("compute_gradients_8UC1_kernel", ocl::objdetect::objdetect_hog_oclsrc); + if(k.empty()) + return false; + + UMat img = _img.getUMat(); + + size_t localThreads[3] = { NTHREADS, 1, 1 }; + size_t globalThreads[3] = { width, height, 1 }; + char correctGamma = (correct_gamma) ? 1 : 0; + int grad_quadstep = (int)grad.step >> 3; + int qangle_step_shift = 0; + int qangle_step = (int)qangle.step >> (1 + qangle_step_shift); + + int idx = 0; + idx = k.set(idx, height); + idx = k.set(idx, width); + idx = k.set(idx, (int)img.step1()); + idx = k.set(idx, grad_quadstep); + idx = k.set(idx, qangle_step); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(img)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(grad)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(qangle)); + idx = k.set(idx, angle_scale); + idx = k.set(idx, correctGamma); + idx = k.set(idx, nbins); + + return k.run(2, globalThreads, localThreads, false); +} + +static bool ocl_computeGradient(InputArray img, UMat grad, UMat qangle, int nbins, Size effect_size, bool gamma_correction) +{ + float angleScale = (float)(nbins / CV_PI); + + return ocl_compute_gradients_8UC1(effect_size.height, effect_size.width, img, + angleScale, grad, qangle, gamma_correction, nbins); +} + +#define CELL_WIDTH 8 +#define CELL_HEIGHT 8 +#define CELLS_PER_BLOCK_X 2 +#define CELLS_PER_BLOCK_Y 2 + +static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width, + UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size) +{ + ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc); + if(k.empty()) + return false; + bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; + cv::String opts; + if(is_cpu) + opts = "-D CPU "; + else + opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); + k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + if(k.empty()) + return false; + + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)/block_stride_x; + int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)/block_stride_y; + int blocks_total = img_block_width * img_block_height; + + int qangle_step_shift = 0; + int grad_quadstep = (int)grad.step >> 2; + int qangle_step = (int)qangle.step >> qangle_step_shift; + + int blocks_in_group = 4; + size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; + size_t globalThreads[3] = {((img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group) * localThreads[0], 2, 1 }; + + int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float); + int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float); + + int smem = (hists_size + final_hists_size) * blocks_in_group; + + int idx = 0; + idx = k.set(idx, block_stride_x); + idx = k.set(idx, block_stride_y); + idx = k.set(idx, nbins); + idx = k.set(idx, (int)block_hist_size); + idx = k.set(idx, img_block_width); + idx = k.set(idx, blocks_in_group); + idx = k.set(idx, blocks_total); + idx = k.set(idx, grad_quadstep); + idx = k.set(idx, qangle_step); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(grad)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(qangle)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(gauss_w_lut)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(block_hists)); + idx = k.set(idx, (void*)NULL, (size_t)smem); + + return k.run(2, globalThreads, localThreads, false); +} + +static int power_2up(unsigned int n) +{ + for(unsigned int i = 1; i<=1024; i<<=1) + if(n < i) + return i; + return -1; // Input is too big +} + +static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_y, + int height, int width, UMat block_hists, float threshold) +{ + int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) + / block_stride_x; + int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) + / block_stride_y; + int nthreads; + size_t globalThreads[3] = { 1, 1, 1 }; + size_t localThreads[3] = { 1, 1, 1 }; + + int idx = 0; + bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; + cv::String opts; + ocl::Kernel k; + if ( nbins == 9 ) + { + k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); + if(k.empty()) + return false; + if(is_cpu) + opts = "-D CPU "; + else + opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); + k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + if(k.empty()) + return false; + + int blocks_in_group = NTHREADS / block_hist_size; + nthreads = blocks_in_group * block_hist_size; + int num_groups = (img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group; + globalThreads[0] = nthreads * num_groups; + localThreads[0] = nthreads; + } + else + { + k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); + if(k.empty()) + return false; + if(is_cpu) + opts = "-D CPU "; + else + opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); + k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + if(k.empty()) + return false; + + nthreads = power_2up(block_hist_size); + globalThreads[0] = img_block_width * nthreads; + globalThreads[1] = img_block_height; + localThreads[0] = nthreads; + + if ((nthreads < 32) || (nthreads > 512) ) + return false; + + idx = k.set(idx, nthreads); + idx = k.set(idx, block_hist_size); + idx = k.set(idx, img_block_width); + } + idx = k.set(idx, ocl::KernelArg::PtrReadWrite(block_hists)); + idx = k.set(idx, threshold); + idx = k.set(idx, (void*)NULL, nthreads * sizeof(float)); + + return k.run(2, globalThreads, localThreads, false); +} + +static bool ocl_extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, + int height, int width, UMat block_hists, UMat descriptors, + int block_hist_size, int descr_size, int descr_width) +{ + ocl::Kernel k("extract_descrs_by_rows_kernel", ocl::objdetect::objdetect_hog_oclsrc); + if(k.empty()) + return false; + + int win_block_stride_x = win_stride_x / block_stride_x; + int win_block_stride_y = win_stride_y / block_stride_y; + int img_win_width = (width - win_width + win_stride_x) / win_stride_x; + int img_win_height = (height - win_height + win_stride_y) / win_stride_y; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + block_stride_x; + + int descriptors_quadstep = (int)descriptors.step >> 2; + + size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 }; + size_t localThreads[3] = { NTHREADS, 1, 1 }; + + int idx = 0; + idx = k.set(idx, block_hist_size); + idx = k.set(idx, descriptors_quadstep); + idx = k.set(idx, descr_size); + idx = k.set(idx, descr_width); + idx = k.set(idx, img_block_width); + idx = k.set(idx, win_block_stride_x); + idx = k.set(idx, win_block_stride_y); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors)); + + return k.run(2, globalThreads, localThreads, false); +} + +static bool ocl_extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, + int height, int width, UMat block_hists, UMat descriptors, + int block_hist_size, int descr_size, int nblocks_win_x, int nblocks_win_y) +{ + ocl::Kernel k("extract_descrs_by_cols_kernel", ocl::objdetect::objdetect_hog_oclsrc); + if(k.empty()) + return false; + + int win_block_stride_x = win_stride_x / block_stride_x; + int win_block_stride_y = win_stride_y / block_stride_y; + int img_win_width = (width - win_width + win_stride_x) / win_stride_x; + int img_win_height = (height - win_height + win_stride_y) / win_stride_y; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + block_stride_x; + + int descriptors_quadstep = (int)descriptors.step >> 2; + + size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 }; + size_t localThreads[3] = { NTHREADS, 1, 1 }; + + int idx = 0; + idx = k.set(idx, block_hist_size); + idx = k.set(idx, descriptors_quadstep); + idx = k.set(idx, descr_size); + idx = k.set(idx, nblocks_win_x); + idx = k.set(idx, nblocks_win_y); + idx = k.set(idx, img_block_width); + idx = k.set(idx, win_block_stride_x); + idx = k.set(idx, win_block_stride_y); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists)); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors)); + + return k.run(2, globalThreads, localThreads, false); +} + +static bool ocl_compute(InputArray _img, Size win_stride, std::vector& _descriptors, int descr_format, Size blockSize, + Size cellSize, int nbins, Size blockStride, Size winSize, float sigma, bool gammaCorrection, double L2HysThreshold) +{ + Size imgSize = _img.size(); + Size effect_size = imgSize; + + UMat grad(imgSize, CV_32FC2); + UMat qangle(imgSize, CV_8UC2); + + const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins); + const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride); + UMat block_hists(1, static_cast(block_hist_size * blocks_per_img.area()) + 256, CV_32F); + + Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride); + UMat labels(1, wins_per_img.area(), CV_8U); + + float scale = 1.f / (2.f * sigma * sigma); + Mat gaussian_lut(1, 512, CV_32FC1); + int idx = 0; + for(int i=-8; i<8; i++) + for(int j=-8; j<8; j++) + gaussian_lut.at(idx++) = std::exp(-(j * j + i * i) * scale); + for(int i=-8; i<8; i++) + for(int j=-8; j<8; j++) + gaussian_lut.at(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f; + + if(!ocl_computeGradient(_img, grad, qangle, nbins, effect_size, gammaCorrection)) + return false; + + UMat gauss_w_lut; + gaussian_lut.copyTo(gauss_w_lut); + if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height, + effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size)) + return false; + + if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height, + effect_size.width, block_hists, (float)L2HysThreshold)) + return false; + + Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride); + wins_per_img = numPartsWithin(effect_size, winSize, win_stride); + + int descr_size = blocks_per_win.area()*(int)block_hist_size; + int descr_width = (int)block_hist_size*blocks_per_win.width; + + UMat descriptors(wins_per_img.area(), static_cast(blocks_per_win.area() * block_hist_size), CV_32F); + switch (descr_format) + { + case DESCR_FORMAT_ROW_BY_ROW: + if(!ocl_extract_descrs_by_rows(winSize.height, winSize.width, + blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height, + effect_size.width, block_hists, descriptors, (int)block_hist_size, descr_size, descr_width)) + return false; + break; + case DESCR_FORMAT_COL_BY_COL: + if(!ocl_extract_descrs_by_cols(winSize.height, winSize.width, + blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height, effect_size.width, + block_hists, descriptors, (int)block_hist_size, descr_size, blocks_per_win.width, blocks_per_win.height)) + return false; + break; + default: + return false; + } + descriptors.reshape(1, (int)descriptors.total()).getMat(ACCESS_READ).copyTo(_descriptors); + return true; +} +#endif //HAVE_OPENCL + +void HOGDescriptor::compute(InputArray _img, std::vector& descriptors, Size winStride, Size padding, const std::vector& locations) const { if( winStride == Size() ) @@ -1037,11 +1389,18 @@ void HOGDescriptor::compute(const Mat& img, std::vector& descriptors, Size cacheStride(gcd(winStride.width, blockStride.width), gcd(winStride.height, blockStride.height)); + Size imgSize = _img.size(); + size_t nwindows = locations.size(); padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width); padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height); - Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2); + Size paddedImgSize(imgSize.width + padding.width*2, imgSize.height + padding.height*2); + CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && _img.isUMat(), + ocl_compute(_img, winStride, descriptors, DESCR_FORMAT_COL_BY_COL, blockSize, + cellSize, nbins, blockStride, winSize, (float)getWinSigma(), gammaCorrection, L2HysThreshold)) + + Mat img = _img.getMat(); HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride); if( !nwindows ) @@ -1263,20 +1622,215 @@ private: Mutex* mtx; }; +#ifdef HAVE_OPENCL + +static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, + int win_stride_y, int win_stride_x, int height, int width, + const UMat& block_hists, UMat detector, + float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size) +{ + int nthreads; + bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; + cv::String opts; + + ocl::Kernel k; + int idx = 0; + switch (descr_size.width) + { + case 180: + nthreads = 180; + k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); + if(k.empty()) + return false; + if(is_cpu) + opts = "-D CPU "; + else + opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); + k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + if(k.empty()) + return false; + idx = k.set(idx, descr_size.width); + idx = k.set(idx, descr_size.height); + break; + + case 252: + nthreads = 256; + k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); + if(k.empty()) + return false; + if(is_cpu) + opts = "-D CPU "; + else + opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); + k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + if(k.empty()) + return false; + idx = k.set(idx, descr_size.width); + idx = k.set(idx, descr_size.height); + break; + + default: + nthreads = 256; + k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); + if(k.empty()) + return false; + if(is_cpu) + opts = "-D CPU "; + else + opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); + k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + if(k.empty()) + return false; + idx = k.set(idx, descr_size.area()); + idx = k.set(idx, descr_size.height); + } + + int win_block_stride_x = win_stride_x / block_stride_x; + int win_block_stride_y = win_stride_y / block_stride_y; + int img_win_width = (width - win_width + win_stride_x) / win_stride_x; + int img_win_height = (height - win_height + win_stride_y) / win_stride_y; + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + block_stride_x; + + size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 }; + size_t localThreads[3] = { nthreads, 1, 1 }; + + idx = k.set(idx, block_hist_size); + idx = k.set(idx, img_win_width); + idx = k.set(idx, img_block_width); + idx = k.set(idx, win_block_stride_x); + idx = k.set(idx, win_block_stride_y); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists)); + idx = k.set(idx, ocl::KernelArg::PtrReadOnly(detector)); + idx = k.set(idx, free_coef); + idx = k.set(idx, threshold); + idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(labels)); + + return k.run(2, globalThreads, localThreads, false); +} + +static bool ocl_detect(InputArray img, std::vector &hits, double hit_threshold, Size win_stride, + const UMat& oclSvmDetector, Size blockSize, Size cellSize, int nbins, Size blockStride, Size winSize, + bool gammaCorrection, double L2HysThreshold, float sigma, float free_coef) +{ + hits.clear(); + if (oclSvmDetector.empty()) + return false; + + Size imgSize = img.size(); + Size effect_size = imgSize; + UMat grad(imgSize, CV_32FC2); + UMat qangle(imgSize, CV_8UC2); + + const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins); + const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride); + UMat block_hists(1, static_cast(block_hist_size * blocks_per_img.area()) + 256, CV_32F); + + Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride); + UMat labels(1, wins_per_img.area(), CV_8U); + + float scale = 1.f / (2.f * sigma * sigma); + Mat gaussian_lut(1, 512, CV_32FC1); + int idx = 0; + for(int i=-8; i<8; i++) + for(int j=-8; j<8; j++) + gaussian_lut.at(idx++) = std::exp(-(j * j + i * i) * scale); + for(int i=-8; i<8; i++) + for(int j=-8; j<8; j++) + gaussian_lut.at(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f; + + if(!ocl_computeGradient(img, grad, qangle, nbins, effect_size, gammaCorrection)) + return false; + + UMat gauss_w_lut; + gaussian_lut.copyTo(gauss_w_lut); + if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height, + effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size)) + return false; + + if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height, + effect_size.width, block_hists, (float)L2HysThreshold)) + return false; + + Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride); + + Size descr_size((int)block_hist_size*blocks_per_win.width, blocks_per_win.height); + + if(!ocl_classify_hists(winSize.height, winSize.width, blockStride.height, + blockStride.width, win_stride.height, win_stride.width, + effect_size.height, effect_size.width, block_hists, oclSvmDetector, + free_coef, (float)hit_threshold, labels, descr_size, (int)block_hist_size)) + return false; + + Mat labels_host = labels.getMat(ACCESS_READ); + unsigned char *vec = labels_host.ptr(); + for (int i = 0; i < wins_per_img.area(); i++) + { + int y = i / wins_per_img.width; + int x = i - wins_per_img.width * y; + if (vec[i]) + { + hits.push_back(Point(x * win_stride.width, y * win_stride.height)); + } + } + return true; +} + +static bool ocl_detectMultiScale(InputArray _img, std::vector &found_locations, std::vector& level_scale, + double hit_threshold, Size win_stride, double group_threshold, + const UMat& oclSvmDetector, Size blockSize, Size cellSize, + int nbins, Size blockStride, Size winSize, bool gammaCorrection, + double L2HysThreshold, float sigma, float free_coef) +{ + std::vector all_candidates; + std::vector locations; + UMat image_scale; + Size imgSize = _img.size(); + image_scale.create(imgSize, _img.type()); + + for (size_t i = 0; i& foundLocations, std::vector& foundWeights, + InputArray _img, std::vector& foundLocations, std::vector& foundWeights, double hitThreshold, Size winStride, Size padding, double scale0, double finalThreshold, bool useMeanshiftGrouping) const { double scale = 1.; int levels = 0; + Size imgSize = _img.size(); std::vector levelScale; for( levels = 0; levels < nlevels; levels++ ) { levelScale.push_back(scale); - if( cvRound(img.cols/scale) < winSize.width || - cvRound(img.rows/scale) < winSize.height || + if( cvRound(imgSize.width/scale) < winSize.width || + cvRound(imgSize.height/scale) < winSize.height || scale0 <= 1 ) break; scale *= scale0; @@ -1284,12 +1838,21 @@ void HOGDescriptor::detectMultiScale( levels = std::max(levels, 1); levelScale.resize(levels); + if(winStride == Size()) + winStride = blockStride; + + CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && scale0 > 1 && winStride.width % blockStride.width == 0 && + winStride.height % blockStride.height == 0 && padding == Size(0,0) && _img.isUMat(), + ocl_detectMultiScale(_img, foundLocations, levelScale, hitThreshold, winStride, finalThreshold, oclSvmDetector, + blockSize, cellSize, nbins, blockStride, winSize, gammaCorrection, L2HysThreshold, (float)getWinSigma(), free_coef)); + std::vector allCandidates; std::vector tempScales; std::vector tempWeights; std::vector foundScales; - Mutex mtx; + Mutex mtx; + Mat img = _img.getMat(); Range range(0, (int)levelScale.size()); HOGInvoker invoker(this, img, hitThreshold, winStride, padding, &levelScale[0], &allCandidates, &mtx, &tempWeights, &tempScales); parallel_for_(range, invoker); @@ -1306,7 +1869,7 @@ void HOGDescriptor::detectMultiScale( groupRectangles(foundLocations, foundWeights, (int)finalThreshold, 0.2); } -void HOGDescriptor::detectMultiScale(const Mat& img, std::vector& foundLocations, +void HOGDescriptor::detectMultiScale(InputArray img, std::vector& foundLocations, double hitThreshold, Size winStride, Size padding, double scale0, double finalThreshold, bool useMeanshiftGrouping) const { diff --git a/modules/objdetect/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl new file mode 100644 index 000000000..e931e82b5 --- /dev/null +++ b/modules/objdetect/src/opencl/objdetect_hog.cl @@ -0,0 +1,726 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Wenju He, wenju@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define CELL_WIDTH 8 +#define CELL_HEIGHT 8 +#define CELLS_PER_BLOCK_X 2 +#define CELLS_PER_BLOCK_Y 2 +#define NTHREADS 256 +#define CV_PI_F 3.1415926535897932384626433832795f + +#ifdef INTEL_DEVICE +#define QANGLE_TYPE int +#define QANGLE_TYPE2 int2 +#else +#define QANGLE_TYPE uchar +#define QANGLE_TYPE2 uchar2 +#endif + +//---------------------------------------------------------------------------- +// Histogram computation +// 12 threads for a cell, 12x4 threads per block +// Use pre-computed gaussian and interp_weight lookup tables +__kernel void compute_hists_lut_kernel( + const int cblock_stride_x, const int cblock_stride_y, + const int cnbins, const int cblock_hist_size, const int img_block_width, + const int blocks_in_group, const int blocks_total, + const int grad_quadstep, const int qangle_step, + __global const float* grad, __global const QANGLE_TYPE* qangle, + __global const float* gauss_w_lut, + __global float* block_hists, __local float* smem) +{ + const int lx = get_local_id(0); + const int lp = lx / 24; /* local group id */ + const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */ + const int gidY = gid / img_block_width; + const int gidX = gid - gidY * img_block_width; + + const int lidX = lx - lp * 24; + const int lidY = get_local_id(1); + + const int cell_x = lidX / 12; + const int cell_y = lidY; + const int cell_thread_x = lidX - cell_x * 12; + + __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * + CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y); + __local float* final_hist = hists + cnbins * + (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12); + + const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; + const int offset_y = gidY * cblock_stride_y + (cell_y << 2); + + __global const float* grad_ptr = (gid < blocks_total) ? + grad + offset_y * grad_quadstep + (offset_x << 1) : grad; + __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ? + qangle + offset_y * qangle_step + (offset_x << 1) : qangle; + + __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + + cell_thread_x; + for (int bin_id = 0; bin_id < cnbins; ++bin_id) + hist[bin_id * 48] = 0.f; + + const int dist_x = -4 + cell_thread_x - 4 * cell_x; + const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); + + const int dist_y_begin = -4 - 4 * lidY; + for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) + { + float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); + QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]); + + grad_ptr += grad_quadstep; + qangle_ptr += qangle_step; + + int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); + + int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8); + float gaussian = gauss_w_lut[idx]; + idx = (dist_y + 8) * 16 + (dist_x + 8); + float interp_weight = gauss_w_lut[256+idx]; + + hist[bin.x * 48] += gaussian * interp_weight * vote.x; + hist[bin.y * 48] += gaussian * interp_weight * vote.y; + } + barrier(CLK_LOCAL_MEM_FENCE); + + volatile __local float* hist_ = hist; + for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) + { + if (cell_thread_x < 6) + hist_[0] += hist_[6]; + barrier(CLK_LOCAL_MEM_FENCE); + if (cell_thread_x < 3) + hist_[0] += hist_[3]; +#ifdef CPU + barrier(CLK_LOCAL_MEM_FENCE); +#endif + if (cell_thread_x == 0) + final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = + hist_[0] + hist_[1] + hist_[2]; + } +#ifdef CPU + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; + if ((tid < cblock_hist_size) && (gid < blocks_total)) + { + __global float* block_hist = block_hists + + (gidY * img_block_width + gidX) * cblock_hist_size; + block_hist[tid] = final_hist[tid]; + } +} + +//------------------------------------------------------------- +// Normalization of histograms via L2Hys_norm +// optimized for the case of 9 bins +__kernel void normalize_hists_36_kernel(__global float* block_hists, + const float threshold, __local float *squares) +{ + const int tid = get_local_id(0); + const int gid = get_global_id(0); + const int bid = tid / 36; /* block-hist id, (0 - 6) */ + const int boffset = bid * 36; /* block-hist offset in the work-group */ + const int hid = tid - boffset; /* histogram bin id, (0 - 35) */ + + float elem = block_hists[gid]; + squares[tid] = elem * elem; + barrier(CLK_LOCAL_MEM_FENCE); + + __local float* smem = squares + boffset; + float sum = smem[hid]; + if (hid < 18) + smem[hid] = sum = sum + smem[hid + 18]; + barrier(CLK_LOCAL_MEM_FENCE); + if (hid < 9) + smem[hid] = sum = sum + smem[hid + 9]; + barrier(CLK_LOCAL_MEM_FENCE); + if (hid < 4) + smem[hid] = sum + smem[hid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8]; + + elem = elem / (sqrt(sum) + 3.6f); + elem = min(elem, threshold); + + barrier(CLK_LOCAL_MEM_FENCE); + squares[tid] = elem * elem; + barrier(CLK_LOCAL_MEM_FENCE); + + sum = smem[hid]; + if (hid < 18) + smem[hid] = sum = sum + smem[hid + 18]; + barrier(CLK_LOCAL_MEM_FENCE); + if (hid < 9) + smem[hid] = sum = sum + smem[hid + 9]; + barrier(CLK_LOCAL_MEM_FENCE); + if (hid < 4) + smem[hid] = sum + smem[hid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8]; + + block_hists[gid] = elem / (sqrt(sum) + 1e-3f); +} + +//------------------------------------------------------------- +// Normalization of histograms via L2Hys_norm +// +inline float reduce_smem(volatile __local float* smem, int size) +{ + unsigned int tid = get_local_id(0); + float sum = smem[tid]; + + if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); } +#ifdef CPU + if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); } + if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; + barrier(CLK_LOCAL_MEM_FENCE); } +#else + if (tid < 32) + { + if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { +#endif + if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; + if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; + if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; + if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; + if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; + } +#endif + + return sum; +} + +__kernel void normalize_hists_kernel( + const int nthreads, const int block_hist_size, const int img_block_width, + __global float* block_hists, const float threshold, __local float *squares) +{ + const int tid = get_local_id(0); + const int gidX = get_group_id(0); + const int gidY = get_group_id(1); + + __global float* hist = block_hists + (gidY * img_block_width + gidX) * + block_hist_size + tid; + + float elem = 0.f; + if (tid < block_hist_size) + elem = hist[0]; + + squares[tid] = elem * elem; + + barrier(CLK_LOCAL_MEM_FENCE); + float sum = reduce_smem(squares, nthreads); + + float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size); + elem = min(elem * scale, threshold); + + barrier(CLK_LOCAL_MEM_FENCE); + squares[tid] = elem * elem; + + barrier(CLK_LOCAL_MEM_FENCE); + sum = reduce_smem(squares, nthreads); + scale = 1.0f / (sqrt(sum) + 1e-3f); + + if (tid < block_hist_size) + hist[0] = elem * scale; +} + +//--------------------------------------------------------------------- +// Linear SVM based classification +// 48x96 window, 9 bins and default parameters +// 180 threads, each thread corresponds to a bin in a row +__kernel void classify_hists_180_kernel( + const int cdescr_width, const int cdescr_height, const int cblock_hist_size, + const int img_win_width, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float * block_hists, __global const float* coefs, + float free_coef, float threshold, __global uchar* labels) +{ + const int tid = get_local_id(0); + const int gidX = get_group_id(0); + const int gidY = get_group_id(1); + + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + + float product = 0.f; + + for (int i = 0; i < cdescr_height; i++) + { + product += coefs[i * cdescr_width + tid] * + hist[i * img_block_width * cblock_hist_size + tid]; + } + + __local float products[180]; + + products[tid] = product; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 90) products[tid] = product = product + products[tid + 90]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 45) products[tid] = product = product + products[tid + 45]; + barrier(CLK_LOCAL_MEM_FENCE); + + volatile __local float* smem = products; +#ifdef CPU + if (tid < 13) smem[tid] = product = product + smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) smem[tid] = product = product + smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<8) smem[tid] = product = product + smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<4) smem[tid] = product = product + smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<2) smem[tid] = product = product + smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); +#else + if (tid < 13) + { + smem[tid] = product = product + smem[tid + 32]; + } +#if WAVE_SIZE < 32 + barrier(CLK_LOCAL_MEM_FENCE); +#endif + if (tid < 16) + { + smem[tid] = product = product + smem[tid + 16]; + smem[tid] = product = product + smem[tid + 8]; + smem[tid] = product = product + smem[tid + 4]; + smem[tid] = product = product + smem[tid + 2]; + } +#endif + + if (tid == 0){ + product = product + smem[tid + 1]; + labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); + } +} + +//--------------------------------------------------------------------- +// Linear SVM based classification +// 64x128 window, 9 bins and default parameters +// 256 threads, 252 of them are used +__kernel void classify_hists_252_kernel( + const int cdescr_width, const int cdescr_height, const int cblock_hist_size, + const int img_win_width, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float * block_hists, __global const float* coefs, + float free_coef, float threshold, __global uchar* labels) +{ + const int tid = get_local_id(0); + const int gidX = get_group_id(0); + const int gidY = get_group_id(1); + + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + + float product = 0.f; + if (tid < cdescr_width) + { + for (int i = 0; i < cdescr_height; i++) + product += coefs[i * cdescr_width + tid] * + hist[i * img_block_width * cblock_hist_size + tid]; + } + + __local float products[NTHREADS]; + + products[tid] = product; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) products[tid] = product = product + products[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) products[tid] = product = product + products[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + + volatile __local float* smem = products; +#ifdef CPU + if(tid<32) smem[tid] = product = product + smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<16) smem[tid] = product = product + smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<8) smem[tid] = product = product + smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<4) smem[tid] = product = product + smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<2) smem[tid] = product = product + smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); +#else + if (tid < 32) + { + smem[tid] = product = product + smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { +#endif + smem[tid] = product = product + smem[tid + 16]; + smem[tid] = product = product + smem[tid + 8]; + smem[tid] = product = product + smem[tid + 4]; + smem[tid] = product = product + smem[tid + 2]; + } +#endif + if (tid == 0){ + product = product + smem[tid + 1]; + labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); + } +} + +//--------------------------------------------------------------------- +// Linear SVM based classification +// 256 threads +__kernel void classify_hists_kernel( + const int cdescr_size, const int cdescr_width, const int cblock_hist_size, + const int img_win_width, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float * block_hists, __global const float* coefs, + float free_coef, float threshold, __global uchar* labels) +{ + const int tid = get_local_id(0); + const int gidX = get_group_id(0); + const int gidY = get_group_id(1); + + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + + float product = 0.f; + for (int i = tid; i < cdescr_size; i += NTHREADS) + { + int offset_y = i / cdescr_width; + int offset_x = i - offset_y * cdescr_width; + product += coefs[i] * + hist[offset_y * img_block_width * cblock_hist_size + offset_x]; + } + + __local float products[NTHREADS]; + + products[tid] = product; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) products[tid] = product = product + products[tid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) products[tid] = product = product + products[tid + 64]; + barrier(CLK_LOCAL_MEM_FENCE); + + volatile __local float* smem = products; +#ifdef CPU + if(tid<32) smem[tid] = product = product + smem[tid + 32]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<16) smem[tid] = product = product + smem[tid + 16]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<8) smem[tid] = product = product + smem[tid + 8]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<4) smem[tid] = product = product + smem[tid + 4]; + barrier(CLK_LOCAL_MEM_FENCE); + if(tid<2) smem[tid] = product = product + smem[tid + 2]; + barrier(CLK_LOCAL_MEM_FENCE); +#else + if (tid < 32) + { + smem[tid] = product = product + smem[tid + 32]; +#if WAVE_SIZE < 32 + } barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) { +#endif + smem[tid] = product = product + smem[tid + 16]; + smem[tid] = product = product + smem[tid + 8]; + smem[tid] = product = product + smem[tid + 4]; + smem[tid] = product = product + smem[tid + 2]; + } +#endif + if (tid == 0){ + smem[tid] = product = product + smem[tid + 1]; + labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); + } +} + +//---------------------------------------------------------------------------- +// Extract descriptors + +__kernel void extract_descrs_by_rows_kernel( + const int cblock_hist_size, const int descriptors_quadstep, + const int cdescr_size, const int cdescr_width, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float* block_hists, __global float* descriptors) +{ + int tid = get_local_id(0); + int gidX = get_group_id(0); + int gidY = get_group_id(1); + + // Get left top corner of the window in src + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + + // Get left top corner of the window in dst + __global float* descriptor = descriptors + + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; + + // Copy elements from src to dst + for (int i = tid; i < cdescr_size; i += NTHREADS) + { + int offset_y = i / cdescr_width; + int offset_x = i - offset_y * cdescr_width; + descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x]; + } +} + +__kernel void extract_descrs_by_cols_kernel( + const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, + const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, + const int win_block_stride_x, const int win_block_stride_y, + __global const float* block_hists, __global float* descriptors) +{ + int tid = get_local_id(0); + int gidX = get_group_id(0); + int gidY = get_group_id(1); + + // Get left top corner of the window in src + __global const float* hist = block_hists + (gidY * win_block_stride_y * + img_block_width + gidX * win_block_stride_x) * cblock_hist_size; + + // Get left top corner of the window in dst + __global float* descriptor = descriptors + + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; + + // Copy elements from src to dst + for (int i = tid; i < cdescr_size; i += NTHREADS) + { + int block_idx = i / cblock_hist_size; + int idx_in_block = i - block_idx * cblock_hist_size; + + int y = block_idx / cnblocks_win_x; + int x = block_idx - y * cnblocks_win_x; + + descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = + hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block]; + } +} + +//---------------------------------------------------------------------------- +// Gradients computation + +__kernel void compute_gradients_8UC4_kernel( + const int height, const int width, + const int img_step, const int grad_quadstep, const int qangle_step, + const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) +{ + const int x = get_global_id(0); + const int tid = get_local_id(0); + const int gSizeX = get_local_size(0); + const int gidY = get_group_id(1); + + __global const uchar4* row = img + gidY * img_step; + + __local float sh_row[(NTHREADS + 2) * 3]; + + uchar4 val; + if (x < width) + val = row[x]; + else + val = row[width - 2]; + + sh_row[tid + 1] = val.x; + sh_row[tid + 1 + (NTHREADS + 2)] = val.y; + sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z; + + if (tid == 0) + { + val = row[max(x - 1, 1)]; + sh_row[0] = val.x; + sh_row[(NTHREADS + 2)] = val.y; + sh_row[2 * (NTHREADS + 2)] = val.z; + } + + if (tid == gSizeX - 1) + { + val = row[min(x + 1, width - 2)]; + sh_row[gSizeX + 1] = val.x; + sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y; + sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z; + } + + barrier(CLK_LOCAL_MEM_FENCE); + if (x < width) + { + float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], + sh_row[tid + 2 * (NTHREADS + 2)]); + float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], + sh_row[tid + 2 + 2 * (NTHREADS + 2)]); + + float3 dx; + if (correct_gamma == 1) + dx = sqrt(b) - sqrt(a); + else + dx = b - a; + + float3 dy = (float3) 0.f; + + if (gidY > 0 && gidY < height - 1) + { + a = convert_float3(img[(gidY - 1) * img_step + x].xyz); + b = convert_float3(img[(gidY + 1) * img_step + x].xyz); + + if (correct_gamma == 1) + dy = sqrt(b) - sqrt(a); + else + dy = b - a; + } + + float best_dx = dx.x; + float best_dy = dy.x; + + float mag0 = dx.x * dx.x + dy.x * dy.x; + float mag1 = dx.y * dx.y + dy.y * dy.y; + if (mag0 < mag1) + { + best_dx = dx.y; + best_dy = dy.y; + mag0 = mag1; + } + + mag1 = dx.z * dx.z + dy.z * dy.z; + if (mag0 < mag1) + { + best_dx = dx.z; + best_dy = dy.z; + mag0 = mag1; + } + + mag0 = sqrt(mag0); + + float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f; + int hidx = (int)floor(ang); + ang -= hidx; + hidx = (hidx + cnbins) % cnbins; + + qangle[(gidY * qangle_step + x) << 1] = hidx; + qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins; + grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang); + grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang; + } +} + +__kernel void compute_gradients_8UC1_kernel( + const int height, const int width, + const int img_step, const int grad_quadstep, const int qangle_step, + __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle, + const float angle_scale, const char correct_gamma, const int cnbins) +{ + const int x = get_global_id(0); + const int tid = get_local_id(0); + const int gSizeX = get_local_size(0); + const int gidY = get_group_id(1); + + __global const uchar* row = img + gidY * img_step; + + __local float sh_row[NTHREADS + 2]; + + if (x < width) + sh_row[tid + 1] = row[x]; + else + sh_row[tid + 1] = row[width - 2]; + + if (tid == 0) + sh_row[0] = row[max(x - 1, 1)]; + + if (tid == gSizeX - 1) + sh_row[gSizeX + 1] = row[min(x + 1, width - 2)]; + + barrier(CLK_LOCAL_MEM_FENCE); + if (x < width) + { + float dx; + + if (correct_gamma == 1) + dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]); + else + dx = sh_row[tid + 2] - sh_row[tid]; + + float dy = 0.f; + if (gidY > 0 && gidY < height - 1) + { + float a = (float) img[ (gidY + 1) * img_step + x ]; + float b = (float) img[ (gidY - 1) * img_step + x ]; + if (correct_gamma == 1) + dy = sqrt(a) - sqrt(b); + else + dy = a - b; + } + float mag = sqrt(dx * dx + dy * dy); + + float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f; + int hidx = (int)floor(ang); + ang -= hidx; + hidx = (hidx + cnbins) % cnbins; + + qangle[ (gidY * qangle_step + x) << 1 ] = hidx; + qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins; + grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang); + grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang; + } +} diff --git a/modules/objdetect/test/opencl/test_hogdetector.cpp b/modules/objdetect/test/opencl/test_hogdetector.cpp new file mode 100644 index 000000000..8568352b6 --- /dev/null +++ b/modules/objdetect/test/opencl/test_hogdetector.cpp @@ -0,0 +1,121 @@ +/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Niko Li, newlife20080214@gmail.com +// Jia Haipeng, jiahaipeng95@gmail.com +// Shengen Yan, yanshengen@gmail.com +// Jiang Liyuan,jlyuan001.good@163.com +// Rock Li, Rock.Li@amd.com +// Zailong Wu, bullet@yeah.net +// Yao Wang, bitwangyaoyao@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "test_precomp.hpp" +#include "opencv2/ts/ocl_test.hpp" + +#ifdef HAVE_OPENCL + +namespace cvtest { +namespace ocl { + +///////////////////// HOG ///////////////////////////// +PARAM_TEST_CASE(HOG, Size, MatType) +{ + Size winSize; + int type; + Mat img; + UMat uimg; + virtual void SetUp() + { + winSize = GET_PARAM(0); + type = GET_PARAM(1); + img = readImage("cascadeandhog/images/image_00000000_0.png", IMREAD_GRAYSCALE); + ASSERT_FALSE(img.empty()); + img.copyTo(uimg); + } +}; + +OCL_TEST_P(HOG, GetDescriptors) +{ + HOGDescriptor hog; + hog.gammaCorrection = true; + + hog.setSVMDetector(hog.getDefaultPeopleDetector()); + + std::vector cpu_descriptors; + std::vector gpu_descriptors; + + OCL_OFF(hog.compute(img, cpu_descriptors, hog.winSize)); + OCL_ON(hog.compute(uimg, gpu_descriptors, hog.winSize)); + + Mat cpu_desc(cpu_descriptors), gpu_desc(gpu_descriptors); + + EXPECT_MAT_SIMILAR(cpu_desc, gpu_desc, 1e-1); +} + +OCL_TEST_P(HOG, Detect) +{ + HOGDescriptor hog; + hog.winSize = winSize; + hog.gammaCorrection = true; + + if (winSize.width == 48 && winSize.height == 96) + hog.setSVMDetector(hog.getDaimlerPeopleDetector()); + else + hog.setSVMDetector(hog.getDefaultPeopleDetector()); + + std::vector cpu_found; + std::vector gpu_found; + + OCL_OFF(hog.detectMultiScale(img, cpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6)); + OCL_ON(hog.detectMultiScale(uimg, gpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6)); + + EXPECT_LT(checkRectSimilarity(img.size(), cpu_found, gpu_found), 1.0); +} + +INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine( + testing::Values(Size(64, 128), Size(48, 96)), + testing::Values( MatType(CV_8UC1) ) ) ); + +}} +#endif