Merge remote-tracking branch 'origin/2.4'

Pull requests:
	#943 from jet47:cuda-5.5-support
	#944 from jet47:cmake-2.8.11-cuda-fix
	#912 from SpecLad:contributing
	#934 from SpecLad:parallel-for
	#931 from jet47:gpu-test-fixes
	#932 from bitwangyaoyao:2.4_fixBFM
	#918 from bitwangyaoyao:2.4_samples
	#924 from pengx17:2.4_arithm_fix
	#925 from pengx17:2.4_canny_tmp_fix
	#927 from bitwangyaoyao:2.4_perf
	#930 from pengx17:2.4_haar_ext
	#928 from apavlenko:bugfix_3027
	#920 from asmorkalov:android_move
	#910 from pengx17:2.4_oclgfft
	#913 from janm399:2.4
	#916 from bitwangyaoyao:2.4_fixPyrLK
	#919 from abidrahmank:2.4
	#923 from pengx17:2.4_macfix

Conflicts:
	modules/calib3d/src/stereobm.cpp
	modules/features2d/src/detectors.cpp
	modules/gpu/src/error.cpp
	modules/gpu/src/precomp.hpp
	modules/imgproc/src/distransform.cpp
	modules/imgproc/src/morph.cpp
	modules/ocl/include/opencv2/ocl/ocl.hpp
	modules/ocl/perf/perf_color.cpp
	modules/ocl/perf/perf_imgproc.cpp
	modules/ocl/perf/perf_match_template.cpp
	modules/ocl/perf/precomp.cpp
	modules/ocl/perf/precomp.hpp
	modules/ocl/src/arithm.cpp
	modules/ocl/src/canny.cpp
	modules/ocl/src/filtering.cpp
	modules/ocl/src/haar.cpp
	modules/ocl/src/hog.cpp
	modules/ocl/src/imgproc.cpp
	modules/ocl/src/opencl/haarobjectdetect.cl
	modules/ocl/src/pyrlk.cpp
	modules/video/src/bgfg_gaussmix2.cpp
	modules/video/src/lkpyramid.cpp
	platforms/linux/scripts/cmake_arm_gnueabi_hardfp.sh
	platforms/linux/scripts/cmake_arm_gnueabi_softfp.sh
	platforms/scripts/ABI_compat_generator.py
	samples/ocl/facedetect.cpp
This commit is contained in:
Roman Donchenko
2013-06-04 18:31:51 +04:00
236 changed files with 5549 additions and 3276 deletions

View File

@@ -121,8 +121,9 @@ namespace cv
CV_EXPORTS void setBinpath(const char *path);
//The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue
//returns cl_context *
CV_EXPORTS void* getoclContext();
//returns cl_command_queue *
CV_EXPORTS void* getoclCommandQueue();
//explicit call clFinish. The global command queue will be used.
@@ -460,6 +461,7 @@ namespace cv
// support all C1 types
CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);
//! finds global minimum and maximum array elements and returns their values with locations
// support all C1 types
@@ -808,7 +810,11 @@ namespace cv
CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum);
CV_EXPORTS void integral(const oclMat &src, oclMat &sum);
CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
@@ -826,13 +832,14 @@ namespace cv
};
#endif
#if 0
class CV_EXPORTS OclCascadeClassifierBuf : public cv::CascadeClassifier
{
public:
OclCascadeClassifierBuf() :
m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {}
~OclCascadeClassifierBuf() {}
~OclCascadeClassifierBuf() { release(); }
void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
@@ -864,6 +871,7 @@ namespace cv
oclMat gimg1, gsum, gsqsum;
void * buffers;
};
#endif
/////////////////////////////// Pyramid /////////////////////////////////////
CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
@@ -1388,6 +1396,51 @@ namespace cv
explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {}
};
class CV_EXPORTS GoodFeaturesToTrackDetector_OCL
{
public:
explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
//! return 1 rows matrix with CV_32FC2 type
void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
//! download points of type Point2f to a vector. the vector's content will be erased
void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
int maxCorners;
double qualityLevel;
double minDistance;
int blockSize;
bool useHarrisDetector;
double harrisK;
void releaseMemory()
{
Dx_.release();
Dy_.release();
eig_.release();
minMaxbuf_.release();
tmpCorners_.release();
}
private:
oclMat Dx_;
oclMat Dy_;
oclMat eig_;
oclMat minMaxbuf_;
oclMat tmpCorners_;
};
inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
int blockSize_, bool useHarrisDetector_, double harrisK_)
{
maxCorners = maxCorners_;
qualityLevel = qualityLevel_;
minDistance = minDistance_;
blockSize = blockSize_;
useHarrisDetector = useHarrisDetector_;
harrisK = harrisK_;
}
/////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
class CV_EXPORTS PyrLKOpticalFlow

View File

@@ -47,7 +47,7 @@
#define __OPENCV_OCL_PRIVATE_UTIL__
#if defined __APPLE__
#include <OpenCL/OpenCL.h>
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
@@ -121,6 +121,33 @@ namespace cv
cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
void CV_EXPORTS releaseTexture(cl_mem& texture);
//Represents an image texture object
class CV_EXPORTS TextureCL
{
public:
TextureCL(cl_mem tex, int r, int c, int t)
: tex_(tex), rows(r), cols(c), type(t) {}
~TextureCL()
{
openCLFree(tex_);
}
operator cl_mem()
{
return tex_;
}
cl_mem const tex_;
const int rows;
const int cols;
const int type;
private:
//disable assignment
void operator=(const TextureCL&);
};
// bind oclMat to OpenCL image textures and retunrs an TextureCL object
// note:
// for faster clamping, there is no buffer padding for the constructed texture
Ptr<TextureCL> CV_EXPORTS bindTexturePtr(const oclMat &mat);
// returns whether the current context supports image2d_t format or not
bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
@@ -132,7 +159,7 @@ namespace cv
};
template<DEVICE_INFO _it, typename _ty>
_ty queryDeviceInfo(cl_kernel kernel = NULL);
//info should have been pre-allocated
template<>
int CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel);
template<>

View File

@@ -48,7 +48,7 @@
///////////// Lut ////////////////////////
PERFTEST(lut)
{
Mat src, lut, dst;
Mat src, lut, dst, ocl_dst;
ocl::oclMat d_src, d_lut, d_dst;
int all_type[] = {CV_8UC1, CV_8UC3};
@@ -77,11 +77,6 @@ PERFTEST(lut)
ocl::LUT(d_src, d_lut, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0));
GPU_ON;
ocl::LUT(d_src, d_lut, d_dst);
GPU_OFF;
@@ -90,9 +85,10 @@ PERFTEST(lut)
d_src.upload(src);
d_lut.upload(lut);
ocl::LUT(d_src, d_lut, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0);
}
}
@@ -101,7 +97,7 @@ PERFTEST(lut)
///////////// Exp ////////////////////////
PERFTEST(Exp)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -121,11 +117,6 @@ PERFTEST(Exp)
ocl::exp(d_src, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 2));
GPU_ON;
ocl::exp(d_src, d_dst);
GPU_OFF;
@@ -133,15 +124,17 @@ PERFTEST(Exp)
GPU_FULL_ON;
d_src.upload(src);
ocl::exp(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 2);
}
}
///////////// LOG ////////////////////////
PERFTEST(Log)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -161,11 +154,6 @@ PERFTEST(Log)
ocl::log(d_src, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1));
GPU_ON;
ocl::log(d_src, d_dst);
GPU_OFF;
@@ -173,15 +161,17 @@ PERFTEST(Log)
GPU_FULL_ON;
d_src.upload(src);
ocl::log(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
}
}
///////////// Add ////////////////////////
PERFTEST(Add)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_32FC1};
@@ -201,6 +191,7 @@ PERFTEST(Add)
CPU_ON;
add(src1, src2, dst);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -208,11 +199,6 @@ PERFTEST(Add)
ocl::add(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::add(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -221,8 +207,10 @@ PERFTEST(Add)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::add(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -231,7 +219,7 @@ PERFTEST(Add)
///////////// Mul ////////////////////////
PERFTEST(Mul)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -260,11 +248,6 @@ PERFTEST(Mul)
ocl::multiply(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::multiply(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -273,8 +256,10 @@ PERFTEST(Mul)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::multiply(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -283,7 +268,7 @@ PERFTEST(Mul)
///////////// Div ////////////////////////
PERFTEST(Div)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -304,6 +289,7 @@ PERFTEST(Div)
CPU_ON;
divide(src1, src2, dst);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -311,11 +297,6 @@ PERFTEST(Div)
ocl::divide(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1));
GPU_ON;
ocl::divide(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -324,8 +305,10 @@ PERFTEST(Div)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::divide(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
}
}
@@ -334,7 +317,7 @@ PERFTEST(Div)
///////////// Absdiff ////////////////////////
PERFTEST(Absdiff)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -355,6 +338,7 @@ PERFTEST(Absdiff)
CPU_ON;
absdiff(src1, src2, dst);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -362,11 +346,6 @@ PERFTEST(Absdiff)
ocl::absdiff(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::absdiff(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -375,8 +354,10 @@ PERFTEST(Absdiff)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::absdiff(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -385,7 +366,7 @@ PERFTEST(Absdiff)
///////////// CartToPolar ////////////////////////
PERFTEST(CartToPolar)
{
Mat src1, src2, dst, dst1;
Mat src1, src2, dst, dst1, ocl_dst, ocl_dst1;
ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
int all_type[] = {CV_32FC1};
@@ -408,6 +389,7 @@ PERFTEST(CartToPolar)
CPU_ON;
cartToPolar(src1, src2, dst, dst1, 1);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -415,14 +397,6 @@ PERFTEST(CartToPolar)
ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
cv::Mat ocl_mat_dst1;
d_dst1.download(ocl_mat_dst1);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst1, dst1, 0.5)&&ExpectedMatNear(ocl_mat_dst, dst, 0.5));
GPU_ON;
ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
GPU_OFF;
@@ -431,9 +405,15 @@ PERFTEST(CartToPolar)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
d_dst.download(dst);
d_dst1.download(dst1);
d_dst.download(ocl_dst);
d_dst1.download(ocl_dst1);
GPU_FULL_OFF;
double diff1 = checkNorm(ocl_dst1, dst1);
double diff2 = checkNorm(ocl_dst, dst);
double max_diff = max(diff1, diff2);
TestSystem::instance().setAccurate(max_diff<=.5?1:0, max_diff);
}
}
@@ -442,7 +422,7 @@ PERFTEST(CartToPolar)
///////////// PolarToCart ////////////////////////
PERFTEST(PolarToCart)
{
Mat src1, src2, dst, dst1;
Mat src1, src2, dst, dst1, ocl_dst, ocl_dst1;
ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
int all_type[] = {CV_32FC1};
@@ -472,14 +452,6 @@ PERFTEST(PolarToCart)
ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
cv::Mat ocl_mat_dst1;
d_dst1.download(ocl_mat_dst1);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst1, dst1, 0.5)&&ExpectedMatNear(ocl_mat_dst, dst, 0.5));
GPU_ON;
ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
GPU_OFF;
@@ -488,9 +460,15 @@ PERFTEST(PolarToCart)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
d_dst.download(dst);
d_dst1.download(dst1);
d_dst.download(ocl_dst);
d_dst1.download(ocl_dst1);
GPU_FULL_OFF;
double diff1 = checkNorm(ocl_dst1, dst1);
double diff2 = checkNorm(ocl_dst, dst);
double max_diff = max(diff1, diff2);
TestSystem::instance().setAccurate(max_diff<=.5?1:0, max_diff);
}
}
@@ -499,7 +477,7 @@ PERFTEST(PolarToCart)
///////////// Magnitude ////////////////////////
PERFTEST(magnitude)
{
Mat x, y, mag;
Mat x, y, mag, ocl_mag;
ocl::oclMat d_x, d_y, d_mag;
int all_type[] = {CV_32FC1};
@@ -526,11 +504,6 @@ PERFTEST(magnitude)
ocl::magnitude(d_x, d_y, d_mag);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_mag.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, mag, 1e-5));
GPU_ON;
ocl::magnitude(d_x, d_y, d_mag);
GPU_OFF;
@@ -539,8 +512,10 @@ PERFTEST(magnitude)
d_x.upload(x);
d_y.upload(y);
ocl::magnitude(d_x, d_y, d_mag);
d_mag.download(mag);
d_mag.download(ocl_mag);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_mag, mag, 1e-5);
}
}
@@ -549,7 +524,7 @@ PERFTEST(magnitude)
///////////// Transpose ////////////////////////
PERFTEST(Transpose)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -575,11 +550,6 @@ PERFTEST(Transpose)
ocl::transpose(d_src, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
GPU_ON;
ocl::transpose(d_src, d_dst);
GPU_OFF;
@@ -587,8 +557,10 @@ PERFTEST(Transpose)
GPU_FULL_ON;
d_src.upload(src);
ocl::transpose(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}
@@ -597,7 +569,7 @@ PERFTEST(Transpose)
///////////// Flip ////////////////////////
PERFTEST(Flip)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -623,11 +595,6 @@ PERFTEST(Flip)
ocl::flip(d_src, d_dst, 0);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
GPU_ON;
ocl::flip(d_src, d_dst, 0);
GPU_OFF;
@@ -635,8 +602,10 @@ PERFTEST(Flip)
GPU_FULL_ON;
d_src.upload(src);
ocl::flip(d_src, d_dst, 0);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}
@@ -671,7 +640,10 @@ PERFTEST(minMax)
ocl::minMax(d_src, &min_val_, &max_val_);
WARMUP_OFF;
TestSystem::instance().setAccurate(EeceptDoubleEQ<double>(max_val_, max_val)&&EeceptDoubleEQ<double>(min_val_, min_val));
if(EeceptDoubleEQ<double>(max_val_, max_val) && EeceptDoubleEQ<double>(min_val_, min_val))
TestSystem::instance().setAccurate(1, max(fabs(max_val_-max_val), fabs(min_val_-min_val)));
else
TestSystem::instance().setAccurate(0, max(fabs(max_val_-max_val), fabs(min_val_-min_val)));
GPU_ON;
ocl::minMax(d_src, &min_val, &max_val);
@@ -724,8 +696,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<unsigned char>(min_loc_);
maxlocVal = src.at<unsigned char>(max_loc);
maxlocVal_ = src.at<unsigned char>(max_loc_);
error0 = ::abs(src.at<unsigned char>(min_loc_) - src.at<unsigned char>(min_loc));
error1 = ::abs(src.at<unsigned char>(max_loc_) - src.at<unsigned char>(max_loc));
}
if(src.depth() == 1)
{
@@ -733,8 +703,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<signed char>(min_loc_);
maxlocVal = src.at<signed char>(max_loc);
maxlocVal_ = src.at<signed char>(max_loc_);
error0 = ::abs(src.at<signed char>(min_loc_) - src.at<signed char>(min_loc));
error1 = ::abs(src.at<signed char>(max_loc_) - src.at<signed char>(max_loc));
}
if(src.depth() == 2)
{
@@ -742,8 +710,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<unsigned short>(min_loc_);
maxlocVal = src.at<unsigned short>(max_loc);
maxlocVal_ = src.at<unsigned short>(max_loc_);
error0 = ::abs(src.at<unsigned short>(min_loc_) - src.at<unsigned short>(min_loc));
error1 = ::abs(src.at<unsigned short>(max_loc_) - src.at<unsigned short>(max_loc));
}
if(src.depth() == 3)
{
@@ -751,8 +717,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<signed short>(min_loc_);
maxlocVal = src.at<signed short>(max_loc);
maxlocVal_ = src.at<signed short>(max_loc_);
error0 = ::abs(src.at<signed short>(min_loc_) - src.at<signed short>(min_loc));
error1 = ::abs(src.at<signed short>(max_loc_) - src.at<signed short>(max_loc));
}
if(src.depth() == 4)
{
@@ -760,8 +724,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<int>(min_loc_);
maxlocVal = src.at<int>(max_loc);
maxlocVal_ = src.at<int>(max_loc_);
error0 = ::abs(src.at<int>(min_loc_) - src.at<int>(min_loc));
error1 = ::abs(src.at<int>(max_loc_) - src.at<int>(max_loc));
}
if(src.depth() == 5)
{
@@ -769,8 +731,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<float>(min_loc_);
maxlocVal = src.at<float>(max_loc);
maxlocVal_ = src.at<float>(max_loc_);
error0 = ::abs(src.at<float>(min_loc_) - src.at<float>(min_loc));
error1 = ::abs(src.at<float>(max_loc_) - src.at<float>(max_loc));
}
if(src.depth() == 6)
{
@@ -778,16 +738,16 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<double>(min_loc_);
maxlocVal = src.at<double>(max_loc);
maxlocVal_ = src.at<double>(max_loc_);
error0 = ::abs(src.at<double>(min_loc_) - src.at<double>(min_loc));
error1 = ::abs(src.at<double>(max_loc_) - src.at<double>(max_loc));
}
TestSystem::instance().setAccurate(EeceptDoubleEQ<double>(error1, 0.0)
&&EeceptDoubleEQ<double>(error0, 0.0)
&&EeceptDoubleEQ<double>(maxlocVal_, maxlocVal)
error0 = ::abs(minlocVal_ - minlocVal);
error1 = ::abs(maxlocVal_ - maxlocVal);
if( EeceptDoubleEQ<double>(maxlocVal_, maxlocVal)
&&EeceptDoubleEQ<double>(minlocVal_, minlocVal)
&&EeceptDoubleEQ<double>(max_val_, max_val)
&&EeceptDoubleEQ<double>(min_val_, min_val));
&&EeceptDoubleEQ<double>(min_val_, min_val))
TestSystem::instance().setAccurate(1, 0.);
else
TestSystem::instance().setAccurate(0, max(error0, error1));
GPU_ON;
ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
@@ -831,11 +791,13 @@ PERFTEST(Sum)
gpures = ocl::sum(d_src);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExceptDoubleNear(cpures[3], gpures[3], 0.1)
&&ExceptDoubleNear(cpures[2], gpures[2], 0.1)
&&ExceptDoubleNear(cpures[1], gpures[1], 0.1)
&&ExceptDoubleNear(cpures[0], gpures[0], 0.1));
vector<double> diffs(4);
diffs[3] = fabs(cpures[3] - gpures[3]);
diffs[2] = fabs(cpures[2] - gpures[2]);
diffs[1] = fabs(cpures[1] - gpures[1]);
diffs[0] = fabs(cpures[0] - gpures[0]);
double max_diff = *max_element(diffs.begin(), diffs.end());
TestSystem::instance().setAccurate(max_diff<0.1?1:0, max_diff);
GPU_ON;
gpures = ocl::sum(d_src);
@@ -879,7 +841,11 @@ PERFTEST(countNonZero)
gpures = ocl::countNonZero(d_src);
WARMUP_OFF;
TestSystem::instance().setAccurate((EeceptDoubleEQ<double>((double)cpures, (double)gpures)));
int diff = abs(cpures - gpures);
if(diff == 0)
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, diff);
GPU_ON;
ocl::countNonZero(d_src);
@@ -897,7 +863,7 @@ PERFTEST(countNonZero)
///////////// Phase ////////////////////////
PERFTEST(Phase)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_32FC1};
@@ -913,12 +879,12 @@ PERFTEST(Phase)
gen(src2, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
phase(src1, src2, dst, 1);
CPU_ON;
phase(src1, src2, dst, 1);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -926,11 +892,6 @@ PERFTEST(Phase)
ocl::phase(d_src1, d_src2, d_dst, 1);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-2));
GPU_ON;
ocl::phase(d_src1, d_src2, d_dst, 1);
GPU_OFF;
@@ -939,8 +900,10 @@ PERFTEST(Phase)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::phase(d_src1, d_src2, d_dst, 1);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-2);
}
}
@@ -949,7 +912,7 @@ PERFTEST(Phase)
///////////// bitwise_and////////////////////////
PERFTEST(bitwise_and)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_32SC1};
@@ -965,7 +928,6 @@ PERFTEST(bitwise_and)
gen(src2, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
bitwise_and(src1, src2, dst);
CPU_ON;
@@ -978,11 +940,6 @@ PERFTEST(bitwise_and)
ocl::bitwise_and(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::bitwise_and(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -991,8 +948,10 @@ PERFTEST(bitwise_and)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::bitwise_and(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -1001,7 +960,7 @@ PERFTEST(bitwise_and)
///////////// bitwise_not////////////////////////
PERFTEST(bitwise_not)
{
Mat src1, dst;
Mat src1, dst, ocl_dst;
ocl::oclMat d_src1, d_dst;
int all_type[] = {CV_8UC1, CV_32SC1};
@@ -1016,7 +975,6 @@ PERFTEST(bitwise_not)
gen(src1, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
bitwise_not(src1, dst);
CPU_ON;
@@ -1028,11 +986,6 @@ PERFTEST(bitwise_not)
ocl::bitwise_not(d_src1, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::bitwise_not(d_src1, d_dst);
GPU_OFF;
@@ -1040,8 +993,10 @@ PERFTEST(bitwise_not)
GPU_FULL_ON;
d_src1.upload(src1);
ocl::bitwise_not(d_src1, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -1050,7 +1005,7 @@ PERFTEST(bitwise_not)
///////////// compare////////////////////////
PERFTEST(compare)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int CMP_EQ = 0;
@@ -1067,12 +1022,12 @@ PERFTEST(compare)
gen(src2, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
compare(src1, src2, dst, CMP_EQ);
CPU_ON;
compare(src1, src2, dst, CMP_EQ);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -1080,11 +1035,6 @@ PERFTEST(compare)
ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
GPU_OFF;
@@ -1093,8 +1043,10 @@ PERFTEST(compare)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -1103,7 +1055,7 @@ PERFTEST(compare)
///////////// pow ////////////////////////
PERFTEST(pow)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_32FC1};
@@ -1129,11 +1081,6 @@ PERFTEST(pow)
ocl::pow(d_src, -2.0, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1.0));
GPU_ON;
ocl::pow(d_src, -2.0, d_dst);
GPU_OFF;
@@ -1141,8 +1088,10 @@ PERFTEST(pow)
GPU_FULL_ON;
d_src.upload(src);
ocl::pow(d_src, -2.0, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
}
}
@@ -1151,7 +1100,7 @@ PERFTEST(pow)
///////////// MagnitudeSqr////////////////////////
PERFTEST(MagnitudeSqr)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_32FC1};
@@ -1167,53 +1116,36 @@ PERFTEST(MagnitudeSqr)
gen(src2, size, size, all_type[t], 0, 256);
gen(dst, size, size, all_type[t], 0, 256);
CPU_ON;
for (int i = 0; i < src1.rows; ++i)
for (int j = 0; j < src1.cols; ++j)
{
float val1 = src1.at<float>(i, j);
float val2 = src2.at<float>(i, j);
((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
}
CPU_OFF;
CPU_ON;
d_src1.upload(src1);
d_src2.upload(src2);
for (int i = 0; i < src1.rows; ++i)
for (int j = 0; j < src1.cols; ++j)
{
float val1 = src1.at<float>(i, j);
float val2 = src2.at<float>(i, j);
WARMUP_ON;
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
WARMUP_OFF;
((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
GPU_ON;
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
GPU_OFF;
}
GPU_FULL_ON;
d_src1.upload(src1);
d_src2.upload(src2);
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
WARMUP_ON;
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1.0));
GPU_ON;
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
GPU_OFF;
GPU_FULL_ON;
d_src1.upload(src1);
d_src2.upload(src2);
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
d_dst.download(dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
}
}
@@ -1222,7 +1154,7 @@ PERFTEST(MagnitudeSqr)
///////////// AddWeighted////////////////////////
PERFTEST(AddWeighted)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
double alpha = 2.0, beta = 1.0, gama = 3.0;
@@ -1252,11 +1184,6 @@ PERFTEST(AddWeighted)
ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
GPU_ON;
ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
GPU_OFF;
@@ -1265,8 +1192,10 @@ PERFTEST(AddWeighted)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}

View File

@@ -71,7 +71,7 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we
}
PERFTEST(blend)
{
Mat src1, src2, weights1, weights2, dst;
Mat src1, src2, weights1, weights2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -103,10 +103,6 @@ PERFTEST(blend)
ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat;
d_dst.download(ocl_mat);
TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 1.f));
GPU_ON;
ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
GPU_OFF;
@@ -117,8 +113,10 @@ PERFTEST(blend)
d_weights1.upload(weights1);
d_weights2.upload(weights2);
ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.f);
}
}
}

View File

@@ -88,9 +88,6 @@ PERFTEST(BruteForceMatcher)
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
WARMUP_OFF;
d_matcher.match(d_query, d_train, d_matches[0]);
TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
GPU_ON;
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
GPU_OFF;
@@ -98,9 +95,15 @@ PERFTEST(BruteForceMatcher)
GPU_FULL_ON;
d_query.upload(query);
d_train.upload(train);
d_matcher.match(d_query, d_train, matches[0]);
d_matcher.match(d_query, d_train, d_matches[0]);
GPU_FULL_OFF;
int diff = abs((int)d_matches[0].size() - (int)matches[0].size());
if(diff == 0)
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, diff);
SUBTEST << size << "; knnMatch";
matcher.knnMatch(query, train, matches, 2);
@@ -123,7 +126,11 @@ PERFTEST(BruteForceMatcher)
d_matcher.knnMatch(d_query, d_train, d_matches, 2);
GPU_FULL_OFF;
TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
diff = abs((int)d_matches[0].size() - (int)matches[0].size());
if(diff == 0)
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, diff);
SUBTEST << size << "; radiusMatch";
@@ -151,6 +158,10 @@ PERFTEST(BruteForceMatcher)
d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
GPU_FULL_OFF;
TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
diff = abs((int)d_matches[0].size() - (int)matches[0].size());
if(diff == 0)
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, diff);
}
}

View File

@@ -57,7 +57,7 @@ PERFTEST(Canny)
SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";
Mat edges(img.size(), CV_8UC1);
Mat edges(img.size(), CV_8UC1), ocl_edges;
CPU_ON;
Canny(img, edges, 50.0, 100.0);
@@ -71,8 +71,6 @@ PERFTEST(Canny)
ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExceptedMatSimilar(edges, d_edges, 2e-2));
GPU_ON;
ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
GPU_OFF;
@@ -80,6 +78,8 @@ PERFTEST(Canny)
GPU_FULL_ON;
d_img.upload(img);
ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
d_edges.download(edges);
d_edges.download(ocl_edges);
GPU_FULL_OFF;
TestSystem::instance().ExceptedMatSimilar(edges, ocl_edges, 2e-2);
}

View File

@@ -48,7 +48,7 @@
///////////// cvtColor////////////////////////
PERFTEST(cvtColor)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC4};
@@ -73,10 +73,6 @@ PERFTEST(cvtColor)
ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
WARMUP_OFF;
cv::Mat ocl_mat;
d_dst.download(ocl_mat);
TestSystem::instance().setAccurate(ExceptedMatSimilar(dst, ocl_mat, 1e-5));
GPU_ON;
ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
GPU_OFF;
@@ -84,8 +80,10 @@ PERFTEST(cvtColor)
GPU_FULL_ON;
d_src.upload(src);
ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExceptedMatSimilar(dst, ocl_dst, 1e-5);
}

View File

@@ -48,7 +48,7 @@
///////////// columnSum////////////////////////
PERFTEST(columnSum)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -63,23 +63,16 @@ PERFTEST(columnSum)
dst.at<float>(0, j) = src.at<float>(0, j);
for (int i = 1; i < src.rows; ++i)
{for (int j = 0; j < src.cols; ++j)
{
for (int j = 0; j < src.cols; ++j)
dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
}
}
CPU_OFF;
d_src.upload(src);
WARMUP_ON;
ocl::columnSum(d_src, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat;
d_dst.download(ocl_mat);
TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 5e-1));
GPU_ON;
ocl::columnSum(d_src, d_dst);
GPU_OFF;
@@ -87,7 +80,9 @@ PERFTEST(columnSum)
GPU_FULL_ON;
d_src.upload(src);
ocl::columnSum(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 5e-1);
}
}

View File

@@ -48,7 +48,7 @@
///////////// dft ////////////////////////
PERFTEST(dft)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_32FC2};
@@ -74,8 +74,6 @@ PERFTEST(dft)
ocl::dft(d_src, d_dst, Size(size, size));
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), src.size().area() * 1e-4));
GPU_ON;
ocl::dft(d_src, d_dst, Size(size, size));
GPU_OFF;
@@ -83,8 +81,10 @@ PERFTEST(dft)
GPU_FULL_ON;
d_src.upload(src);
ocl::dft(d_src, d_dst, Size(size, size));
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, src.size().area() * 1e-4);
}
}

View File

@@ -48,7 +48,7 @@
///////////// Blur////////////////////////
PERFTEST(Blur)
{
Mat src1, dst;
Mat src1, dst, ocl_dst;
ocl::oclMat d_src1, d_dst;
Size ksize = Size(3, 3);
@@ -65,7 +65,6 @@ PERFTEST(Blur)
gen(src1, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
blur(src1, dst, ksize, Point(-1, -1), bordertype);
CPU_ON;
@@ -78,8 +77,6 @@ PERFTEST(Blur)
ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1.0));
GPU_ON;
ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
GPU_OFF;
@@ -87,8 +84,10 @@ PERFTEST(Blur)
GPU_FULL_ON;
d_src1.upload(src1);
ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
}
}
@@ -96,7 +95,7 @@ PERFTEST(Blur)
///////////// Laplacian////////////////////////
PERFTEST(Laplacian)
{
Mat src1, dst;
Mat src1, dst, ocl_dst;
ocl::oclMat d_src1, d_dst;
int ksize = 3;
@@ -112,7 +111,6 @@ PERFTEST(Laplacian)
gen(src1, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
Laplacian(src1, dst, -1, ksize, 1);
CPU_ON;
@@ -125,8 +123,6 @@ PERFTEST(Laplacian)
ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));
GPU_ON;
ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
GPU_OFF;
@@ -134,8 +130,10 @@ PERFTEST(Laplacian)
GPU_FULL_ON;
d_src1.upload(src1);
ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}
@@ -144,7 +142,7 @@ PERFTEST(Laplacian)
///////////// Erode ////////////////////
PERFTEST(Erode)
{
Mat src, dst, ker;
Mat src, dst, ker, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
@@ -171,8 +169,6 @@ PERFTEST(Erode)
ocl::erode(d_src, d_dst, ker);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));
GPU_ON;
ocl::erode(d_src, d_dst, ker);
GPU_OFF;
@@ -180,8 +176,10 @@ PERFTEST(Erode)
GPU_FULL_ON;
d_src.upload(src);
ocl::erode(d_src, d_dst, ker);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}
@@ -190,7 +188,7 @@ PERFTEST(Erode)
///////////// Sobel ////////////////////////
PERFTEST(Sobel)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int dx = 1;
@@ -218,8 +216,6 @@ PERFTEST(Sobel)
ocl::Sobel(d_src, d_dst, -1, dx, dy);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1));
GPU_ON;
ocl::Sobel(d_src, d_dst, -1, dx, dy);
GPU_OFF;
@@ -227,8 +223,10 @@ PERFTEST(Sobel)
GPU_FULL_ON;
d_src.upload(src);
ocl::Sobel(d_src, d_dst, -1, dx, dy);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
}
}
@@ -236,7 +234,7 @@ PERFTEST(Sobel)
///////////// Scharr ////////////////////////
PERFTEST(Scharr)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int dx = 1;
@@ -264,8 +262,6 @@ PERFTEST(Scharr)
ocl::Scharr(d_src, d_dst, -1, dx, dy);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1));
GPU_ON;
ocl::Scharr(d_src, d_dst, -1, dx, dy);
GPU_OFF;
@@ -273,8 +269,10 @@ PERFTEST(Scharr)
GPU_FULL_ON;
d_src.upload(src);
ocl::Scharr(d_src, d_dst, -1, dx, dy);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
}
}
@@ -283,7 +281,7 @@ PERFTEST(Scharr)
///////////// GaussianBlur ////////////////////////
PERFTEST(GaussianBlur)
{
Mat src, dst;
Mat src, dst, ocl_dst;
int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
@@ -311,9 +309,6 @@ PERFTEST(GaussianBlur)
ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1.0));
GPU_ON;
ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
GPU_OFF;
@@ -321,8 +316,10 @@ PERFTEST(GaussianBlur)
GPU_FULL_ON;
d_src.upload(src);
ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
}
}
@@ -349,7 +346,7 @@ PERFTEST(filter2D)
Mat kernel;
gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
Mat dst(src);
Mat dst, ocl_dst;
dst.setTo(0);
cv::filter2D(src, dst, -1, kernel);
@@ -357,17 +354,12 @@ PERFTEST(filter2D)
cv::filter2D(src, dst, -1, kernel);
CPU_OFF;
ocl::oclMat d_src(src);
ocl::oclMat d_dst(d_src);
d_dst.setTo(0);
ocl::oclMat d_src(src), d_dst;
WARMUP_ON;
ocl::filter2D(d_src, d_dst, -1, kernel);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));
GPU_ON;
ocl::filter2D(d_src, d_dst, -1, kernel);
GPU_OFF;
@@ -375,8 +367,10 @@ PERFTEST(filter2D)
GPU_FULL_ON;
d_src.upload(src);
ocl::filter2D(d_src, d_dst, -1, kernel);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}

View File

@@ -48,7 +48,7 @@
///////////// gemm ////////////////////////
PERFTEST(gemm)
{
Mat src1, src2, src3, dst;
Mat src1, src2, src3, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_src3, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -72,7 +72,6 @@ PERFTEST(gemm)
WARMUP_ON;
ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, src1.cols * src1.rows * 1e-4));
GPU_ON;
ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
@@ -83,7 +82,9 @@ PERFTEST(gemm)
d_src2.upload(src2);
d_src3.upload(src3);
ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, src1.cols * src1.rows * 1e-4);
}
}

View File

@@ -125,8 +125,10 @@ PERFTEST(Haar)
1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
WARMUP_OFF;
//Testing whether the expected is equal to the actual.
TestSystem::instance().setAccurate(ExpectedEQ<vector<Rect>::size_type, vector<Rect>::size_type>(faces.size(), oclfaces.size()));
if(faces.size() == oclfaces.size())
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, abs((int)faces.size() - (int)oclfaces.size()));
faces.clear();

View File

@@ -146,10 +146,8 @@ PERFTEST(HOG)
}
}
cv::Mat ocl_mat;
ocl_mat = cv::Mat(d_comp);
ocl_mat.convertTo(ocl_mat, cv::Mat(comp).type());
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat, cv::Mat(comp), 3));
cv::Mat gpu_rst(d_comp), cpu_rst(comp);
TestSystem::instance().ExpectedMatNear(gpu_rst, cpu_rst, 3);
GPU_ON;
ocl_hog.detectMultiScale(d_src, found_locations);

View File

@@ -48,7 +48,7 @@
///////////// equalizeHist ////////////////////////
PERFTEST(equalizeHist)
{
Mat src, dst;
Mat src, dst, ocl_dst;
int all_type[] = {CV_8UC1};
std::string type_name[] = {"CV_8UC1"};
@@ -75,9 +75,6 @@ PERFTEST(equalizeHist)
ocl::equalizeHist(d_src, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.1));
GPU_ON;
ocl::equalizeHist(d_src, d_dst);
GPU_OFF;
@@ -85,8 +82,10 @@ PERFTEST(equalizeHist)
GPU_FULL_ON;
d_src.upload(src);
ocl::equalizeHist(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.1);
}
}
@@ -94,7 +93,7 @@ PERFTEST(equalizeHist)
/////////// CopyMakeBorder //////////////////////
PERFTEST(CopyMakeBorder)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_dst;
int bordertype = BORDER_CONSTANT;
@@ -122,9 +121,6 @@ PERFTEST(CopyMakeBorder)
ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
GPU_ON;
ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
GPU_OFF;
@@ -132,8 +128,10 @@ PERFTEST(CopyMakeBorder)
GPU_FULL_ON;
d_src.upload(src);
ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
@@ -141,7 +139,7 @@ PERFTEST(CopyMakeBorder)
///////////// cornerMinEigenVal ////////////////////////
PERFTEST(cornerMinEigenVal)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_dst;
int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
@@ -155,7 +153,6 @@ PERFTEST(cornerMinEigenVal)
{
SUBTEST << size << 'x' << size << "; " << type_name[j] ;
gen(src, size, size, all_type[j], 0, 256);
cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
@@ -170,9 +167,6 @@ PERFTEST(cornerMinEigenVal)
ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
GPU_OFF;
@@ -180,8 +174,10 @@ PERFTEST(cornerMinEigenVal)
GPU_FULL_ON;
d_src.upload(src);
ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -189,7 +185,7 @@ PERFTEST(cornerMinEigenVal)
///////////// cornerHarris ////////////////////////
PERFTEST(cornerHarris)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_32FC1};
@@ -215,8 +211,6 @@ PERFTEST(cornerHarris)
ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
GPU_OFF;
@@ -224,8 +218,10 @@ PERFTEST(cornerHarris)
GPU_FULL_ON;
d_src.upload(src);
ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
@@ -234,7 +230,7 @@ PERFTEST(cornerHarris)
///////////// integral ////////////////////////
PERFTEST(integral)
{
Mat src, sum;
Mat src, sum, ocl_sum;
ocl::oclMat d_src, d_sum, d_buf;
int all_type[] = {CV_8UC1};
@@ -260,12 +256,6 @@ PERFTEST(integral)
ocl::integral(d_src, d_sum);
WARMUP_OFF;
cv::Mat ocl_mat;
d_sum.download(ocl_mat);
if(sum.type() == ocl_mat.type()) //we won't test accuracy when cpu function overlow
TestSystem::instance().setAccurate(ExpectedMatNear(sum, ocl_mat, 0.0));
GPU_ON;
ocl::integral(d_src, d_sum);
GPU_OFF;
@@ -273,8 +263,12 @@ PERFTEST(integral)
GPU_FULL_ON;
d_src.upload(src);
ocl::integral(d_src, d_sum);
d_sum.download(sum);
d_sum.download(ocl_sum);
GPU_FULL_OFF;
if(sum.type() == ocl_sum.type()) //we won't test accuracy when cpu function overlow
TestSystem::instance().ExpectedMatNear(sum, ocl_sum, 0.0);
}
}
@@ -282,7 +276,7 @@ PERFTEST(integral)
///////////// WarpAffine ////////////////////////
PERFTEST(WarpAffine)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
static const double coeffs[2][3] =
@@ -319,8 +313,6 @@ PERFTEST(WarpAffine)
ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
GPU_OFF;
@@ -328,8 +320,10 @@ PERFTEST(WarpAffine)
GPU_FULL_ON;
d_src.upload(src);
ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -337,7 +331,7 @@ PERFTEST(WarpAffine)
///////////// WarpPerspective ////////////////////////
PERFTEST(WarpPerspective)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
static const double coeffs[3][3] =
@@ -374,8 +368,6 @@ PERFTEST(WarpPerspective)
ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
GPU_OFF;
@@ -383,8 +375,10 @@ PERFTEST(WarpPerspective)
GPU_FULL_ON;
d_src.upload(src);
ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -393,7 +387,7 @@ PERFTEST(WarpPerspective)
///////////// resize ////////////////////////
PERFTEST(resize)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
@@ -420,9 +414,6 @@ PERFTEST(resize)
ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
GPU_OFF;
@@ -430,8 +421,10 @@ PERFTEST(resize)
GPU_FULL_ON;
d_src.upload(src);
ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -456,8 +449,6 @@ PERFTEST(resize)
ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
GPU_OFF;
@@ -465,8 +456,10 @@ PERFTEST(resize)
GPU_FULL_ON;
d_src.upload(src);
ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -474,10 +467,9 @@ PERFTEST(resize)
///////////// threshold////////////////////////
PERFTEST(threshold)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
{
SUBTEST << size << 'x' << size << "; 8UC1; THRESH_BINARY";
@@ -496,9 +488,6 @@ PERFTEST(threshold)
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
GPU_OFF;
@@ -506,9 +495,10 @@ PERFTEST(threshold)
GPU_FULL_ON;
d_src.upload(src);
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -529,8 +519,6 @@ PERFTEST(threshold)
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
GPU_OFF;
@@ -538,8 +526,10 @@ PERFTEST(threshold)
GPU_FULL_ON;
d_src.upload(src);
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
///////////// meanShiftFiltering////////////////////////
@@ -726,7 +716,7 @@ void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::T
PERFTEST(meanShiftFiltering)
{
int sp = 5, sr = 6;
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
@@ -753,11 +743,6 @@ PERFTEST(meanShiftFiltering)
ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
WARMUP_OFF;
cv::Mat ocl_mat;
d_dst.download(ocl_mat);
TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 0.0));
GPU_ON;
ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
GPU_OFF;
@@ -765,8 +750,10 @@ PERFTEST(meanShiftFiltering)
GPU_FULL_ON;
d_src.upload(src);
ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
///////////// meanShiftProc////////////////////////
@@ -1010,8 +997,9 @@ void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp,
}
PERFTEST(meanShiftProc)
{
Mat src, dst, dstCoor_roi;
ocl::oclMat d_src, d_dst, d_dstCoor_roi;
Mat src;
vector<Mat> dst(2), ocl_dst(2);
ocl::oclMat d_src, d_dst, d_dstCoor;
TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);
@@ -1020,42 +1008,41 @@ PERFTEST(meanShiftProc)
SUBTEST << size << 'x' << size << "; 8UC4 and CV_16SC2 ";
gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
gen(dst, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
gen(dstCoor_roi, size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
gen(dst[0], size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
gen(dst[1], size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);
CPU_ON;
meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);
CPU_OFF;
d_src.upload(src);
WARMUP_ON;
ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dstCoor_roi, cv::Mat(d_dstCoor_roi), 0.0)
&&ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
GPU_ON;
ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
GPU_OFF;
GPU_FULL_ON;
d_src.upload(src);
ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
d_dst.download(dst);
d_dstCoor_roi.download(dstCoor_roi);
ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
d_dst.download(ocl_dst[0]);
d_dstCoor.download(ocl_dst[1]);
GPU_FULL_OFF;
vector<double> eps(2, 0.);
TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
}
}
///////////// remap////////////////////////
PERFTEST(remap)
{
Mat src, dst, xmap, ymap;
Mat src, dst, xmap, ymap, ocl_dst;
ocl::oclMat d_src, d_dst, d_xmap, d_ymap;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -1088,7 +1075,6 @@ PERFTEST(remap)
}
}
remap(src, dst, xmap, ymap, interpolation, borderMode);
CPU_ON;
@@ -1104,12 +1090,6 @@ PERFTEST(remap)
ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
WARMUP_OFF;
if(interpolation == 0)
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
else
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 2.0));
GPU_ON;
ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
GPU_OFF;
@@ -1117,8 +1097,10 @@ PERFTEST(remap)
GPU_FULL_ON;
d_src.upload(src);
ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 2.0);
}
}

View File

@@ -56,11 +56,9 @@
PERFTEST(matchTemplate)
{
//InitMatchTemplate();
Mat src, templ, dst;
Mat src, templ, dst, ocl_dst;
int templ_size = 5;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
{
int all_type[] = {CV_32FC1, CV_32FC4};
@@ -82,16 +80,12 @@ PERFTEST(matchTemplate)
matchTemplate(src, templ, dst, TM_CCORR);
CPU_OFF;
ocl::oclMat d_src(src), d_templ, d_dst;
d_templ.upload(templ);
ocl::oclMat d_src(src), d_templ(templ), d_dst;
WARMUP_ON;
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), templ.rows * templ.cols * 1e-1));
GPU_ON;
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
GPU_OFF;
@@ -100,8 +94,10 @@ PERFTEST(matchTemplate)
d_src.upload(src);
d_templ.upload(templ);
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
}
}
@@ -131,8 +127,6 @@ PERFTEST(matchTemplate)
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), templ.rows * templ.cols * 1e-1));
GPU_ON;
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
GPU_OFF;
@@ -141,8 +135,10 @@ PERFTEST(matchTemplate)
d_src.upload(src);
d_templ.upload(templ);
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
}
}
}

View File

@@ -48,7 +48,7 @@
///////////// ConvertTo////////////////////////
PERFTEST(ConvertTo)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -77,9 +77,6 @@ PERFTEST(ConvertTo)
d_src.convertTo(d_dst, CV_32FC1);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
GPU_ON;
d_src.convertTo(d_dst, CV_32FC1);
GPU_OFF;
@@ -87,8 +84,10 @@ PERFTEST(ConvertTo)
GPU_FULL_ON;
d_src.upload(src);
d_src.convertTo(d_dst, CV_32FC1);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
@@ -96,7 +95,7 @@ PERFTEST(ConvertTo)
///////////// copyTo////////////////////////
PERFTEST(copyTo)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -125,9 +124,6 @@ PERFTEST(copyTo)
d_src.copyTo(d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
GPU_ON;
d_src.copyTo(d_dst);
GPU_OFF;
@@ -135,8 +131,10 @@ PERFTEST(copyTo)
GPU_FULL_ON;
d_src.upload(src);
d_src.copyTo(d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
@@ -144,9 +142,9 @@ PERFTEST(copyTo)
///////////// setTo////////////////////////
PERFTEST(setTo)
{
Mat src, dst;
Mat src, ocl_src;
Scalar val(1, 2, 3, 4);
ocl::oclMat d_src, d_dst;
ocl::oclMat d_src;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -171,10 +169,10 @@ PERFTEST(setTo)
d_src.setTo(val);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(src, cv::Mat(d_src), 1.0));
d_src.download(ocl_src);
TestSystem::instance().ExpectedMatNear(src, ocl_src, 1.0);
GPU_ON;
GPU_ON;;
d_src.setTo(val);
GPU_OFF;

View File

@@ -48,39 +48,40 @@
///////////// norm////////////////////////
PERFTEST(norm)
{
Mat src, buf;
ocl::oclMat d_src, d_buf;
Mat src1, src2, ocl_src1;
ocl::oclMat d_src1, d_src2;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
{
SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
gen(src1, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
gen(src2, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
norm(src, NORM_INF);
norm(src1, src2, NORM_INF);
CPU_ON;
norm(src, NORM_INF);
norm(src1, src2, NORM_INF);
CPU_OFF;
d_src.upload(src);
d_buf.upload(buf);
d_src1.upload(src1);
d_src2.upload(src2);
WARMUP_ON;
ocl::norm(d_src, d_buf, NORM_INF);
ocl::norm(d_src1, d_src2, NORM_INF);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(src, cv::Mat(d_buf), .5));
d_src1.download(ocl_src1);
TestSystem::instance().ExpectedMatNear(src1, ocl_src1, .5);
GPU_ON;
ocl::norm(d_src, d_buf, NORM_INF);
ocl::norm(d_src1, d_src2, NORM_INF);
GPU_OFF;
GPU_FULL_ON;
d_src.upload(src);
ocl::norm(d_src, d_buf, NORM_INF);
d_src1.upload(src1);
d_src2.upload(src2);
ocl::norm(d_src1, d_src2, NORM_INF);
GPU_FULL_OFF;
}
}

View File

@@ -82,8 +82,8 @@ PERFTEST(PyrLKOpticalFlow)
SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
else
SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
Mat nextPts_cpu;
Mat status_cpu;
Mat ocl_nextPts;
Mat ocl_status;
vector<Point2f> pts;
goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
@@ -116,12 +116,6 @@ PERFTEST(PyrLKOpticalFlow)
d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
WARMUP_OFF;
std::vector<cv::Point2f> ocl_nextPts(d_nextPts.cols);
std::vector<unsigned char> ocl_status(d_status.cols);
TestSystem::instance().setAccurate(AssertEQ<size_t>(nextPts.size(), ocl_nextPts.size()));
TestSystem::instance().setAccurate(AssertEQ<size_t>(status.size(), ocl_status.size()));
GPU_ON;
d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
GPU_OFF;
@@ -133,17 +127,102 @@ PERFTEST(PyrLKOpticalFlow)
d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
if (!d_nextPts.empty())
{
d_nextPts.download(nextPts_cpu);
}
d_nextPts.download(ocl_nextPts);
if (!d_status.empty())
{
d_status.download(status_cpu);
}
d_status.download(ocl_status);
GPU_FULL_OFF;
size_t mismatch = 0;
for (int i = 0; i < (int)nextPts.size(); ++i)
{
if(status[i] != ocl_status.at<unsigned char>(0, i)){
mismatch++;
continue;
}
if(status[i]){
Point2f gpu_rst = ocl_nextPts.at<Point2f>(0, i);
Point2f cpu_rst = nextPts[i];
if(fabs(gpu_rst.x - cpu_rst.x) >= 1. || fabs(gpu_rst.y - cpu_rst.y) >= 1.)
mismatch++;
}
}
double ratio = (double)mismatch / (double)nextPts.size();
if(ratio < .02)
TestSystem::instance().setAccurate(1, ratio);
else
TestSystem::instance().setAccurate(0, ratio);
}
}
}
PERFTEST(tvl1flow)
{
cv::Mat frame0 = imread("rubberwhale1.png", cv::IMREAD_GRAYSCALE);
assert(!frame0.empty());
cv::Mat frame1 = imread("rubberwhale2.png", cv::IMREAD_GRAYSCALE);
assert(!frame1.empty());
cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
cv::ocl::oclMat d_flowx(frame0.size(), CV_32FC1);
cv::ocl::oclMat d_flowy(frame1.size(), CV_32FC1);
cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
cv::Mat flow;
SUBTEST << frame0.cols << 'x' << frame0.rows << "; rubberwhale1.png; "<<frame1.cols<<'x'<<frame1.rows<<"; rubberwhale2.png";
alg->calc(frame0, frame1, flow);
CPU_ON;
alg->calc(frame0, frame1, flow);
CPU_OFF;
cv::Mat gold[2];
cv::split(flow, gold);
cv::ocl::oclMat d0(frame0.size(), CV_32FC1);
d0.upload(frame0);
cv::ocl::oclMat d1(frame1.size(), CV_32FC1);
d1.upload(frame1);
WARMUP_ON;
d_alg(d0, d1, d_flowx, d_flowy);
WARMUP_OFF;
/*
double diff1 = 0.0, diff2 = 0.0;
if(ExceptedMatSimilar(gold[0], cv::Mat(d_flowx), 3e-3, diff1) == 1
&&ExceptedMatSimilar(gold[1], cv::Mat(d_flowy), 3e-3, diff2) == 1)
TestSystem::instance().setAccurate(1);
else
TestSystem::instance().setAccurate(0);
TestSystem::instance().setDiff(diff1);
TestSystem::instance().setDiff(diff2);
*/
GPU_ON;
d_alg(d0, d1, d_flowx, d_flowy);
d_alg.collectGarbage();
GPU_OFF;
cv::Mat flowx, flowy;
GPU_FULL_ON;
d0.upload(frame0);
d1.upload(frame1);
d_alg(d0, d1, d_flowx, d_flowy);
d_alg.collectGarbage();
d_flowx.download(flowx);
d_flowy.download(flowy);
GPU_FULL_OFF;
TestSystem::instance().ExceptedMatSimilar(gold[0], flowx, 3e-3);
TestSystem::instance().ExceptedMatSimilar(gold[1], flowy, 3e-3);
}

View File

@@ -48,7 +48,7 @@
///////////// pyrDown //////////////////////
PERFTEST(pyrDown)
{
Mat src, dst;
Mat src, dst, ocl_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -73,9 +73,6 @@ PERFTEST(pyrDown)
ocl::pyrDown(d_src, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), dst.depth() == CV_32F ? 1e-4f : 1.0f));
GPU_ON;
ocl::pyrDown(d_src, d_dst);
GPU_OFF;
@@ -83,8 +80,53 @@ PERFTEST(pyrDown)
GPU_FULL_ON;
d_src.upload(src);
ocl::pyrDown(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, dst.depth() == CV_32F ? 1e-4f : 1.0f);
}
}
}
///////////// pyrUp ////////////////////////
PERFTEST(pyrUp)
{
Mat src, dst, ocl_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
for (int size = 500; size <= 2000; size *= 2)
{
for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
{
SUBTEST << size << 'x' << size << "; " << type_name[j] ;
gen(src, size, size, all_type[j], 0, 256);
pyrUp(src, dst);
CPU_ON;
pyrUp(src, dst);
CPU_OFF;
ocl::oclMat d_src(src);
ocl::oclMat d_dst;
WARMUP_ON;
ocl::pyrUp(d_src, d_dst);
WARMUP_OFF;
GPU_ON;
ocl::pyrUp(d_src, d_dst);
GPU_OFF;
GPU_FULL_ON;
d_src.upload(src);
ocl::pyrUp(d_src, d_dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, (src.depth() == CV_32F ? 1e-4f : 1.0));
}
}
}

View File

@@ -1,89 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Fangfang Bai, fangfang@multicorewareinc.com
// Jin Ma, jin@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
///////////// pyrUp ////////////////////////
PERFTEST(pyrUp)
{
Mat src, dst;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
for (int size = 500; size <= 2000; size *= 2)
{
for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
{
SUBTEST << size << 'x' << size << "; " << type_name[j] ;
gen(src, size, size, all_type[j], 0, 256);
pyrUp(src, dst);
CPU_ON;
pyrUp(src, dst);
CPU_OFF;
ocl::oclMat d_src(src);
ocl::oclMat d_dst;
WARMUP_ON;
ocl::pyrUp(d_src, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), (src.depth() == CV_32F ? 1e-4f : 1.0)));
GPU_ON;
ocl::pyrUp(d_src, d_dst);
GPU_OFF;
GPU_FULL_ON;
d_src.upload(src);
ocl::pyrUp(d_src, d_dst);
d_dst.download(dst);
GPU_FULL_OFF;
}
}
}

View File

@@ -48,7 +48,7 @@
///////////// Merge////////////////////////
PERFTEST(Merge)
{
Mat dst;
Mat dst, ocl_dst;
ocl::oclMat d_dst;
int channels = 4;
@@ -85,22 +85,20 @@ PERFTEST(Merge)
ocl::merge(d_src, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(dst), cv::Mat(d_dst), 0.0));
GPU_ON;
ocl::merge(d_src, d_dst);
GPU_OFF;
GPU_FULL_ON;
for (int i = 0; i < channels; ++i)
{
d_src[i] = ocl::oclMat(size1, CV_8U, cv::Scalar::all(i));
d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
}
ocl::merge(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
@@ -122,7 +120,7 @@ PERFTEST(Split)
Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
std::vector<cv::Mat> dst;
std::vector<cv::Mat> dst, ocl_dst(4);
split(src, dst);
@@ -135,22 +133,7 @@ PERFTEST(Split)
WARMUP_ON;
ocl::split(d_src, d_dst);
WARMUP_OFF;
if(d_dst.size() == dst.size())
{
TestSystem::instance().setAccurate(1);
for(size_t i = 0; i < dst.size(); i++)
{
if(ExpectedMatNear(dst[i], cv::Mat(d_dst[i]), 0.0) == 0)
{
TestSystem::instance().setAccurate(0);
break;
}
}
}else
TestSystem::instance().setAccurate(0);
WARMUP_OFF;
GPU_ON;
ocl::split(d_src, d_dst);
@@ -159,7 +142,12 @@ PERFTEST(Split)
GPU_FULL_ON;
d_src.upload(src);
ocl::split(d_src, d_dst);
for(size_t i = 0; i < dst.size(); i++)
d_dst[i].download(ocl_dst[i]);
GPU_FULL_OFF;
vector<double> eps(4, 0.);
TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
}
}

View File

@@ -114,7 +114,6 @@ void TestSystem::finishCurrentSubtest()
return;
}
int is_accurate = is_accurate_;
double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
@@ -171,8 +170,8 @@ void TestSystem::finishCurrentSubtest()
deviation = std::sqrt(sum / gpu_times_.size());
}
printMetrics(is_accurate, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
writeMetrics(is_accurate, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
printMetrics(is_accurate_, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
num_subtests_called_++;
resetCurrentSubtest();
@@ -219,7 +218,7 @@ void TestSystem::writeHeading()
}
}
fprintf(record_, "NAME,DESCRIPTION,ACCURACY,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
fprintf(record_, "NAME,DESCRIPTION,ACCURACY,DIFFERENCE,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
fflush(record_);
}
@@ -392,7 +391,7 @@ void TestSystem::printMetrics(int is_accurate, double cpu_time, double gpu_time,
#endif
}
void TestSystem::writeMetrics(int is_accurate, double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
{
if (!record_)
{
@@ -402,21 +401,24 @@ void TestSystem::writeMetrics(int is_accurate, double cpu_time, double gpu_time,
string _is_accurate_;
if(is_accurate == 1)
if(is_accurate_ == 1)
_is_accurate_ = "Pass";
else if(is_accurate == 0)
else if(is_accurate_ == 0)
_is_accurate_ = "Fail";
else if(is_accurate == -1)
else if(is_accurate_ == -1)
_is_accurate_ = " ";
else
{
std::cout<<"is_accurate errer: "<<is_accurate<<"\n";
std::cout<<"is_accurate errer: "<<is_accurate_<<"\n";
exit(-1);
}
fprintf(record_, "%s,%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
fprintf(record_, "%s,%s,%s,%.2f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n",
itname_changed_ ? itname_.c_str() : "",
cur_subtest_description_.str().c_str(),
_is_accurate_.c_str(), cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
_is_accurate_.c_str(),
accurate_diff_,
cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
gpu_min, gpu_max, std_dev);
if (itname_changed_)
@@ -469,134 +471,6 @@ void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
RNG rng(0);
rng.fill(mat, RNG::UNIFORM, low, high);
}
#if 0
void gen(Mat &mat, int rows, int cols, int type, int low, int high, int n)
{
assert(n > 0&&n <= cols * rows);
assert(type == CV_8UC1||type == CV_8UC3||type == CV_8UC4
||type == CV_32FC1||type == CV_32FC3||type == CV_32FC4);
RNG rng;
//generate random position without duplication
std::vector<int> pos;
for(int i = 0; i < cols * rows; i++)
{
pos.push_back(i);
}
for(int i = 0; i < cols * rows; i++)
{
int temp = i + rng.uniform(0, cols * rows - 1 - i);
int temp1 = pos[temp];
pos[temp]= pos[i];
pos[i] = temp1;
}
std::vector<int> selected_pos;
for(int i = 0; i < n; i++)
{
selected_pos.push_back(pos[i]);
}
pos.clear();
//end of generating random y without duplication
if(type == CV_8UC1)
{
typedef struct coorStruct_
{
int x;
int y;
uchar xy;
}coorStruct;
coorStruct coor_struct;
std::vector<coorStruct> coor;
for(int i = 0; i < n; i++)
{
coor_struct.x = -1;
coor_struct.y = -1;
coor_struct.xy = (uchar)rng.uniform(low, high);
coor.push_back(coor_struct);
}
for(int i = 0; i < n; i++)
{
coor[i].y = selected_pos[i]/cols;
coor[i].x = selected_pos[i]%cols;
}
selected_pos.clear();
mat.create(rows, cols, type);
mat.setTo(0);
for(int i = 0; i < n; i++)
{
mat.at<unsigned char>(coor[i].y, coor[i].x) = coor[i].xy;
}
}
if(type == CV_8UC4 || type == CV_8UC3)
{
mat.create(rows, cols, type);
mat.setTo(0);
typedef struct Coor
{
int x;
int y;
uchar r;
uchar g;
uchar b;
uchar alpha;
}coor;
std::vector<coor> coor_vect;
coor xy_coor;
for(int i = 0; i < n; i++)
{
xy_coor.r = (uchar)rng.uniform(low, high);
xy_coor.g = (uchar)rng.uniform(low, high);
xy_coor.b = (uchar)rng.uniform(low, high);
if(type == CV_8UC4)
xy_coor.alpha = (uchar)rng.uniform(low, high);
coor_vect.push_back(xy_coor);
}
for(int i = 0; i < n; i++)
{
coor_vect[i].y = selected_pos[i]/((int)mat.step1()/mat.elemSize());
coor_vect[i].x = selected_pos[i]%((int)mat.step1()/mat.elemSize());
//printf("coor_vect[%d] = (%d, %d)\n", i, coor_vect[i].y, coor_vect[i].x);
}
if(type == CV_8UC4)
{
for(int i = 0; i < n; i++)
{
mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x) = coor_vect[i].r;
mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 1) = coor_vect[i].g;
mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 2) = coor_vect[i].b;
mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 3) = coor_vect[i].alpha;
}
}else if(type == CV_8UC3)
{
for(int i = 0; i < n; i++)
{
mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x) = coor_vect[i].r;
mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x + 1) = coor_vect[i].g;
mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x + 2) = coor_vect[i].b;
}
}
}
}
#endif
string abspath(const string &relpath)
{
@@ -619,31 +493,3 @@ double checkSimilarity(const Mat &m1, const Mat &m2)
matchTemplate(m1, m2, diff, TM_CCORR_NORMED);
return std::abs(diff.at<float>(0, 0) - 1.f);
}
int ExpectedMatNear(cv::Mat dst, cv::Mat cpu_dst, double eps)
{
assert(dst.type() == cpu_dst.type());
assert(dst.size() == cpu_dst.size());
if(checkNorm(cv::Mat(dst), cv::Mat(cpu_dst)) < eps ||checkNorm(cv::Mat(dst), cv::Mat(cpu_dst)) == eps)
return 1;
return 0;
}
int ExceptDoubleNear(double val1, double val2, double abs_error)
{
const double diff = fabs(val1 - val2);
if (diff <= abs_error)
return 1;
return 0;
}
int ExceptedMatSimilar(cv::Mat dst, cv::Mat cpu_dst, double eps)
{
assert(dst.type() == cpu_dst.type());
assert(dst.size() == cpu_dst.size());
if(checkSimilarity(cv::Mat(cpu_dst), cv::Mat(dst)) <= eps)
return 1;
return 0;
}

View File

@@ -322,9 +322,46 @@ public:
itname_changed_ = true;
}
void setAccurate(int is_accurate = -1)
void setAccurate(int accurate, double diff)
{
is_accurate_ = is_accurate;
is_accurate_ = accurate;
accurate_diff_ = diff;
}
void ExpectMatsNear(vector<Mat>& dst, vector<Mat>& cpu_dst, vector<double>& eps)
{
assert(dst.size() == cpu_dst.size());
assert(cpu_dst.size() == eps.size());
is_accurate_ = 1;
for(size_t i=0; i<dst.size(); i++)
{
double cur_diff = checkNorm(dst[i], cpu_dst[i]);
accurate_diff_ = max(accurate_diff_, cur_diff);
if(cur_diff > eps[i])
is_accurate_ = 0;
}
}
void ExpectedMatNear(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
{
assert(dst.type() == cpu_dst.type());
assert(dst.size() == cpu_dst.size());
accurate_diff_ = checkNorm(dst, cpu_dst);
if(accurate_diff_ <= eps)
is_accurate_ = 1;
else
is_accurate_ = 0;
}
void ExceptedMatSimilar(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
{
assert(dst.type() == cpu_dst.type());
assert(dst.size() == cpu_dst.size());
accurate_diff_ = checkSimilarity(cpu_dst, dst);
if(accurate_diff_ <= eps)
is_accurate_ = 1;
else
is_accurate_ = 0;
}
std::stringstream &getCurSubtestDescription()
@@ -342,7 +379,7 @@ private:
num_iters_(10), cpu_num_iters_(2),
gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
record_(0), recordname_("performance"), itname_changed_(true),
is_accurate_(-1)
is_accurate_(-1), accurate_diff_(0.)
{
cpu_times_.reserve(num_iters_);
gpu_times_.reserve(num_iters_);
@@ -363,6 +400,7 @@ private:
gpu_times_.clear();
gpu_full_times_.clear();
is_accurate_ = -1;
accurate_diff_ = 0.;
}
double meanTime(const std::vector<int64> &samples);
@@ -373,7 +411,7 @@ private:
void writeHeading();
void writeSummary();
void writeMetrics(int is_accurate, double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
void writeMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
double speedup = 0.0f, double fullspeedup = 0.0f,
double gpu_min = 0.0f, double gpu_max = 0.0f, double std_dev = 0.0f);
@@ -425,6 +463,7 @@ private:
bool itname_changed_;
int is_accurate_;
double accurate_diff_;
};

View File

@@ -412,11 +412,11 @@ static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, String kernelN
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
float f_scalar = (float)scalar;
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
args.push_back( std::make_pair( sizeof(cl_double), (void *)&scalar ));
else
{
float f_scalar = (float)scalar;
args.push_back( std::make_pair( sizeof(cl_float), (void *)&f_scalar));
}
@@ -783,45 +783,55 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
}
}
template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
const oclMat &mask, oclMat &buf)
{
size_t groupnum = src.clCxt->computeUnits();
CV_Assert(groupnum != 0);
groupnum = groupnum * 2;
int vlen = 8;
int dbsize = groupnum * 2 * vlen * sizeof(T) ;
Context *clCxt = src.clCxt;
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
*minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
if (mask.empty())
{
arithmetic_minMax_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax");
arithmetic_minMax_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax");
}
else
{
arithmetic_minMax_mask_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax_mask");
arithmetic_minMax_mask_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax_mask");
}
T *p = new T[groupnum * vlen * 2];
memset(p, 0, dbsize);
openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
if(minVal != NULL){
Mat matbuf = Mat(buf);
T *p = matbuf.ptr<T>();
if(minVal != NULL)
{
*minVal = std::numeric_limits<double>::max();
for(int i = 0; i < vlen * (int)groupnum; i++)
{
*minVal = *minVal < p[i] ? *minVal : p[i];
}
}
if(maxVal != NULL){
if(maxVal != NULL)
{
*maxVal = -std::numeric_limits<double>::max();
for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
{
*maxVal = *maxVal > p[i] ? *maxVal : p[i];
}
}
delete[] p;
openCLFree(dstBuffer);
}
typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask);
typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf);
void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
{
oclMat buf;
minMax_buf(src, minVal, maxVal, mask, buf);
}
void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
{
CV_Assert(src.oclchannels() == 1);
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
@@ -841,7 +851,7 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
};
minMaxFunc func;
func = functab[src.depth()];
func(src, minVal, maxVal, mask);
func(src, minVal, maxVal, mask, buf);
}
//////////////////////////////////////////////////////////////////////////////
@@ -1688,10 +1698,11 @@ void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String ker
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
T scalar;
if(_scalar != NULL)
{
double scalar1 = *((double *)_scalar);
T scalar = (T)scalar1;
scalar = (T)scalar1;
args.push_back( std::make_pair( sizeof(T), (void *)&scalar ));
}
@@ -2308,9 +2319,9 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, String
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
float pf = p;
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
{
float pf = p;
args.push_back( std::make_pair( sizeof(cl_float), (void *)&pf ));
}
else

View File

@@ -244,11 +244,12 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, const oclM
{
const oclMat zeroMask;
const oclMat &tempMask = mask.data ? mask : zeroMask;
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (query.cols <= 64)
{
matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
}
else if (query.cols <= 128)
else if (query.cols <= 128 && !is_cpu)
{
matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx, distance, distType);
}
@@ -263,11 +264,12 @@ static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, co
{
const oclMat zeroMask;
const oclMat &tempMask = mask.data ? mask : zeroMask;
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (query.cols <= 64)
{
matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
}
else if (query.cols <= 128)
else if (query.cols <= 128 && !is_cpu)
{
matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
}
@@ -283,11 +285,12 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, float maxD
{
const oclMat zeroMask;
const oclMat &tempMask = mask.data ? mask : zeroMask;
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (query.cols <= 64)
{
matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
}
else if (query.cols <= 128)
else if (query.cols <= 128 && !is_cpu)
{
matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
}
@@ -465,11 +468,12 @@ static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, con
static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
const oclMat &trainIdx, const oclMat &distance, int distType)
{
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (query.cols <= 64)
{
knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
}
else if (query.cols <= 128)
else if (query.cols <= 128 && !is_cpu)
{
knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType);
}

View File

@@ -239,7 +239,7 @@ void canny::calcSobelRowPass_gpu(const oclMat &src, oclMat &dx_buf, oclMat &dy_b
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat &dx, oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
@@ -269,12 +269,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
char build_options [15] = "";
if(L2Grad)
{
strcat(build_options, "-D L2GRAD");
}
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
const char * build_options = L2Grad ? "-D L2GRAD":"";
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
}
void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
{
@@ -297,12 +293,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, i
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
char build_options [15] = "";
if(L2Grad)
{
strcat(build_options, "-D L2GRAD");
}
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
const char * build_options = L2Grad ? "-D L2GRAD":"";
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
}
void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh)
@@ -333,7 +325,7 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro
String kernelName = "calcMap";
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols)
@@ -353,7 +345,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
@@ -383,7 +375,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
std::swap(st1, st2);
}
@@ -408,5 +400,5 @@ void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols)
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}

View File

@@ -356,8 +356,7 @@ static void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
char compile_option[128];
sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s",
anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
rectKernel?"-D RECTKERNEL":"",
s);
s, rectKernel?"-D RECTKERNEL":"");
std::vector< std::pair<size_t, const void *> > args;
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));

349
modules/ocl/src/gfft.cpp Normal file
View File

@@ -0,0 +1,349 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <iomanip>
#include "precomp.hpp"
using namespace cv;
using namespace cv::ocl;
static bool use_cpu_sorter = true;
namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *imgproc_gfft;
}
}
namespace
{
enum SortMethod
{
CPU_STL,
BITONIC,
SELECTION
};
const int GROUP_SIZE = 256;
template<SortMethod method>
struct Sorter
{
//typedef EigType;
};
//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
template<>
struct Sorter<CPU_STL>
{
typedef oclMat EigType;
static cv::Mutex cs;
static Mat mat_eig;
//prototype
static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
{
float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
return v1 > v2;
}
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
cv::AutoLock lock(cs);
//temporarily use STL's sort function
Mat mat_corners = corners;
mat_eig = eig_tex;
std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
corners = mat_corners;
}
};
cv::Mutex Sorter<CPU_STL>::cs;
cv::Mat Sorter<CPU_STL>::mat_eig;
template<>
struct Sorter<BITONIC>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count / 2, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
// 2^numStages should be equal to count or the output is invalid
int numStages = 0;
for(int i = count; i > 1; i >>= 1)
{
++numStages;
}
const int argc = 5;
std::vector< std::pair<size_t, const void *> > args(argc);
String kernelname = "sortCorners_bitonicSort";
args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
for(int stage = 0; stage < numStages; ++stage)
{
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
{
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
}
}
}
};
template<>
struct Sorter<SELECTION>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
std::vector< std::pair<size_t, const void *> > args;
//local
String kernelname = "sortCorners_selectionSortLocal";
int lds_size = GROUP_SIZE * sizeof(cl_float2);
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
args.push_back( std::make_pair( lds_size, (void*)NULL) );
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
//final
kernelname = "sortCorners_selectionSortFinal";
args.pop_back();
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
}
};
int findCorners_caller(
const TextureCL& eig,
const float threshold,
const oclMat& mask,
oclMat& corners,
const int max_count)
{
std::vector<int> k;
Context * cxt = Context::getContext();
std::vector< std::pair<size_t, const void*> > args;
String kernelname = "findCorners";
const int mask_strip = mask.step / mask.elemSize1();
oclMat g_counter(1, 1, CV_32SC1);
g_counter.setTo(0);
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eig ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&mask.data ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&corners.data ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&mask_strip));
args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
size_t globalThreads[3] = {eig.cols, eig.rows, 1};
size_t localThreads[3] = {16, 16, 1};
const char * opt = mask.empty() ? "" : "-D WITH_MASK";
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1, opt);
return std::min(Mat(g_counter).at<int>(0), max_count);
}
}//unnamed namespace
void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
{
CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
CV_DbgAssert(support_image2d());
ensureSizeIsEnough(image.size(), CV_32F, eig_);
if (useHarrisDetector)
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
else
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
double maxVal = 0;
minMax_buf(eig_, 0, &maxVal, oclMat(), minMaxbuf_);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
int total = findCorners_caller(
*eig_tex,
static_cast<float>(maxVal * qualityLevel),
mask,
tmpCorners_,
tmpCorners_.cols);
if (total == 0)
{
corners.release();
return;
}
if(use_cpu_sorter)
{
Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
}
else
{
//if total is power of 2
if(((total - 1) & (total)) == 0)
{
Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
else
{
Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
}
if (minDistance < 1)
{
corners = tmpCorners_(Rect(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1));
}
else
{
std::vector<Point2f> tmp(total);
downloadPoints(tmpCorners_, tmp);
std::vector<Point2f> tmp2;
tmp2.reserve(total);
const int cell_size = cvRound(minDistance);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
for (int i = 0; i < total; ++i)
{
Point2f p = tmp[i];
bool good = true;
int x_cell = static_cast<int>(p.x / cell_size);
int y_cell = static_cast<int>(p.y / cell_size);
int x1 = x_cell - 1;
int y1 = y_cell - 1;
int x2 = x_cell + 1;
int y2 = y_cell + 1;
// boundary check
x1 = std::max(0, x1);
y1 = std::max(0, y1);
x2 = std::min(grid_width - 1, x2);
y2 = std::min(grid_height - 1, y2);
for (int yy = y1; yy <= y2; yy++)
{
for (int xx = x1; xx <= x2; xx++)
{
std::vector<Point2f>& m = grid[yy * grid_width + xx];
if (!m.empty())
{
for(size_t j = 0; j < m.size(); j++)
{
float dx = p.x - m[j].x;
float dy = p.y - m[j].y;
if (dx * dx + dy * dy < minDistance * minDistance)
{
good = false;
goto break_out;
}
}
}
}
}
break_out:
if(good)
{
grid[y_cell * grid_width + x_cell].push_back(p);
tmp2.push_back(p);
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
break;
}
}
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
}
void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector<Point2f> &points_v)
{
CV_DbgAssert(points.type() == CV_32FC2);
points_v.resize(points.cols);
openCLSafeCall(clEnqueueReadBuffer(
*reinterpret_cast<cl_command_queue*>(getoclCommandQueue()),
reinterpret_cast<cl_mem>(points.data),
CL_TRUE,
0,
points.cols * sizeof(Point2f),
&points_v[0],
0,
NULL,
NULL));
}

View File

@@ -136,47 +136,22 @@ struct CvHidHaarClassifierCascade
};
typedef struct
{
//int rows;
//int ystep;
int width_height;
//int height;
int grpnumperline_totalgrp;
//int totalgrp;
int imgoff;
float factor;
} detect_piramid_info;
#if defined WIN32 && !defined __MINGW__ && !defined __MINGW32__
#ifdef WIN32
#define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT))
typedef _ALIGNED_ON(128) struct GpuHidHaarFeature
{
_ALIGNED_ON(32) struct
{
_ALIGNED_ON(4) int p0 ;
_ALIGNED_ON(4) int p1 ;
_ALIGNED_ON(4) int p2 ;
_ALIGNED_ON(4) int p3 ;
_ALIGNED_ON(4) float weight ;
}
/*_ALIGNED_ON(32)*/ rect[CV_HAAR_FEATURE_MAX] ;
}
GpuHidHaarFeature;
typedef _ALIGNED_ON(128) struct GpuHidHaarTreeNode
{
_ALIGNED_ON(64) int p[CV_HAAR_FEATURE_MAX][4];
//_ALIGNED_ON(16) int p1[CV_HAAR_FEATURE_MAX] ;
//_ALIGNED_ON(16) int p2[CV_HAAR_FEATURE_MAX] ;
//_ALIGNED_ON(16) int p3[CV_HAAR_FEATURE_MAX] ;
/*_ALIGNED_ON(16)*/
float weight[CV_HAAR_FEATURE_MAX] ;
/*_ALIGNED_ON(4)*/
float threshold ;
_ALIGNED_ON(8) float alpha[2] ;
_ALIGNED_ON(16) float alpha[3] ;
_ALIGNED_ON(4) int left ;
_ALIGNED_ON(4) int right ;
// GpuHidHaarFeature feature __attribute__((aligned (128)));
}
GpuHidHaarTreeNode;
@@ -184,7 +159,6 @@ GpuHidHaarTreeNode;
typedef _ALIGNED_ON(32) struct GpuHidHaarClassifier
{
_ALIGNED_ON(4) int count;
//CvHaarFeature* orig_feature;
_ALIGNED_ON(8) GpuHidHaarTreeNode *node ;
_ALIGNED_ON(8) float *alpha ;
}
@@ -219,32 +193,16 @@ typedef _ALIGNED_ON(64) struct GpuHidHaarClassifierCascade
_ALIGNED_ON(4) int p2 ;
_ALIGNED_ON(4) int p3 ;
_ALIGNED_ON(4) float inv_window_area ;
// GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
} GpuHidHaarClassifierCascade;
#else
#define _ALIGNED_ON(_ALIGNMENT) __attribute__((aligned(_ALIGNMENT) ))
typedef struct _ALIGNED_ON(128) GpuHidHaarFeature
{
struct _ALIGNED_ON(32)
{
int p0 _ALIGNED_ON(4);
int p1 _ALIGNED_ON(4);
int p2 _ALIGNED_ON(4);
int p3 _ALIGNED_ON(4);
float weight _ALIGNED_ON(4);
}
rect[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(32);
}
GpuHidHaarFeature;
typedef struct _ALIGNED_ON(128) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] _ALIGNED_ON(64);
float weight[CV_HAAR_FEATURE_MAX];// _ALIGNED_ON(16);
float threshold;// _ALIGNED_ON(4);
float alpha[2] _ALIGNED_ON(8);
float alpha[3] _ALIGNED_ON(16);
int left _ALIGNED_ON(4);
int right _ALIGNED_ON(4);
}
@@ -287,7 +245,6 @@ typedef struct _ALIGNED_ON(64) GpuHidHaarClassifierCascade
int p2 _ALIGNED_ON(4);
int p3 _ALIGNED_ON(4);
float inv_window_area _ALIGNED_ON(4);
// GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
} GpuHidHaarClassifierCascade;
#endif
@@ -295,36 +252,6 @@ const int icv_object_win_border = 1;
const float icv_stage_threshold_bias = 0.0001f;
double globaltime = 0;
// static CvHaarClassifierCascade * gpuCreateHaarClassifierCascade( int stage_count )
// {
// CvHaarClassifierCascade *cascade = 0;
// int block_size = sizeof(*cascade) + stage_count * sizeof(*cascade->stage_classifier);
// if( stage_count <= 0 )
// CV_Error( CV_StsOutOfRange, "Number of stages should be positive" );
// cascade = (CvHaarClassifierCascade *)cvAlloc( block_size );
// memset( cascade, 0, block_size );
// cascade->stage_classifier = (CvHaarStageClassifier *)(cascade + 1);
// cascade->flags = CV_HAAR_MAGIC_VAL;
// cascade->count = stage_count;
// return cascade;
// }
//static int globalcounter = 0;
// static void gpuReleaseHidHaarClassifierCascade( GpuHidHaarClassifierCascade **_cascade )
// {
// if( _cascade && *_cascade )
// {
// cvFree( _cascade );
// }
// }
/* create more efficient internal representation of haar classifier cascade */
static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarClassifierCascade *cascade, int *size, int *totalclassifier)
{
@@ -440,24 +367,12 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
hid_stage_classifier->two_rects = 1;
haar_classifier_ptr += stage_classifier->count;
/*
hid_stage_classifier->parent = (stage_classifier->parent == -1)
? NULL : stage_classifier_ptr + stage_classifier->parent;
hid_stage_classifier->next = (stage_classifier->next == -1)
? NULL : stage_classifier_ptr + stage_classifier->next;
hid_stage_classifier->child = (stage_classifier->child == -1)
? NULL : stage_classifier_ptr + stage_classifier->child;
out->is_tree |= hid_stage_classifier->next != NULL;
*/
for( j = 0; j < stage_classifier->count; j++ )
{
CvHaarClassifier *classifier = stage_classifier->classifier + j;
GpuHidHaarClassifier *hid_classifier = hid_stage_classifier->classifier + j;
int node_count = classifier->count;
// float* alpha_ptr = (float*)(haar_node_ptr + node_count);
float *alpha_ptr = &haar_node_ptr->alpha[0];
hid_classifier->count = node_count;
@@ -484,16 +399,12 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
node->p[2][3] = 0;
node->weight[2] = 0;
}
// memset( &(node->feature.rect[2]), 0, sizeof(node->feature.rect[2]) );
else
hid_stage_classifier->two_rects = 0;
memcpy( node->alpha, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
haar_node_ptr = haar_node_ptr + 1;
}
memcpy( alpha_ptr, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
haar_node_ptr = haar_node_ptr + 1;
// (GpuHidHaarTreeNode*)cvAlignPtr(alpha_ptr+node_count+1, sizeof(void*));
// (GpuHidHaarTreeNode*)(alpha_ptr+node_count+1);
out->is_stump_based &= node_count == 1;
}
}
@@ -506,25 +417,19 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
#define sum_elem_ptr(sum,row,col) \
((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
#define sqsum_elem_ptr(sqsum,row,col) \
((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
#define calc_sum(rect,offset) \
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_cascade,
/* const CvArr* _sum,
const CvArr* _sqsum,
const CvArr* _tilted_sum,*/
double scale,
int step)
{
// CvMat sum_stub, *sum = (CvMat*)_sum;
// CvMat sqsum_stub, *sqsum = (CvMat*)_sqsum;
// CvMat tilted_stub, *tilted = (CvMat*)_tilted_sum;
GpuHidHaarClassifierCascade *cascade;
int coi0 = 0, coi1 = 0;
int i;
@@ -540,61 +445,25 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
if( scale <= 0 )
CV_Error( CV_StsOutOfRange, "Scale must be positive" );
// sum = cvGetMat( sum, &sum_stub, &coi0 );
// sqsum = cvGetMat( sqsum, &sqsum_stub, &coi1 );
if( coi0 || coi1 )
CV_Error( CV_BadCOI, "COI is not supported" );
// if( !CV_ARE_SIZES_EQ( sum, sqsum ))
// CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
// if( CV_MAT_TYPE(sqsum->type) != CV_64FC1 ||
// CV_MAT_TYPE(sum->type) != CV_32SC1 )
// CV_Error( CV_StsUnsupportedFormat,
// "Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
if( !_cascade->hid_cascade )
gpuCreateHidHaarClassifierCascade(_cascade, &datasize, &total);
cascade = (GpuHidHaarClassifierCascade *) _cascade->hid_cascade;
stage_classifier = (GpuHidHaarStageClassifier *) (cascade + 1);
if( cascade->has_tilted_features )
{
// tilted = cvGetMat( tilted, &tilted_stub, &coi1 );
// if( CV_MAT_TYPE(tilted->type) != CV_32SC1 )
// CV_Error( CV_StsUnsupportedFormat,
// "Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
// if( sum->step != tilted->step )
// CV_Error( CV_StsUnmatchedSizes,
// "Sum and tilted_sum must have the same stride (step, widthStep)" );
// if( !CV_ARE_SIZES_EQ( sum, tilted ))
// CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
// cascade->tilted = *tilted;
}
_cascade->scale = scale;
_cascade->real_window_size.width = cvRound( _cascade->orig_window_size.width * scale );
_cascade->real_window_size.height = cvRound( _cascade->orig_window_size.height * scale );
//cascade->sum = *sum;
//cascade->sqsum = *sqsum;
equRect.x = equRect.y = cvRound(scale);
equRect.width = cvRound((_cascade->orig_window_size.width - 2) * scale);
equRect.height = cvRound((_cascade->orig_window_size.height - 2) * scale);
weight_scale = 1. / (equRect.width * equRect.height);
cascade->inv_window_area = weight_scale;
// cascade->pq0 = equRect.y * step + equRect.x;
// cascade->pq1 = equRect.y * step + equRect.x + equRect.width ;
// cascade->pq2 = (equRect.y + equRect.height)*step + equRect.x;
// cascade->pq3 = (equRect.y + equRect.height)*step + equRect.x + equRect.width ;
cascade->pq0 = equRect.x;
cascade->pq1 = equRect.y;
cascade->pq2 = equRect.x + equRect.width;
@@ -617,10 +486,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
{
CvHaarFeature *feature =
&_cascade->stage_classifier[i].classifier[j].haar_feature[l];
/* GpuHidHaarClassifier* classifier =
cascade->stage_classifier[i].classifier + j; */
//GpuHidHaarFeature* hidfeature =
// &cascade->stage_classifier[i].classifier[j].node[l].feature;
GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
double sum0 = 0, area0 = 0;
CvRect r[3];
@@ -635,8 +500,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
/* align blocks */
for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
{
//if( !hidfeature->rect[k].p0 )
// break;
if(!hidnode->p[k][0])
break;
r[k] = feature->rect[k].r;
@@ -716,15 +579,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
if( !feature->tilted )
{
/* hidfeature->rect[k].p0 = tr.y * sum->cols + tr.x;
hidfeature->rect[k].p1 = tr.y * sum->cols + tr.x + tr.width;
hidfeature->rect[k].p2 = (tr.y + tr.height) * sum->cols + tr.x;
hidfeature->rect[k].p3 = (tr.y + tr.height) * sum->cols + tr.x + tr.width;
*/
/*hidnode->p0[k] = tr.y * step + tr.x;
hidnode->p1[k] = tr.y * step + tr.x + tr.width;
hidnode->p2[k] = (tr.y + tr.height) * step + tr.x;
hidnode->p3[k] = (tr.y + tr.height) * step + tr.x + tr.width;*/
hidnode->p[k][0] = tr.x;
hidnode->p[k][1] = tr.y;
hidnode->p[k][2] = tr.x + tr.width;
@@ -732,37 +586,24 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
}
else
{
/* hidfeature->rect[k].p2 = (tr.y + tr.width) * tilted->cols + tr.x + tr.width;
hidfeature->rect[k].p3 = (tr.y + tr.width + tr.height) * tilted->cols + tr.x + tr.width - tr.height;
hidfeature->rect[k].p0 = tr.y * tilted->cols + tr.x;
hidfeature->rect[k].p1 = (tr.y + tr.height) * tilted->cols + tr.x - tr.height;
*/
hidnode->p[k][2] = (tr.y + tr.width) * step + tr.x + tr.width;
hidnode->p[k][3] = (tr.y + tr.width + tr.height) * step + tr.x + tr.width - tr.height;
hidnode->p[k][0] = tr.y * step + tr.x;
hidnode->p[k][1] = (tr.y + tr.height) * step + tr.x - tr.height;
}
//hidfeature->rect[k].weight = (float)(feature->rect[k].weight * correction_ratio);
hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
if( k == 0 )
area0 = tr.width * tr.height;
else
//sum0 += hidfeature->rect[k].weight * tr.width * tr.height;
sum0 += hidnode->weight[k] * tr.width * tr.height;
}
// hidfeature->rect[0].weight = (float)(-sum0/area0);
hidnode->weight[0] = (float)(-sum0 / area0);
} /* l */
} /* j */
}
}
static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
/*double scale=0.0,*/
/*int step*/)
static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade)
{
GpuHidHaarClassifierCascade *cascade;
int i;
@@ -816,11 +657,7 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
if(!hidnode->p[k][0])
break;
r[k] = feature->rect[k].r;
// base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].width-1) );
// base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].x - r[0].x-1) );
// base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].height-1) );
// base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].y - r[0].y-1) );
}
}
nr = k;
for( k = 0; k < nr; k++ )
@@ -838,7 +675,6 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
hidnode->p[k][3] = tr.height;
hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
}
//hidnode->weight[0]=(float)(-sum0/area0);
} /* l */
} /* j */
}
@@ -851,7 +687,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
const double GROUP_EPS = 0.2;
CvSeq *result_seq = 0;
cv::Ptr<CvMemStorage> temp_storage;
cv::ConcurrentRectVector allCandidates;
std::vector<cv::Rect> rectList;
@@ -909,6 +744,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if( gimg.cols < minSize.width || gimg.rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
if( (flags & CV_HAAR_SCALE_IMAGE) )
{
CvSize winSize0 = cascade->orig_window_size;
@@ -951,7 +787,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t blocksize = 8;
size_t localThreads[3] = { blocksize, blocksize , 1 };
size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0],
size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->computeUnits()) *localThreads[0],
localThreads[1], 1
};
int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -996,7 +832,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
@@ -1043,7 +878,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq ));
args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
@@ -1058,6 +895,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(nodebuffer));
openCLSafeCall(clReleaseMemObject(candidatebuffer));
}
else
{
@@ -1115,7 +953,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
nodenum * sizeof(GpuHidHaarTreeNode));
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
@@ -1157,7 +994,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args1.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnodenum ));
size_t globalThreads2[3] = {nodenum, 1, 1};
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
}
@@ -1193,7 +1029,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&nodenum ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
@@ -1281,7 +1117,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
int blocksize = 8;
int grp_per_CU = 12;
size_t localThreads[3] = { blocksize, blocksize, 1 };
size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0],
size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->computeUnits() *localThreads[0],
localThreads[1],
1 };
int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -1297,8 +1133,6 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
CvHaarClassifierCascade *cascade = oldCascade;
GpuHidHaarClassifierCascade *gcascade;
GpuHidHaarStageClassifier *stage;
GpuHidHaarClassifier *classifier;
GpuHidHaarTreeNode *node;
if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
@@ -1311,7 +1145,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
}
int *candidate;
cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
if( (flags & CV_HAAR_SCALE_IMAGE) )
{
int indexy = 0;
@@ -1337,19 +1171,6 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
sizeof(GpuHidHaarStageClassifier) * gcascade->count,
stage, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
m_nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
int startstage = 0;
int endstage = gcascade->count;
@@ -1386,17 +1207,23 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
memset(candidate, 0, 4 * sizeof(int) * outputsz);
openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
for(int i = 0; i < outputsz; i++)
{
if(candidate[4 * i + 2] != 0)
{
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
candidate[4 * i + 2], candidate[4 * i + 3]));
}
}
free((void *)candidate);
candidate = NULL;
}
@@ -1404,6 +1231,132 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
{
cv::ocl::integral(gimg, gsum, gsqsum);
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
int step = gsum.step / 4;
int startnode = 0;
int splitstage = 3;
int startstage = 0;
int endstage = gcascade->count;
vector<pair<size_t, const void *> > args;
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
for(int i = 0; i < outputsz; i++)
{
if(candidate[4 * i + 2] != 0)
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
candidate[4 * i + 2], candidate[4 * i + 3]));
}
clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
}
rectList.resize(allCandidates.size());
if(!allCandidates.empty())
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
if( minNeighbors != 0 || findBiggestObject )
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
else
rweights.resize(rectList.size(), 0);
GenResult(faces, rectList, rweights);
}
void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
double scaleFactor, int flags,
const int outputsz, const size_t localThreads[],
CvSize minSize, CvSize maxSize)
{
if(initialized)
{
return; // we only allow one time initialization
}
CvHaarClassifierCascade *cascade = oldCascade;
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
if( scaleFactor <= 1 )
CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
if( cols < minSize.width || rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
int datasize=0;
int totalclassifier=0;
if( !cascade->hid_cascade )
{
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
}
if( maxSize.height == 0 || maxSize.width == 0 )
{
maxSize.height = rows;
maxSize.width = cols;
}
findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
if( findBiggestObject )
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
m_scaleFactor = scaleFactor;
m_rows = rows;
m_cols = cols;
m_flags = flags;
m_minSize = minSize;
m_maxSize = maxSize;
// initialize nodes
GpuHidHaarClassifierCascade *gcascade;
GpuHidHaarStageClassifier *stage;
GpuHidHaarClassifier *classifier;
GpuHidHaarTreeNode *node;
cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
if( (flags & CV_HAAR_SCALE_IMAGE) )
{
gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
sizeof(GpuHidHaarStageClassifier) * gcascade->count,
stage, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
m_nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
}
else
{
gpuSetHaarClassifierCascade(cascade);
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
@@ -1411,15 +1364,12 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
m_nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
m_nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount);
float *correction = (float *)malloc(sizeof(float) * m_loopcount);
int startstage = 0;
int endstage = gcascade->count;
double factor;
for(int i = 0; i < m_loopcount; i++)
{
@@ -1445,105 +1395,15 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
size_t globalThreads2[3] = {m_nodenum, 1, 1};
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
openCLExecuteKernel(Context::getContext(), &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
}
int step = gsum.step / 4;
int startnode = 0;
int splitstage = 3;
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL));
vector<pair<size_t, const void *> > args;
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
for(int i = 0; i < outputsz; i++)
{
if(candidate[4 * i + 2] != 0)
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
candidate[4 * i + 2], candidate[4 * i + 3]));
}
free(p);
free(correction);
clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
}
rectList.resize(allCandidates.size());
if(!allCandidates.empty())
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
if( minNeighbors != 0 || findBiggestObject )
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
else
rweights.resize(rectList.size(), 0);
GenResult(faces, rectList, rweights);
}
void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
double scaleFactor, int flags,
const int outputsz, const size_t localThreads[],
CvSize minSize, CvSize maxSize)
{
CvHaarClassifierCascade *cascade = oldCascade;
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
if( scaleFactor <= 1 )
CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
if( cols < minSize.width || rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
int datasize=0;
int totalclassifier=0;
if( !cascade->hid_cascade )
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
if( maxSize.height == 0 || maxSize.width == 0 )
{
maxSize.height = rows;
maxSize.width = cols;
}
findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
if( findBiggestObject )
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
m_scaleFactor = scaleFactor;
m_rows = rows;
m_cols = cols;
m_flags = flags;
m_minSize = minSize;
m_maxSize = maxSize;
initialized = true;
}
@@ -1642,6 +1502,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
CvSize sz;
CvSize winSize0 = oldCascade->orig_window_size;
detect_piramid_info *scaleinfo;
cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
if (flags & CV_HAAR_SCALE_IMAGE)
{
for(factor = 1.f;; factor *= scaleFactor)
@@ -1743,7 +1604,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
}
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
sizeof(detect_piramid_info)*loopcount,
scaleinfo, 0, NULL, NULL));
free(scaleinfo);
@@ -1755,7 +1616,8 @@ void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& f
const std::vector<cv::Rect> &rectList,
const std::vector<int> &rweights)
{
CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) );
MemStorage tempStorage(cvCreateMemStorage(0));
CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), tempStorage );
if( findBiggestObject && rectList.size() )
{
@@ -1791,168 +1653,32 @@ void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& f
void cv::ocl::OclCascadeClassifierBuf::release()
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
if( (m_flags & CV_HAAR_SCALE_IMAGE) )
if(initialized)
{
cvFree(&oldCascade->hid_cascade);
}
else
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
free(buffers);
buffers = NULL;
if( (m_flags & CV_HAAR_SCALE_IMAGE) )
{
cvFree(&oldCascade->hid_cascade);
}
else
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
free(buffers);
buffers = NULL;
initialized = false;
}
}
#ifndef _MAX_PATH
#define _MAX_PATH 1024
#endif
/****************************************************************************************\
* Persistence functions *
\****************************************************************************************/
/* field names */
#define ICV_HAAR_SIZE_NAME "size"
#define ICV_HAAR_STAGES_NAME "stages"
#define ICV_HAAR_TREES_NAME "trees"
#define ICV_HAAR_FEATURE_NAME "feature"
#define ICV_HAAR_RECTS_NAME "rects"
#define ICV_HAAR_TILTED_NAME "tilted"
#define ICV_HAAR_THRESHOLD_NAME "threshold"
#define ICV_HAAR_LEFT_NODE_NAME "left_node"
#define ICV_HAAR_LEFT_VAL_NAME "left_val"
#define ICV_HAAR_RIGHT_NODE_NAME "right_node"
#define ICV_HAAR_RIGHT_VAL_NAME "right_val"
#define ICV_HAAR_STAGE_THRESHOLD_NAME "stage_threshold"
#define ICV_HAAR_PARENT_NAME "parent"
#define ICV_HAAR_NEXT_NAME "next"
static int gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade, CvPoint pt, int start_stage */)
{
return 1;
}
namespace cv
{
namespace ocl
{
struct gpuHaarDetectObjects_ScaleImage_Invoker
{
gpuHaarDetectObjects_ScaleImage_Invoker( const CvHaarClassifierCascade *_cascade,
int _stripSize, double _factor,
const Mat &_sum1, const Mat &_sqsum1, Mat *_norm1,
Mat *_mask1, Rect _equRect, ConcurrentRectVector &_vec )
{
cascade = _cascade;
stripSize = _stripSize;
factor = _factor;
sum1 = _sum1;
sqsum1 = _sqsum1;
norm1 = _norm1;
mask1 = _mask1;
equRect = _equRect;
vec = &_vec;
}
void operator()( const BlockedRange &range ) const
{
Size winSize0 = cascade->orig_window_size;
Size winSize(cvRound(winSize0.width * factor), cvRound(winSize0.height * factor));
int y1 = range.begin() * stripSize, y2 = std::min(range.end() * stripSize, sum1.rows - 1 - winSize0.height);
Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1);
int x, y, ystep = factor > 2 ? 1 : 2;
for( y = y1; y < y2; y += ystep )
for( x = 0; x < ssz.width; x += ystep )
{
if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 )
vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor),
winSize.width, winSize.height));
}
}
const CvHaarClassifierCascade *cascade;
int stripSize;
double factor;
Mat sum1, sqsum1, *norm1, *mask1;
Rect equRect;
ConcurrentRectVector *vec;
};
struct gpuHaarDetectObjects_ScaleCascade_Invoker
{
gpuHaarDetectObjects_ScaleCascade_Invoker( const CvHaarClassifierCascade *_cascade,
Size _winsize, const Range &_xrange, double _ystep,
size_t _sumstep, const int **_p, const int **_pq,
ConcurrentRectVector &_vec )
{
cascade = _cascade;
winsize = _winsize;
xrange = _xrange;
ystep = _ystep;
sumstep = _sumstep;
p = _p;
pq = _pq;
vec = &_vec;
}
void operator()( const BlockedRange &range ) const
{
int iy, startY = range.begin(), endY = range.end();
const int *p0 = p[0], *p1 = p[1], *p2 = p[2], *p3 = p[3];
const int *pq0 = pq[0], *pq1 = pq[1], *pq2 = pq[2], *pq3 = pq[3];
bool doCannyPruning = p0 != 0;
int sstep = (int)(sumstep / sizeof(p0[0]));
for( iy = startY; iy < endY; iy++ )
{
int ix, y = cvRound(iy * ystep), ixstep = 1;
for( ix = xrange.start; ix < xrange.end; ix += ixstep )
{
int x = cvRound(ix * ystep); // it should really be ystep, not ixstep
if( doCannyPruning )
{
int offset = y * sstep + x;
int s = p0[offset] - p1[offset] - p2[offset] + p3[offset];
int sq = pq0[offset] - pq1[offset] - pq2[offset] + pq3[offset];
if( s < 100 || sq < 20 )
{
ixstep = 2;
continue;
}
}
int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */);
if( result > 0 )
vec->push_back(Rect(x, y, winsize.width, winsize.height));
ixstep = result != 0 ? 1 : 2;
}
}
}
const CvHaarClassifierCascade *cascade;
double ystep;
size_t sumstep;
Size winsize;
Range xrange;
const int **p;
const int **pq;
ConcurrentRectVector *vec;
};
}
}
#endif

View File

@@ -270,7 +270,7 @@ namespace cv
size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
std::vector< std::pair<size_t, const void *> > args;
if(map1.channels() == 2)
{
@@ -292,7 +292,7 @@ namespace cv
args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
{
args.push_back( std::make_pair(sizeof(cl_double4), (void *)&borderValue));
}
@@ -326,7 +326,6 @@ namespace cv
}
else
{
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
args.push_back( std::make_pair(sizeof(cl_float4), (void *)&borderFloat));
}
}
@@ -1210,30 +1209,41 @@ namespace cv
void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
double k, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(Error::GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
oclMat Dx, Dy;
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), Dx, Dy, dst, borderType);
oclMat dx, dy;
cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
}
void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
double k, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(Error::GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
oclMat Dx, Dy;
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
extractCovData(src, dx, dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, Dx, Dy, dst, borderType);
corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
}
void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
{
oclMat dx, dy;
cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
}
void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(Error::GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, dx, dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
}
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)

View File

@@ -43,9 +43,28 @@
//
//M*/
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#include "precomp.hpp"
#ifdef __GNUC__
#if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402
#define GCC_DIAG_STR(s) #s
#define GCC_DIAG_JOINSTR(x,y) GCC_DIAG_STR(x ## y)
# define GCC_DIAG_DO_PRAGMA(x) _Pragma (#x)
# define GCC_DIAG_PRAGMA(x) GCC_DIAG_DO_PRAGMA(GCC diagnostic x)
# if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 406
# define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(push) \
GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
# define GCC_DIAG_ON(x) GCC_DIAG_PRAGMA(pop)
# else
# define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
# define GCC_DIAG_ON(x) GCC_DIAG_PRAGMA(warning GCC_DIAG_JOINSTR(-W,x))
# endif
#else
# define GCC_DIAG_OFF(x)
# define GCC_DIAG_ON(x)
#endif
#endif /* __GNUC__ */
using namespace std;
namespace cv
@@ -121,6 +140,9 @@ namespace cv
build_options, finish_mode);
}
#ifdef __GNUC__
GCC_DIAG_OFF(deprecated-declarations)
#endif
cl_mem bindTexture(const oclMat &mat)
{
cl_mem texture;
@@ -156,7 +178,7 @@ namespace cv
format.image_channel_order = CL_RGBA;
break;
default:
CV_Error(-1, "Image forma is not supported");
CV_Error(-1, "Image format is not supported");
break;
}
#ifdef CL_VERSION_1_2
@@ -180,10 +202,6 @@ namespace cv
else
#endif
{
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
texture = clCreateImage2D(
(cl_context)mat.clCxt->oclContext(),
CL_MEM_READ_WRITE,
@@ -193,9 +211,6 @@ namespace cv
0,
NULL,
&err);
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
}
size_t origin[] = { 0, 0, 0 };
size_t region[] = { mat.cols, mat.rows, 1 };
@@ -225,6 +240,14 @@ namespace cv
openCLSafeCall(err);
return texture;
}
#ifdef __GNUC__
GCC_DIAG_ON(deprecated-declarations)
#endif
Ptr<TextureCL> bindTexturePtr(const oclMat &mat)
{
return Ptr<TextureCL>(new TextureCL(bindTexture(mat), mat.rows, mat.cols, mat.type()));
}
void releaseTexture(cl_mem& texture)
{
openCLFree(texture);

View File

@@ -127,7 +127,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
#define dst_align ((dst_offset / 2) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -165,7 +165,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
#define dst_align ((dst_offset / 2) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -335,7 +335,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -375,7 +375,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -507,7 +507,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

View File

@@ -126,7 +126,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -164,7 +164,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -288,7 +288,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

View File

@@ -120,7 +120,7 @@ __kernel void morph_C1_D0(__global const uchar * restrict src,
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3)==0)
if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
{
*(__global uchar4*)&dst[out_addr] = res;
}

View File

@@ -10,6 +10,7 @@
// Wang Weiyan, wangweiyanster@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
@@ -45,27 +46,16 @@
typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
#ifndef STUMP_BASED
#define STUMP_BASED 1
#endif
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
float weight[CV_HAAR_FEATURE_MAX];
float threshold;
float alpha[3] __attribute__((aligned (16)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
@@ -111,7 +101,6 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
float inv_window_area __attribute__((aligned (4)));
} GpuHidHaarClassifierCascade;
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
@@ -234,7 +223,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
for(int nodeloop = 0; nodeloop < stageinfo.x; )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
@@ -242,7 +231,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=lcl_off;
@@ -261,8 +251,34 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += passThres ? alpha3.z : alpha3.y;
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (stage_sum >= stagethreshold);
@@ -301,18 +317,20 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
if(lcl_compute_win_id < queuecount)
{
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
const int stump_factor = STUMP_BASED ? 1 : 2;
int root_offset = 0;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
__global GpuHidHaarTreeNode* currentnodeptr =
nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
@@ -332,8 +350,34 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter +=lcl_compute_win;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
part_sum += passThres ? alpha3.y : alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
#else
if(root_offset == 0)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
root_offset = 1;
}
else
{
part_sum += alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
}
}
else
{
part_sum += passThres ? alpha3.z : alpha3.y;
tempnodecounter += lcl_compute_win;
lcl_loop++;
root_offset = 0;
}
#endif
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
@@ -377,155 +421,3 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}
/*
if(stagecascade->two_rects)
{
#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t1 = *(node + counter);
t = t1.threshold * variance_norm_factor;
classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
classsum += calc_sum1(t1, p_offset,1) * t1.weight[1];
stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
counter++;
}
}
else
{
#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t = node[counter].threshold*variance_norm_factor;
classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
if( node[counter].p0[2] )
classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
counter++;
}
}
*/
/*
__kernel void gpuRunHaarClassifierCascade_ScaleWindow(
constant GpuHidHaarClassifierCascade * _cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
//global GpuHidHaarClassifier * classifierptr,
global GpuHidHaarTreeNode * nodeptr,
global int * sum,
global float * sqsum,
global int * _candidate,
int pixel_step,
int cols,
int rows,
int start_stage,
int end_stage,
//int counts,
int nodenum,
int ystep,
int detect_width,
//int detect_height,
int loopcount,
int outputstep)
//float scalefactor)
{
unsigned int x1 = get_global_id(0);
unsigned int y1 = get_global_id(1);
int p_offset;
int m, n;
int result;
int counter;
float mean, variance_norm_factor;
for(int i=0;i<loopcount;i++)
{
constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
global int * candidate = _candidate + i*outputstep;
int window_width = cascade->p1 - cascade->p0;
int window_height = window_width;
result = 1;
counter = 0;
unsigned int x = mul24(x1,ystep);
unsigned int y = mul24(y1,ystep);
if((x < cols - window_width - 1) && (y < rows - window_height -1))
{
global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
//global GpuHidHaarClassifier *classifier = classifierptr;
global GpuHidHaarTreeNode *node = nodeptr + nodenum*i;
p_offset = mad24(y, pixel_step, x);// modify
mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
*(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
*cascade->inv_window_area;
variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
*(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
// if( cascade->is_stump_based )
//{
for( m = start_stage; m < end_stage; m++ )
{
float stage_sum = 0.f;
float t, classsum;
GpuHidHaarTreeNode t1;
//#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t1 = *(node + counter);
t = t1.threshold * variance_norm_factor;
classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1];
if((t1.p0[2]) && (!stagecascade->two_rects))
classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2];
stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
counter++;
}
if (stage_sum < stagecascade->threshold)
{
result = 0;
break;
}
stagecascade++;
}
if(result)
{
candidate[4 * (y1 * detect_width + x1)] = x;
candidate[4 * (y1 * detect_width + x1) + 1] = y;
candidate[4 * (y1 * detect_width + x1)+2] = window_width;
candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
}
//}
}
}
}
*/

View File

@@ -17,7 +17,7 @@
// @Authors
// Wu Xinglong, wxl370@126.com
// Sen Liu, swjtuls1987@126.com
//
// Peng Xiao, pengxiao@outlook.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
@@ -49,25 +49,13 @@
#define CV_HAAR_FEATURE_MAX 3
typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned(128))) GpuHidHaarFeature
{
struct __attribute__((aligned(32)))
{
int p0 __attribute__((aligned(4)));
int p1 __attribute__((aligned(4)));
int p2 __attribute__((aligned(4)));
int p3 __attribute__((aligned(4)));
float weight __attribute__((aligned(4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned(8)));
float alpha[3] __attribute__((aligned(16)));
int left __attribute__((aligned(4)));
int right __attribute__((aligned(4)));
}
@@ -174,45 +162,83 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
const int p_offset = mad24(y, step, x);
cascadeinfo.x += p_offset;
cascadeinfo.z += p_offset;
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
* correction_t;
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
bool result = true;
nodecounter = startnode + nodecount * scalei;
for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
{
float stage_sum = 0.f;
int stagecount = stagecascadeptr[stageloop].count;
for (int nodeloop = 0; nodeloop < stagecount; nodeloop++)
for (int nodeloop = 0; nodeloop < stagecount;)
{
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x += p_offset;
info1.z += p_offset;
info2.x += p_offset;
info2.z += p_offset;
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
info3.x += p_offset;
info3.z += p_offset;
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
- sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
+ sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
- sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
+ sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
- sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
+ sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += (passThres ? alpha3.z : alpha3.y);
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold);
result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -222,7 +248,6 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex] = (y << 16) | x;
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
@@ -277,5 +302,6 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
newnode[counter].threshold = t1.threshold;
newnode[counter].alpha[0] = t1.alpha[0];
newnode[counter].alpha[1] = t1.alpha[1];
newnode[counter].alpha[2] = t1.alpha[2];
}

View File

@@ -297,6 +297,9 @@ calcMap
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
mag += mag_offset;
map += map_offset;
__local float smem[18][18];
int gidx = get_global_id(0);
@@ -389,7 +392,7 @@ edgesHysteresisLocal
(
__global int * map,
__global ushort2 * st,
volatile __global unsigned int * counter,
__global unsigned int * counter,
int rows,
int cols,
int map_step,
@@ -399,6 +402,8 @@ edgesHysteresisLocal
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
map += map_offset;
__local int smem[18][18];
int gidx = get_global_id(0);
@@ -416,12 +421,12 @@ edgesHysteresisLocal
if(ly < 14)
{
smem[ly][lx] =
map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step + map_offset];
map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
}
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
{
smem[ly + 14][lx] =
map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step + map_offset];
map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -482,14 +487,17 @@ edgesHysteresisLocal
__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
#define stack_size 512
__kernel
void edgesHysteresisGlobal
void
__attribute__((reqd_work_group_size(128,1,1)))
edgesHysteresisGlobal
(
__global int * map,
__global ushort2 * st1,
__global ushort2 * st2,
volatile __global int * counter,
__global int * counter,
int rows,
int cols,
int count,
@@ -501,6 +509,8 @@ void edgesHysteresisGlobal
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
map += map_offset;
int gidx = get_global_id(0);
int gidy = get_global_id(1);
@@ -510,7 +520,7 @@ void edgesHysteresisGlobal
int grp_idx = get_group_id(0);
int grp_idy = get_group_id(1);
volatile __local unsigned int s_counter;
__local unsigned int s_counter;
__local unsigned int s_ind;
__local ushort2 s_st[stack_size];
@@ -564,9 +574,9 @@ void edgesHysteresisGlobal
pos.x += c_dx[lidx & 7];
pos.y += c_dy[lidx & 7];
if (map[pos.x + map_offset + pos.y * map_step] == 1)
if (map[pos.x + pos.y * map_step] == 1)
{
map[pos.x + map_offset + pos.y * map_step] = 2;
map[pos.x + pos.y * map_step] = 2;
ind = atomic_inc(&s_counter);
@@ -621,6 +631,6 @@ void getEdges
if(gidy < rows && gidx < cols)
{
dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] >> 1));
dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
}
}

View File

@@ -0,0 +1,276 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef WITH_MASK
#define WITH_MASK 0
#endif
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
{
return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
}
inline float ELEM_FLT2(image2d_t _eig, float2 pt)
{
return read_imagef(_eig, sampler, pt).x;
}
__kernel
void findCorners
(
image2d_t eig,
__global const char * mask,
__global float2 * corners,
const int mask_strip,// in pixels
const float threshold,
const int rows,
const int cols,
const int max_count,
__global int * g_counter
)
{
const int j = get_global_id(0);
const int i = get_global_id(1);
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1
#if WITH_MASK
&& mask[i * mask_strip + j] != 0
#endif
)
{
const float val = ELEM_INT2(eig, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = atomic_inc(g_counter);
if (ind < max_count)
corners[ind] = (float2)(j, i);
}
}
}
}
//bitonic sort
__kernel
void sortCorners_bitonicSort
(
image2d_t eig,
__global float2 * corners,
const int count,
const int stage,
const int passOfStage
)
{
const int threadId = get_global_id(0);
if(threadId >= count / 2)
{
return;
}
const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
const int pairDistance = 1 << (stage - passOfStage);
const int blockWidth = 2 * pairDistance;
const int leftId = min( (threadId % pairDistance)
+ (threadId / pairDistance) * blockWidth, count );
const int rightId = min( leftId + pairDistance, count );
const float2 leftPt = corners[leftId];
const float2 rightPt = corners[rightId];
const float leftVal = ELEM_FLT2(eig, leftPt);
const float rightVal = ELEM_FLT2(eig, rightPt);
const bool compareResult = leftVal > rightVal;
float2 greater = compareResult ? leftPt:rightPt;
float2 lesser = compareResult ? rightPt:leftPt;
corners[leftId] = sortOrder ? lesser : greater;
corners[rightId] = sortOrder ? greater : lesser;
}
//selection sort for gfft
//kernel is ported from Bolt library:
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
// Local sort will firstly sort elements of each workgroup using selection sort
// its performance is O(n)
__kernel
void sortCorners_selectionSortLocal
(
image2d_t eig,
__global float2 * corners,
const int count,
__local float2 * scratch
)
{
int i = get_local_id(0); // index in workgroup
int numOfGroups = get_num_groups(0); // index in workgroup
int groupID = get_group_id(0);
int wg = get_local_size(0); // workgroup size = block size
int n; // number of elements to be processed for this work group
int offset = groupID * wg;
int same = 0;
corners += offset;
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
float2 pt1, pt2;
pt1 = corners[min(i, n)];
scratch[i] = pt1;
barrier(CLK_LOCAL_MEM_FENCE);
if(i >= n)
{
return;
}
float val1 = ELEM_FLT2(eig, pt1);
float val2;
int pos = 0;
for (int j=0;j<n;++j)
{
pt2 = scratch[j];
val2 = ELEM_FLT2(eig, pt2);
if(val2 > val1)
pos++;//calculate the rank of this element in this work group
else
{
if(val1 > val2)
continue;
else
{
// val1 and val2 are same
same++;
}
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}
__kernel
void sortCorners_selectionSortFinal
(
image2d_t eig,
__global float2 * corners,
const int count
)
{
const int i = get_local_id(0); // index in workgroup
const int numOfGroups = get_num_groups(0); // index in workgroup
const int groupID = get_group_id(0);
const int wg = get_local_size(0); // workgroup size = block size
int pos = 0, same = 0;
const int offset = get_group_id(0) * wg;
const int remainder = count - wg*(numOfGroups-1);
if((offset + i ) >= count)
return;
float2 pt1, pt2;
pt1 = corners[groupID*wg + i];
float val1 = ELEM_FLT2(eig, pt1);
float val2;
for(int j=0; j<numOfGroups-1; j++ )
{
for(int k=0; k<wg; k++)
{
pt2 = corners[j*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Increment only if the value is not the same.
if( val2 > val1 )
pos++;
else
same++;
}
}
}
for(int k=0; k<remainder; k++)
{
pt2 = corners[(numOfGroups-1)*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Don't increment if the value is the same.
//Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false
if(val2 > val1)
pos++;
else
same++;
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}

View File

@@ -143,7 +143,7 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != 0 ? ddata : dVal;
ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;

View File

@@ -46,145 +46,10 @@
//#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (y < rows && x < cols * cn)
{
const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
const uchar src_val1 = (src + y * srcStep)[x];
const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
}
__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (y < rows && x < cols * cn)
{
const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
const uchar src_val1 = (src + y * srcStep)[x];
const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
}
__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int colsn = cols * cn;
if (y < rows && x < colsn)
{
__global const short* dx_buf_row = dx_buf + y * dx_bufStep;
__global const short* dy_buf_row = dy_buf + y * dy_bufStep;
const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
const int xl = x - cn >= 0 ? x - cn : cn + x;
((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
}
}
__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int colsn = cols * cn;
if (y < rows && x < colsn)
{
__global const short* dx_buf_row = dx_buf + y * dx_bufStep;
__global const short* dy_buf_row = dy_buf + y * dy_bufStep;
const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
const int xl = x - cn >= 0 ? x - cn : cn + x;
((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
}
}
#define W_BITS 14
#define W_BITS1 14
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y)
{
int2 ipt;
ipt.x = convert_int_sat_rtn(pt.x);
ipt.y = convert_int_sat_rtn(pt.y);
float a = pt.x - ipt.x;
float b = pt.y - ipt.y;
int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
__global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
__global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5);
}
int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y)
{
int2 ipt;
ipt.x = convert_int_sat_rtn(pt.x);
ipt.y = convert_int_sat_rtn(pt.y);
float a = pt.x - ipt.x;
float b = pt.y - ipt.y;
int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
__global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
__global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1);
}
float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y)
{
int2 ipt;
ipt.x = convert_int_sat_rtn(pt.x);
ipt.y = convert_int_sat_rtn(pt.y);
float a = pt.x - ipt.x;
float b = pt.y - ipt.y;
float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS));
float iw01 = (a * (1.0f - b) * (1 << W_BITS));
float iw10 = ((1.0f - a) * b * (1 << W_BITS));
float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
__global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn;
__global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn;
return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5;
}
#define BUFFER 64
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
#ifdef CPU
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
{
@@ -193,71 +58,51 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
smem3[tid] = val3 += smem3[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = val1 += smem1[tid + 32];
smem2[tid] = val2 += smem2[tid + 32];
smem3[tid] = val3 += smem3[tid + 32];
smem1[tid] += smem1[tid + 32];
smem2[tid] += smem2[tid + 32];
smem3[tid] += smem3[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = val1 += smem1[tid + 16];
smem2[tid] = val2 += smem2[tid + 16];
smem3[tid] = val3 += smem3[tid + 16];
smem1[tid] += smem1[tid + 16];
smem2[tid] += smem2[tid + 16];
smem3[tid] += smem3[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = val1 += smem1[tid + 8];
smem2[tid] = val2 += smem2[tid + 8];
smem3[tid] = val3 += smem3[tid + 8];
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
smem3[tid] += smem3[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = val1 += smem1[tid + 4];
smem2[tid] = val2 += smem2[tid + 4];
smem3[tid] = val3 += smem3[tid + 4];
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
smem3[tid] += smem3[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = val1 += smem1[tid + 2];
smem2[tid] = val2 += smem2[tid + 2];
smem3[tid] = val3 += smem3[tid + 2];
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
smem3[tid] += smem3[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = val1 += smem1[tid + 1];
smem2[BUFFER] = val2 += smem2[tid + 1];
smem3[BUFFER] = val3 += smem3[tid + 1];
smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -268,63 +113,45 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = (val1 += smem1[tid + 128]);
smem2[tid] = (val2 += smem2[tid + 128]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = (val1 += smem1[tid + 64]);
smem2[tid] = (val2 += smem2[tid + 64]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = (val1 += smem1[tid + 32]);
smem2[tid] = (val2 += smem2[tid + 32]);
smem1[tid] += smem1[tid + 32];
smem2[tid] += smem2[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = (val1 += smem1[tid + 16]);
smem2[tid] = (val2 += smem2[tid + 16]);
smem1[tid] += smem1[tid + 16];
smem2[tid] += smem2[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = (val1 += smem1[tid + 8]);
smem2[tid] = (val2 += smem2[tid + 8]);
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = (val1 += smem1[tid + 4]);
smem2[tid] = (val2 += smem2[tid + 4]);
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = (val1 += smem1[tid + 2]);
smem2[tid] = (val2 += smem2[tid + 2]);
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = (val1 += smem1[tid + 1]);
smem2[BUFFER] = (val2 += smem2[tid + 1]);
smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -334,205 +161,146 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = (val1 += smem1[tid + 128]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = (val1 += smem1[tid + 64]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = (val1 += smem1[tid + 32]);
smem1[tid] += smem1[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = (val1 += smem1[tid + 16]);
smem1[tid] += smem1[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = (val1 += smem1[tid + 8]);
smem1[tid] += smem1[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = (val1 += smem1[tid + 4]);
smem1[tid] += smem1[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = (val1 += smem1[tid + 2]);
smem1[tid] += smem1[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = (val1 += smem1[tid + 1]);
smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#else
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
void reduce3(float val1, float val2, float val3,
__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
smem3[tid] = val3 += smem3[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
volatile __local float* vmem1 = smem1;
volatile __local float* vmem2 = smem2;
volatile __local float* vmem3 = smem3;
smem1[tid] += smem1[tid + 32];
smem2[tid] += smem2[tid + 32];
smem3[tid] += smem3[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem1[tid] += smem1[tid + 16];
smem2[tid] += smem2[tid + 16];
smem3[tid] += smem3[tid + 16];
#if WAVE_SIZE <16
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) {
#endif
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
smem3[tid] += smem3[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem3[tid] = val3 += vmem3[tid + 32];
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
smem3[tid] += smem3[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem3[tid] = val3 += vmem3[tid + 16];
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
smem3[tid] += smem3[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem3[tid] = val3 += vmem3[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem3[tid] = val3 += vmem3[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem3[tid] = val3 += vmem3[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem3[tid] = val3 += vmem3[tid + 1];
smem1[tid] += smem1[tid + 1];
smem2[tid] += smem2[tid + 1];
smem3[tid] += smem3[tid + 1];
}
}
void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)
void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
volatile __local float* vmem1 = smem1;
volatile __local float* vmem2 = smem2;
smem1[tid] += smem1[tid + 32];
smem2[tid] += smem2[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem1[tid] += smem1[tid + 16];
smem2[tid] += smem2[tid + 16];
#if WAVE_SIZE <16
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) {
#endif
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
smem1[tid] += smem1[tid + 1];
smem2[tid] += smem2[tid + 1];
}
}
void reduce1(float val1, __local float* smem1, int tid)
void reduce1(float val1, __local volatile float* smem1, int tid)
{
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
volatile __local float* vmem1 = smem1;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
smem1[tid] += smem1[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem1[tid] += smem1[tid + 16];
#if WAVE_SIZE <16
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) {
#endif
smem1[tid] += smem1[tid + 8];
smem1[tid] += smem1[tid + 4];
smem1[tid] += smem1[tid + 2];
smem1[tid] += smem1[tid + 1];
}
}
#endif
#define SCALE (1.0f / (1 << 20))
#define THRESHOLD 0.01f
#define DIMENSION 21
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

View File

@@ -61,6 +61,8 @@
#include <exception>
#include <stdio.h>
#undef OPENCV_NOSTL
#include "opencv2/imgproc.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/ocl.hpp"
@@ -74,6 +76,7 @@
#if defined (HAVE_OPENCL)
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#include "opencv2/ocl/private/util.hpp"
#include "safe_call.hpp"

View File

@@ -15,8 +15,8 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Dachuan Zhao, dachuan@multicorewareinc.com
// Yao Wang, yao@multicorewareinc.com
// Dachuan Zhao, dachuan@multicorewareinc.com
// Yao Wang, yao@multicorewareinc.com
// Nathan, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
@@ -54,35 +54,20 @@ namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *pyrlk;
extern const char *pyrlk_no_image;
extern const char *operator_setTo;
extern const char *operator_convertTo;
extern const char *operator_copyToM;
extern const char *arithm_mul;
extern const char *pyr_down;
}
}
struct dim3
{
unsigned int x, y, z;
};
struct float2
{
float x, y;
};
struct int2
{
int x, y;
};
namespace
{
void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
static void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
{
winSize.width *= cn;
@@ -102,12 +87,6 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe
block.z = patch.z = 1;
}
}
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
@@ -448,89 +427,6 @@ static void copyTo(const oclMat &src, oclMat &m )
src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset);
}
// static void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask)
// {
// if (mask.empty())
// {
// copyTo(src, mat);
// }
// else
// {
// mat.create(src.size(), src.type());
// copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask");
// }
// }
static void arithmetic_run(const oclMat &src1, oclMat &dst, String kernelName, const char **kernelString, void *_scalar)
{
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{
CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
return;
}
//dst.create(src1.size(), src1.type());
//CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
// src1.rows == src2.rows && src2.rows == dst.rows);
CV_Assert(src1.cols == dst.cols &&
src1.rows == dst.rows);
CV_Assert(src1.type() == dst.type());
CV_Assert(src1.depth() != CV_8S);
Context *clCxt = src1.clCxt;
//int channels = dst.channels();
//int depth = dst.depth();
//int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1}
//};
//size_t vector_length = vector_lengths[channels-1][depth];
//int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
//int cols = divUp(dst.cols * channels + offset_cols, vector_length);
size_t localThreads[3] = { 16, 16, 1 };
//size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
// divUp(dst.rows, localThreads[1]) * localThreads[1],
// 1
// };
size_t globalThreads[3] = { src1.cols,
src1.rows,
1
};
int dst_step1 = dst.cols * dst.elemSize();
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
//args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
//args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
//if(_scalar != NULL)
//{
float scalar1 = *((float *)_scalar);
args.push_back( std::make_pair( sizeof(float), (float *)&scalar1 ));
//}
openCLExecuteKernel2(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, src1.depth(), CLFLUSH);
}
static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar)
{
arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar));
}
static void pyrdown_run_cus(const oclMat &src, const oclMat &dst)
{
@@ -576,15 +472,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
size_t localThreads[3] = { 8, isImageSupported ? 8 : 32, 1 };
size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
int cn = I.oclchannels();
char calcErr;
if (level == 0)
{
calcErr = 1;
}
else
{
calcErr = 0;
}
char calcErr = level==0?1:0;
std::vector<std::pair<size_t , const void *> > args;
@@ -614,7 +502,16 @@ static void lkSparse_run(oclMat &I, oclMat &J,
if(isImageSupported)
{
openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
std::stringstream idxStr;
idxStr << kernelName.c_str() << "_C" << I.oclchannels() << "_D" << I.depth();
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str().c_str());
int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
openCLSafeCall(clReleaseKernel(kernel));
static char opt[16] = {0};
sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), opt, CLFLUSH);
releaseTexture(ITex);
releaseTexture(JTex);
}
@@ -656,9 +553,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
oclMat temp2 = nextPts.reshape(1);
//oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f));
multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f);
//::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
multiply(1.0f/(1<<maxLevel)/2.0f, temp1, temp2);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
//status.setTo(Scalar::all(1));
@@ -675,7 +570,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
//ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err);
// build the image pyramids.
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
@@ -703,7 +597,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
}
// dI/dx ~ Ix, dI/dy ~ Iy
for (int level = maxLevel; level >= 0; level--)
{
lkSparse_run(prevPyr_[level], nextPyr_[level],

View File

@@ -47,7 +47,7 @@
#define __OPENCV_OPENCL_SAFE_CALL_HPP__
#if defined __APPLE__
#include <OpenCL/OpenCL.h>
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

View File

@@ -73,7 +73,6 @@ TEST_P(Canny, Accuracy)
double low_thresh = 50.0;
double high_thresh = 100.0;
cv::resize(img, img, cv::Size(512, 384));
cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);
cv::ocl::oclMat edges;

View File

@@ -55,6 +55,12 @@ using namespace testing;
using namespace std;
using namespace cv;
extern string workdir;
namespace
{
IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
struct getRect
{
Rect operator ()(const CvAvgComp &e) const
@@ -62,23 +68,24 @@ struct getRect
return e.rect;
}
};
}
PARAM_TEST_CASE(Haar, double, int)
PARAM_TEST_CASE(Haar, double, int, CascadeName)
{
cv::ocl::OclCascadeClassifier cascade, nestedCascade;
cv::ocl::OclCascadeClassifierBuf cascadebuf;
cv::CascadeClassifier cpucascade, cpunestedCascade;
double scale;
int flags;
std::string cascadeName;
virtual void SetUp()
{
scale = GET_PARAM(0);
flags = GET_PARAM(1);
string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
cascadeName = (workdir + "../../data/haarcascades/").append(GET_PARAM(2));
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName )))
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) )
{
cout << "ERROR: Could not load classifier cascade" << endl;
return;
@@ -115,7 +122,7 @@ TEST_P(Haar, FaceDetect)
Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
oclfaces.resize(vecAvgComp.size());
std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
@@ -136,7 +143,6 @@ TEST_P(Haar, FaceDetectUseBuf)
vector<Rect> faces, oclfaces;
Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
MemStorage storage(cvCreateMemStorage(0));
cvtColor( img, gray, CV_BGR2GRAY );
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
equalizeHist( smallImg, smallImg );
@@ -144,19 +150,31 @@ TEST_P(Haar, FaceDetectUseBuf)
cv::ocl::oclMat image;
image.upload(smallImg);
cv::ocl::OclCascadeClassifierBuf cascadebuf;
if( !cascadebuf.load( cascadeName ) )
{
cout << "ERROR: Could not load classifier cascade for FaceDetectUseBuf!" << endl;
return;
}
cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
cascadebuf.release();
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
EXPECT_EQ(faces.size(), oclfaces.size());
// intentionally run ocl facedetect again and check if it still works after the first run
cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3,
flags,
Size(30, 30));
cascadebuf.release();
EXPECT_EQ(faces.size(), oclfaces.size());
}
INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
Combine(Values(1.0),
Values(CV_HAAR_SCALE_IMAGE, 0)));
Values(CV_HAAR_SCALE_IMAGE, 0), Values(cascade_frontalface_alt, cascade_frontalface_alt2)));
#endif // HAVE_OPENCL

View File

@@ -55,6 +55,83 @@ using namespace testing;
using namespace std;
extern string workdir;
//////////////////////////////////////////////////////
// GoodFeaturesToTrack
namespace
{
IMPLEMENT_PARAM_CLASS(MinDistance, double)
}
PARAM_TEST_CASE(GoodFeaturesToTrack, MinDistance)
{
double minDistance;
virtual void SetUp()
{
minDistance = GET_PARAM(0);
}
};
TEST_P(GoodFeaturesToTrack, Accuracy)
{
cv::Mat frame = readImage(workdir + "../gpu/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame.empty());
int maxCorners = 1000;
double qualityLevel = 0.01;
cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
cv::ocl::oclMat d_pts;
detector(oclMat(frame), d_pts);
ASSERT_FALSE(d_pts.empty());
std::vector<cv::Point2f> pts(d_pts.cols);
detector.downloadPoints(d_pts, pts);
std::vector<cv::Point2f> pts_gold;
cv::goodFeaturesToTrack(frame, pts_gold, maxCorners, qualityLevel, minDistance);
ASSERT_EQ(pts_gold.size(), pts.size());
size_t mistmatch = 0;
for (size_t i = 0; i < pts.size(); ++i)
{
cv::Point2i a = pts_gold[i];
cv::Point2i b = pts[i];
bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
if (!eq)
++mistmatch;
}
double bad_ratio = static_cast<double>(mistmatch) / pts.size();
ASSERT_LE(bad_ratio, 0.01);
}
TEST_P(GoodFeaturesToTrack, EmptyCorners)
{
int maxCorners = 1000;
double qualityLevel = 0.01;
cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
cv::ocl::oclMat src(100, 100, CV_8UC1, cv::Scalar::all(0));
cv::ocl::oclMat corners(1, maxCorners, CV_32FC2);
detector(src, corners);
ASSERT_TRUE(corners.empty());
}
INSTANTIATE_TEST_CASE_P(OCL_Video, GoodFeaturesToTrack,
testing::Values(MinDistance(0.0), MinDistance(3.0)));
//////////////////////////////////////////////////////////////////////////
PARAM_TEST_CASE(TVL1, bool)
{