Merge remote-tracking branch 'origin/2.4'

Pull requests:
	#943 from jet47:cuda-5.5-support
	#944 from jet47:cmake-2.8.11-cuda-fix
	#912 from SpecLad:contributing
	#934 from SpecLad:parallel-for
	#931 from jet47:gpu-test-fixes
	#932 from bitwangyaoyao:2.4_fixBFM
	#918 from bitwangyaoyao:2.4_samples
	#924 from pengx17:2.4_arithm_fix
	#925 from pengx17:2.4_canny_tmp_fix
	#927 from bitwangyaoyao:2.4_perf
	#930 from pengx17:2.4_haar_ext
	#928 from apavlenko:bugfix_3027
	#920 from asmorkalov:android_move
	#910 from pengx17:2.4_oclgfft
	#913 from janm399:2.4
	#916 from bitwangyaoyao:2.4_fixPyrLK
	#919 from abidrahmank:2.4
	#923 from pengx17:2.4_macfix

Conflicts:
	modules/calib3d/src/stereobm.cpp
	modules/features2d/src/detectors.cpp
	modules/gpu/src/error.cpp
	modules/gpu/src/precomp.hpp
	modules/imgproc/src/distransform.cpp
	modules/imgproc/src/morph.cpp
	modules/ocl/include/opencv2/ocl/ocl.hpp
	modules/ocl/perf/perf_color.cpp
	modules/ocl/perf/perf_imgproc.cpp
	modules/ocl/perf/perf_match_template.cpp
	modules/ocl/perf/precomp.cpp
	modules/ocl/perf/precomp.hpp
	modules/ocl/src/arithm.cpp
	modules/ocl/src/canny.cpp
	modules/ocl/src/filtering.cpp
	modules/ocl/src/haar.cpp
	modules/ocl/src/hog.cpp
	modules/ocl/src/imgproc.cpp
	modules/ocl/src/opencl/haarobjectdetect.cl
	modules/ocl/src/pyrlk.cpp
	modules/video/src/bgfg_gaussmix2.cpp
	modules/video/src/lkpyramid.cpp
	platforms/linux/scripts/cmake_arm_gnueabi_hardfp.sh
	platforms/linux/scripts/cmake_arm_gnueabi_softfp.sh
	platforms/scripts/ABI_compat_generator.py
	samples/ocl/facedetect.cpp
This commit is contained in:
Roman Donchenko
2013-06-04 18:31:51 +04:00
236 changed files with 5549 additions and 3276 deletions

View File

@@ -6,7 +6,7 @@ set(the_description "Auxiliary module for Android native camera support")
set(OPENCV_MODULE_TYPE STATIC)
ocv_define_module(androidcamera INTERNAL opencv_core log dl)
ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/camera_wrapper" "${OpenCV_SOURCE_DIR}/android/service/engine/jni/include")
ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/camera_wrapper" "${OpenCV_SOURCE_DIR}/platforms/android/service/engine/jni/include")
# Android source tree for native camera
SET (ANDROID_SOURCE_TREE "ANDROID_SOURCE_TREE-NOTFOUND" CACHE PATH

View File

@@ -117,31 +117,6 @@ namespace cv
transform(points, modif_points, transformation);
}
class Mutex
{
public:
Mutex() {
}
void lock()
{
#ifdef HAVE_TBB
resultsMutex.lock();
#endif
}
void unlock()
{
#ifdef HAVE_TBB
resultsMutex.unlock();
#endif
}
private:
#ifdef HAVE_TBB
tbb::mutex resultsMutex;
#endif
};
struct CameraParameters
{
void init(Mat _intrinsics, Mat _distCoeffs)

View File

@@ -120,11 +120,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2RGB(const RGB2RGB& other_)
:unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2RGB() {}
__host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
};
template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>
@@ -141,8 +138,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2RGB():unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2RGB(const RGB2RGB& other_):unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2RGB() {}
__host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
};
}
@@ -203,8 +200,8 @@ namespace cv { namespace gpu { namespace cudev
return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
}
__device__ __forceinline__ RGB2RGB5x5():unary_function<uchar3, ushort>(){}
__device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5& other_):unary_function<uchar3, ushort>(){}
__host__ __device__ __forceinline__ RGB2RGB5x5() {}
__host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
};
template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort>
@@ -214,8 +211,8 @@ namespace cv { namespace gpu { namespace cudev
return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
}
__device__ __forceinline__ RGB2RGB5x5():unary_function<uint, ushort>(){}
__device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5& other_):unary_function<uint, ushort>(){}
__host__ __device__ __forceinline__ RGB2RGB5x5() {}
__host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
};
}
@@ -282,8 +279,8 @@ namespace cv { namespace gpu { namespace cudev
RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);
return dst;
}
__device__ __forceinline__ RGB5x52RGB():unary_function<ushort, uchar3>(){}
__device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB& other_):unary_function<ushort, uchar3>(){}
__host__ __device__ __forceinline__ RGB5x52RGB() {}
__host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
};
@@ -295,8 +292,8 @@ namespace cv { namespace gpu { namespace cudev
RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);
return dst;
}
__device__ __forceinline__ RGB5x52RGB():unary_function<ushort, uint>(){}
__device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB& other_):unary_function<ushort, uint>(){}
__host__ __device__ __forceinline__ RGB5x52RGB() {}
__host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
};
}
@@ -325,9 +322,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ Gray2RGB():unary_function<T, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ Gray2RGB(const Gray2RGB& other_)
: unary_function<T, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ Gray2RGB() {}
__host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
};
template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>
@@ -342,8 +338,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ Gray2RGB():unary_function<uchar, uint>(){}
__device__ __forceinline__ Gray2RGB(const Gray2RGB& other_):unary_function<uchar, uint>(){}
__host__ __device__ __forceinline__ Gray2RGB() {}
__host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
};
}
@@ -384,8 +380,8 @@ namespace cv { namespace gpu { namespace cudev
return Gray2RGB5x5Converter<green_bits>::cvt(src);
}
__device__ __forceinline__ Gray2RGB5x5():unary_function<uchar, ushort>(){}
__device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5& other_):unary_function<uchar, ushort>(){}
__host__ __device__ __forceinline__ Gray2RGB5x5() {}
__host__ __device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5&) {}
};
}
@@ -426,8 +422,8 @@ namespace cv { namespace gpu { namespace cudev
{
return RGB5x52GrayConverter<green_bits>::cvt(src);
}
__device__ __forceinline__ RGB5x52Gray() : unary_function<ushort, uchar>(){}
__device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray& other_) : unary_function<ushort, uchar>(){}
__host__ __device__ __forceinline__ RGB5x52Gray() {}
__host__ __device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray&) {}
};
}
@@ -467,9 +463,8 @@ namespace cv { namespace gpu { namespace cudev
{
return RGB2GrayConvert<bidx>(&src.x);
}
__device__ __forceinline__ RGB2Gray() : unary_function<typename TypeVec<T, scn>::vec_type, T>(){}
__device__ __forceinline__ RGB2Gray(const RGB2Gray& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, T>(){}
__host__ __device__ __forceinline__ RGB2Gray() {}
__host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
};
template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>
@@ -478,8 +473,8 @@ namespace cv { namespace gpu { namespace cudev
{
return RGB2GrayConvert<bidx>(src);
}
__device__ __forceinline__ RGB2Gray() : unary_function<uint, uchar>(){}
__device__ __forceinline__ RGB2Gray(const RGB2Gray& other_) : unary_function<uint, uchar>(){}
__host__ __device__ __forceinline__ RGB2Gray() {}
__host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
};
}
@@ -529,10 +524,8 @@ namespace cv { namespace gpu { namespace cudev
RGB2YUVConvert<bidx>(&src.x, dst);
return dst;
}
__device__ __forceinline__ RGB2YUV()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2YUV(const RGB2YUV& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2YUV() {}
__host__ __device__ __forceinline__ RGB2YUV(const RGB2YUV&) {}
};
}
@@ -609,10 +602,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ YUV2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ YUV2RGB(const YUV2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ YUV2RGB() {}
__host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
};
template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -621,8 +612,8 @@ namespace cv { namespace gpu { namespace cudev
{
return YUV2RGBConvert<bidx>(src);
}
__device__ __forceinline__ YUV2RGB() : unary_function<uint, uint>(){}
__device__ __forceinline__ YUV2RGB(const YUV2RGB& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ YUV2RGB() {}
__host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
};
}
@@ -689,10 +680,8 @@ namespace cv { namespace gpu { namespace cudev
RGB2YCrCbConvert<bidx>(&src.x, dst);
return dst;
}
__device__ __forceinline__ RGB2YCrCb()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2YCrCb() {}
__host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
};
template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -702,8 +691,8 @@ namespace cv { namespace gpu { namespace cudev
return RGB2YCrCbConvert<bidx>(src);
}
__device__ __forceinline__ RGB2YCrCb() : unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2YCrCb() {}
__host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
};
}
@@ -771,10 +760,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ YCrCb2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ YCrCb2RGB() {}
__host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
};
template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -783,8 +770,8 @@ namespace cv { namespace gpu { namespace cudev
{
return YCrCb2RGBConvert<bidx>(src);
}
__device__ __forceinline__ YCrCb2RGB() : unary_function<uint, uint>(){}
__device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ YCrCb2RGB() {}
__host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
};
}
@@ -849,10 +836,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2XYZ()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2XYZ(const RGB2XYZ& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2XYZ() {}
__host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
};
template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -861,8 +846,8 @@ namespace cv { namespace gpu { namespace cudev
{
return RGB2XYZConvert<bidx>(src);
}
__device__ __forceinline__ RGB2XYZ() : unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2XYZ(const RGB2XYZ& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2XYZ() {}
__host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
};
}
@@ -926,10 +911,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ XYZ2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ XYZ2RGB(const XYZ2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ XYZ2RGB() {}
__host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
};
template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -938,8 +921,8 @@ namespace cv { namespace gpu { namespace cudev
{
return XYZ2RGBConvert<bidx>(src);
}
__device__ __forceinline__ XYZ2RGB() : unary_function<uint, uint>(){}
__device__ __forceinline__ XYZ2RGB(const XYZ2RGB& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ XYZ2RGB() {}
__host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
};
}
@@ -1066,10 +1049,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2HSV()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2HSV(const RGB2HSV& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2HSV() {}
__host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
};
template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1078,8 +1059,8 @@ namespace cv { namespace gpu { namespace cudev
{
return RGB2HSVConvert<bidx, hr>(src);
}
__device__ __forceinline__ RGB2HSV():unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2HSV(const RGB2HSV& other_):unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2HSV() {}
__host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
};
}
@@ -1208,10 +1189,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ HSV2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ HSV2RGB(const HSV2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ HSV2RGB() {}
__host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
};
template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1220,8 +1199,8 @@ namespace cv { namespace gpu { namespace cudev
{
return HSV2RGBConvert<bidx, hr>(src);
}
__device__ __forceinline__ HSV2RGB():unary_function<uint, uint>(){}
__device__ __forceinline__ HSV2RGB(const HSV2RGB& other_):unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ HSV2RGB() {}
__host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
};
}
@@ -1343,10 +1322,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2HLS()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2HLS(const RGB2HLS& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2HLS() {}
__host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
};
template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1355,8 +1332,8 @@ namespace cv { namespace gpu { namespace cudev
{
return RGB2HLSConvert<bidx, hr>(src);
}
__device__ __forceinline__ RGB2HLS() : unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2HLS(const RGB2HLS& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2HLS() {}
__host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
};
}
@@ -1485,10 +1462,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ HLS2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ HLS2RGB(const HLS2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ HLS2RGB() {}
__host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
};
template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1497,8 +1472,8 @@ namespace cv { namespace gpu { namespace cudev
{
return HLS2RGBConvert<bidx, hr>(src);
}
__device__ __forceinline__ HLS2RGB() : unary_function<uint, uint>(){}
__device__ __forceinline__ HLS2RGB(const HLS2RGB& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ HLS2RGB() {}
__host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
};
}
@@ -1651,8 +1626,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2Lab() {}
__device__ __forceinline__ RGB2Lab(const RGB2Lab& other_) {}
__host__ __device__ __forceinline__ RGB2Lab() {}
__host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
};
template <int scn, int dcn, bool srgb, int blueIdx>
struct RGB2Lab<float, scn, dcn, srgb, blueIdx>
@@ -1666,8 +1641,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2Lab() {}
__device__ __forceinline__ RGB2Lab(const RGB2Lab& other_) {}
__host__ __device__ __forceinline__ RGB2Lab() {}
__host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
};
}
@@ -1764,8 +1739,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ Lab2RGB() {}
__device__ __forceinline__ Lab2RGB(const Lab2RGB& other_) {}
__host__ __device__ __forceinline__ Lab2RGB() {}
__host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
};
template <int scn, int dcn, bool srgb, int blueIdx>
struct Lab2RGB<float, scn, dcn, srgb, blueIdx>
@@ -1779,8 +1754,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ Lab2RGB() {}
__device__ __forceinline__ Lab2RGB(const Lab2RGB& other_) {}
__host__ __device__ __forceinline__ Lab2RGB() {}
__host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
};
}
@@ -1863,8 +1838,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2Luv() {}
__device__ __forceinline__ RGB2Luv(const RGB2Luv& other_) {}
__host__ __device__ __forceinline__ RGB2Luv() {}
__host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
};
template <int scn, int dcn, bool srgb, int blueIdx>
struct RGB2Luv<float, scn, dcn, srgb, blueIdx>
@@ -1878,8 +1853,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ RGB2Luv() {}
__device__ __forceinline__ RGB2Luv(const RGB2Luv& other_) {}
__host__ __device__ __forceinline__ RGB2Luv() {}
__host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
};
}
@@ -1964,8 +1939,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ Luv2RGB() {}
__device__ __forceinline__ Luv2RGB(const Luv2RGB& other_) {}
__host__ __device__ __forceinline__ Luv2RGB() {}
__host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
};
template <int scn, int dcn, bool srgb, int blueIdx>
struct Luv2RGB<float, scn, dcn, srgb, blueIdx>
@@ -1979,8 +1954,8 @@ namespace cv { namespace gpu { namespace cudev
return dst;
}
__device__ __forceinline__ Luv2RGB() {}
__device__ __forceinline__ Luv2RGB(const Luv2RGB& other_) {}
__host__ __device__ __forceinline__ Luv2RGB() {}
__host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
};
}

View File

@@ -63,8 +63,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a + b;
}
__device__ __forceinline__ plus(const plus& other):binary_function<T,T,T>(){}
__device__ __forceinline__ plus():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ plus() {}
__host__ __device__ __forceinline__ plus(const plus&) {}
};
template <typename T> struct minus : binary_function<T, T, T>
@@ -74,8 +74,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a - b;
}
__device__ __forceinline__ minus(const minus& other):binary_function<T,T,T>(){}
__device__ __forceinline__ minus():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ minus() {}
__host__ __device__ __forceinline__ minus(const minus&) {}
};
template <typename T> struct multiplies : binary_function<T, T, T>
@@ -85,8 +85,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a * b;
}
__device__ __forceinline__ multiplies(const multiplies& other):binary_function<T,T,T>(){}
__device__ __forceinline__ multiplies():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ multiplies() {}
__host__ __device__ __forceinline__ multiplies(const multiplies&) {}
};
template <typename T> struct divides : binary_function<T, T, T>
@@ -96,8 +96,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a / b;
}
__device__ __forceinline__ divides(const divides& other):binary_function<T,T,T>(){}
__device__ __forceinline__ divides():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ divides() {}
__host__ __device__ __forceinline__ divides(const divides&) {}
};
template <typename T> struct modulus : binary_function<T, T, T>
@@ -107,8 +107,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a % b;
}
__device__ __forceinline__ modulus(const modulus& other):binary_function<T,T,T>(){}
__device__ __forceinline__ modulus():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ modulus() {}
__host__ __device__ __forceinline__ modulus(const modulus&) {}
};
template <typename T> struct negate : unary_function<T, T>
@@ -117,8 +117,8 @@ namespace cv { namespace gpu { namespace cudev
{
return -a;
}
__device__ __forceinline__ negate(const negate& other):unary_function<T,T>(){}
__device__ __forceinline__ negate():unary_function<T,T>(){}
__host__ __device__ __forceinline__ negate() {}
__host__ __device__ __forceinline__ negate(const negate&) {}
};
// Comparison Operations
@@ -129,8 +129,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a == b;
}
__device__ __forceinline__ equal_to(const equal_to& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ equal_to():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ equal_to() {}
__host__ __device__ __forceinline__ equal_to(const equal_to&) {}
};
template <typename T> struct not_equal_to : binary_function<T, T, bool>
@@ -140,8 +140,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a != b;
}
__device__ __forceinline__ not_equal_to(const not_equal_to& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ not_equal_to():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ not_equal_to() {}
__host__ __device__ __forceinline__ not_equal_to(const not_equal_to&) {}
};
template <typename T> struct greater : binary_function<T, T, bool>
@@ -151,8 +151,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a > b;
}
__device__ __forceinline__ greater(const greater& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ greater():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ greater() {}
__host__ __device__ __forceinline__ greater(const greater&) {}
};
template <typename T> struct less : binary_function<T, T, bool>
@@ -162,8 +162,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a < b;
}
__device__ __forceinline__ less(const less& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ less():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ less() {}
__host__ __device__ __forceinline__ less(const less&) {}
};
template <typename T> struct greater_equal : binary_function<T, T, bool>
@@ -173,8 +173,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a >= b;
}
__device__ __forceinline__ greater_equal(const greater_equal& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ greater_equal():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ greater_equal() {}
__host__ __device__ __forceinline__ greater_equal(const greater_equal&) {}
};
template <typename T> struct less_equal : binary_function<T, T, bool>
@@ -184,8 +184,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a <= b;
}
__device__ __forceinline__ less_equal(const less_equal& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ less_equal():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ less_equal() {}
__host__ __device__ __forceinline__ less_equal(const less_equal&) {}
};
// Logical Operations
@@ -196,8 +196,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a && b;
}
__device__ __forceinline__ logical_and(const logical_and& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ logical_and():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ logical_and() {}
__host__ __device__ __forceinline__ logical_and(const logical_and&) {}
};
template <typename T> struct logical_or : binary_function<T, T, bool>
@@ -207,8 +207,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a || b;
}
__device__ __forceinline__ logical_or(const logical_or& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ logical_or():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ logical_or() {}
__host__ __device__ __forceinline__ logical_or(const logical_or&) {}
};
template <typename T> struct logical_not : unary_function<T, bool>
@@ -217,8 +217,8 @@ namespace cv { namespace gpu { namespace cudev
{
return !a;
}
__device__ __forceinline__ logical_not(const logical_not& other):unary_function<T,bool>(){}
__device__ __forceinline__ logical_not():unary_function<T,bool>(){}
__host__ __device__ __forceinline__ logical_not() {}
__host__ __device__ __forceinline__ logical_not(const logical_not&) {}
};
// Bitwise Operations
@@ -229,8 +229,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a & b;
}
__device__ __forceinline__ bit_and(const bit_and& other):binary_function<T,T,T>(){}
__device__ __forceinline__ bit_and():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ bit_and() {}
__host__ __device__ __forceinline__ bit_and(const bit_and&) {}
};
template <typename T> struct bit_or : binary_function<T, T, T>
@@ -240,8 +240,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a | b;
}
__device__ __forceinline__ bit_or(const bit_or& other):binary_function<T,T,T>(){}
__device__ __forceinline__ bit_or():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ bit_or() {}
__host__ __device__ __forceinline__ bit_or(const bit_or&) {}
};
template <typename T> struct bit_xor : binary_function<T, T, T>
@@ -251,8 +251,8 @@ namespace cv { namespace gpu { namespace cudev
{
return a ^ b;
}
__device__ __forceinline__ bit_xor(const bit_xor& other):binary_function<T,T,T>(){}
__device__ __forceinline__ bit_xor():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ bit_xor() {}
__host__ __device__ __forceinline__ bit_xor(const bit_xor&) {}
};
template <typename T> struct bit_not : unary_function<T, T>
@@ -261,8 +261,8 @@ namespace cv { namespace gpu { namespace cudev
{
return ~v;
}
__device__ __forceinline__ bit_not(const bit_not& other):unary_function<T,T>(){}
__device__ __forceinline__ bit_not():unary_function<T,T>(){}
__host__ __device__ __forceinline__ bit_not() {}
__host__ __device__ __forceinline__ bit_not(const bit_not&) {}
};
// Generalized Identity Operations
@@ -272,8 +272,8 @@ namespace cv { namespace gpu { namespace cudev
{
return x;
}
__device__ __forceinline__ identity(const identity& other):unary_function<T,T>(){}
__device__ __forceinline__ identity():unary_function<T,T>(){}
__host__ __device__ __forceinline__ identity() {}
__host__ __device__ __forceinline__ identity(const identity&) {}
};
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
@@ -282,8 +282,8 @@ namespace cv { namespace gpu { namespace cudev
{
return lhs;
}
__device__ __forceinline__ project1st(const project1st& other):binary_function<T1,T2,T1>(){}
__device__ __forceinline__ project1st():binary_function<T1,T2,T1>(){}
__host__ __device__ __forceinline__ project1st() {}
__host__ __device__ __forceinline__ project1st(const project1st&) {}
};
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
@@ -292,8 +292,8 @@ namespace cv { namespace gpu { namespace cudev
{
return rhs;
}
__device__ __forceinline__ project2nd(const project2nd& other):binary_function<T1,T2,T2>(){}
__device__ __forceinline__ project2nd():binary_function<T1,T2,T2>(){}
__host__ __device__ __forceinline__ project2nd() {}
__host__ __device__ __forceinline__ project2nd(const project2nd&) {}
};
// Min/Max Operations
@@ -302,8 +302,8 @@ namespace cv { namespace gpu { namespace cudev
template <> struct name<type> : binary_function<type, type, type> \
{ \
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
__device__ __forceinline__ name() {}\
__device__ __forceinline__ name(const name&) {}\
__host__ __device__ __forceinline__ name() {}\
__host__ __device__ __forceinline__ name(const name&) {}\
};
template <typename T> struct maximum : binary_function<T, T, T>
@@ -312,8 +312,8 @@ namespace cv { namespace gpu { namespace cudev
{
return max(lhs, rhs);
}
__device__ __forceinline__ maximum() {}
__device__ __forceinline__ maximum(const maximum&) {}
__host__ __device__ __forceinline__ maximum() {}
__host__ __device__ __forceinline__ maximum(const maximum&) {}
};
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
@@ -332,8 +332,8 @@ namespace cv { namespace gpu { namespace cudev
{
return min(lhs, rhs);
}
__device__ __forceinline__ minimum() {}
__device__ __forceinline__ minimum(const minimum&) {}
__host__ __device__ __forceinline__ minimum() {}
__host__ __device__ __forceinline__ minimum(const minimum&) {}
};
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@@ -349,7 +349,6 @@ namespace cv { namespace gpu { namespace cudev
#undef OPENCV_GPU_IMPLEMENT_MINMAX
// Math functions
///bound=========================================
template <typename T> struct abs_func : unary_function<T, T>
{
@@ -358,8 +357,8 @@ namespace cv { namespace gpu { namespace cudev
return abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
{
@@ -368,8 +367,8 @@ namespace cv { namespace gpu { namespace cudev
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
{
@@ -378,8 +377,8 @@ namespace cv { namespace gpu { namespace cudev
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<char> : unary_function<char, char>
{
@@ -388,8 +387,8 @@ namespace cv { namespace gpu { namespace cudev
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
{
@@ -398,8 +397,8 @@ namespace cv { namespace gpu { namespace cudev
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<short> : unary_function<short, short>
{
@@ -408,8 +407,8 @@ namespace cv { namespace gpu { namespace cudev
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
{
@@ -418,8 +417,8 @@ namespace cv { namespace gpu { namespace cudev
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<int> : unary_function<int, int>
{
@@ -428,8 +427,8 @@ namespace cv { namespace gpu { namespace cudev
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<float> : unary_function<float, float>
{
@@ -438,8 +437,8 @@ namespace cv { namespace gpu { namespace cudev
return ::fabsf(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<double> : unary_function<double, double>
{
@@ -448,8 +447,8 @@ namespace cv { namespace gpu { namespace cudev
return ::fabs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
@@ -459,8 +458,8 @@ namespace cv { namespace gpu { namespace cudev
{ \
return func ## f(v); \
} \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \
template <> struct name ## _func<double> : unary_function<double, double> \
{ \
@@ -468,8 +467,8 @@ namespace cv { namespace gpu { namespace cudev
{ \
return func(v); \
} \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
};
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@@ -479,6 +478,8 @@ namespace cv { namespace gpu { namespace cudev
{ \
return func ## f(v1, v2); \
} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \
template <> struct name ## _func<double> : binary_function<double, double, double> \
{ \
@@ -486,6 +487,8 @@ namespace cv { namespace gpu { namespace cudev
{ \
return func(v1, v2); \
} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
};
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
@@ -522,8 +525,8 @@ namespace cv { namespace gpu { namespace cudev
{
return src1 * src1 + src2 * src2;
}
__device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
__device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
__host__ __device__ __forceinline__ hypot_sqr_func() {}
__host__ __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func&) {}
};
// Saturate Cast Functor
@@ -533,8 +536,8 @@ namespace cv { namespace gpu { namespace cudev
{
return saturate_cast<D>(v);
}
__device__ __forceinline__ saturate_cast_func(const saturate_cast_func& other):unary_function<T, D>(){}
__device__ __forceinline__ saturate_cast_func():unary_function<T, D>(){}
__host__ __device__ __forceinline__ saturate_cast_func() {}
__host__ __device__ __forceinline__ saturate_cast_func(const saturate_cast_func&) {}
};
// Threshold Functors
@@ -547,10 +550,9 @@ namespace cv { namespace gpu { namespace cudev
return (src > thresh) * maxVal;
}
__device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
__device__ __forceinline__ thresh_binary_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_binary_func() {}
__host__ __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
: thresh(other.thresh), maxVal(other.maxVal) {}
const T thresh;
const T maxVal;
@@ -565,10 +567,9 @@ namespace cv { namespace gpu { namespace cudev
return (src <= thresh) * maxVal;
}
__device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
__device__ __forceinline__ thresh_binary_inv_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_binary_inv_func() {}
__host__ __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
: thresh(other.thresh), maxVal(other.maxVal) {}
const T thresh;
const T maxVal;
@@ -583,10 +584,9 @@ namespace cv { namespace gpu { namespace cudev
return minimum<T>()(src, thresh);
}
__device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
: unary_function<T, T>(), thresh(other.thresh){}
__device__ __forceinline__ thresh_trunc_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_trunc_func() {}
__host__ __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
: thresh(other.thresh) {}
const T thresh;
};
@@ -599,10 +599,10 @@ namespace cv { namespace gpu { namespace cudev
{
return (src > thresh) * src;
}
__device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
: unary_function<T, T>(), thresh(other.thresh){}
__device__ __forceinline__ thresh_to_zero_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_to_zero_func() {}
__host__ __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
: thresh(other.thresh) {}
const T thresh;
};
@@ -615,14 +615,14 @@ namespace cv { namespace gpu { namespace cudev
{
return (src <= thresh) * src;
}
__device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
: unary_function<T, T>(), thresh(other.thresh){}
__device__ __forceinline__ thresh_to_zero_inv_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_to_zero_inv_func() {}
__host__ __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
: thresh(other.thresh) {}
const T thresh;
};
//bound!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ============>
// Function Object Adaptors
template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
{
@@ -633,8 +633,8 @@ namespace cv { namespace gpu { namespace cudev
return !pred(x);
}
__device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
__device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}
__host__ __device__ __forceinline__ unary_negate() {}
__host__ __device__ __forceinline__ unary_negate(const unary_negate& other) : pred(other.pred) {}
const Predicate pred;
};
@@ -653,11 +653,9 @@ namespace cv { namespace gpu { namespace cudev
{
return !pred(x,y);
}
__device__ __forceinline__ binary_negate(const binary_negate& other)
: binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
__device__ __forceinline__ binary_negate() :
binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
__host__ __device__ __forceinline__ binary_negate() {}
__host__ __device__ __forceinline__ binary_negate(const binary_negate& other) : pred(other.pred) {}
const Predicate pred;
};
@@ -676,8 +674,8 @@ namespace cv { namespace gpu { namespace cudev
return op(arg1, a);
}
__device__ __forceinline__ binder1st(const binder1st& other) :
unary_function<typename Op::second_argument_type, typename Op::result_type>(){}
__host__ __device__ __forceinline__ binder1st() {}
__host__ __device__ __forceinline__ binder1st(const binder1st& other) : op(other.op), arg1(other.arg1) {}
const Op op;
const typename Op::first_argument_type arg1;
@@ -697,8 +695,8 @@ namespace cv { namespace gpu { namespace cudev
return op(a, arg2);
}
__device__ __forceinline__ binder2nd(const binder2nd& other) :
unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}
__host__ __device__ __forceinline__ binder2nd() {}
__host__ __device__ __forceinline__ binder2nd(const binder2nd& other) : op(other.op), arg2(other.arg2) {}
const Op op;
const typename Op::second_argument_type arg2;

View File

@@ -124,8 +124,8 @@ namespace cv { namespace gpu { namespace cudev
struct WithOutMask
{
__device__ __forceinline__ WithOutMask(){}
__device__ __forceinline__ WithOutMask(const WithOutMask& mask){}
__host__ __device__ __forceinline__ WithOutMask(){}
__host__ __device__ __forceinline__ WithOutMask(const WithOutMask&){}
__device__ __forceinline__ void next() const
{

View File

@@ -212,7 +212,7 @@ static void keepStrongest( int N, std::vector<KeyPoint>& keypoints )
}
namespace {
class GridAdaptedFeatureDetectorInvoker
class GridAdaptedFeatureDetectorInvoker : public ParallelLoopBody
{
private:
int gridRows_, gridCols_;
@@ -221,29 +221,24 @@ private:
const Mat& image_;
const Mat& mask_;
const Ptr<FeatureDetector>& detector_;
#ifdef HAVE_TBB
tbb::mutex* kptLock_;
#endif
Mutex* kptLock_;
GridAdaptedFeatureDetectorInvoker& operator=(const GridAdaptedFeatureDetectorInvoker&); // to quiet MSVC
public:
GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask, std::vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols
#ifdef HAVE_TBB
, tbb::mutex* kptLock
#endif
) : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
keypoints_(keypoints), image_(image), mask_(mask), detector_(detector)
#ifdef HAVE_TBB
, kptLock_(kptLock)
#endif
GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask,
std::vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols,
cv::Mutex* kptLock)
: gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
keypoints_(keypoints), image_(image), mask_(mask), detector_(detector),
kptLock_(kptLock)
{
}
void operator() (const BlockedRange& range) const
void operator() (const Range& range) const
{
for (int i = range.begin(); i < range.end(); ++i)
for (int i = range.start; i < range.end; ++i)
{
int celly = i / gridCols_;
int cellx = i - celly * gridCols_;
@@ -268,9 +263,8 @@ public:
it->pt.x += col_range.start;
it->pt.y += row_range.start;
}
#ifdef HAVE_TBB
tbb::mutex::scoped_lock join_keypoints(*kptLock_);
#endif
cv::AutoLock join_keypoints(*kptLock_);
keypoints_.insert( keypoints_.end(), sub_keypoints.begin(), sub_keypoints.end() );
}
}
@@ -287,13 +281,9 @@ void GridAdaptedFeatureDetector::detectImpl( const Mat& image, std::vector<KeyPo
keypoints.reserve(maxTotalKeypoints);
int maxPerCell = maxTotalKeypoints / (gridRows * gridCols);
#ifdef HAVE_TBB
tbb::mutex kptLock;
cv::parallel_for(cv::BlockedRange(0, gridRows * gridCols),
cv::Mutex kptLock;
cv::parallel_for_(cv::Range(0, gridRows * gridCols),
GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols, &kptLock));
#else
GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols)(cv::BlockedRange(0, gridRows * gridCols));
#endif
}
/*

View File

@@ -38,7 +38,7 @@ if(HAVE_CUDA)
ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})
set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_npp_LIBRARY})
if(WITH_NVCUVID)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})

View File

@@ -150,7 +150,7 @@ namespace
}
// Computes rotation, translation pair for small subsets if the input data
class TransformHypothesesGenerator
class TransformHypothesesGenerator : public ParallelLoopBody
{
public:
TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
@@ -160,7 +160,7 @@ namespace
num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
transl_vectors(transl_vectors_) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
// Input data for generation of the current hypothesis
std::vector<int> subset_indices(subset_size);
@@ -172,7 +172,7 @@ namespace
Mat rot_mat(3, 3, CV_64F);
Mat transl_vec(1, 3, CV_64F);
for (int iter = range.begin(); iter < range.end(); ++iter)
for (int iter = range.start; iter < range.end; ++iter)
{
selectRandom(subset_size, num_points, subset_indices);
for (int i = 0; i < subset_size; ++i)
@@ -238,7 +238,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
// Generate set of hypotheses using small subsets of the input data
TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
num_points, subset_size, rot_matrices, transl_vectors);
parallel_for(BlockedRange(0, num_iters), body);
parallel_for_(Range(0, num_iters), body);
// Compute scores (i.e. number of inliers) for each hypothesis
GpuMat d_object(object);

View File

@@ -67,8 +67,8 @@ namespace cv { namespace gpu { namespace cudev
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
__host__ __device__ __forceinline__ TransformOp() {}
__host__ __device__ __forceinline__ TransformOp(const TransformOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,
@@ -106,8 +106,8 @@ namespace cv { namespace gpu { namespace cudev
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
__host__ __device__ __forceinline__ ProjectOp() {}
__host__ __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,

View File

@@ -62,8 +62,8 @@ namespace canny
return ::abs(x) + ::abs(y);
}
__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
__host__ __device__ __forceinline__ L1() {}
__host__ __device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
@@ -72,8 +72,8 @@ namespace canny
return ::sqrtf(x * x + y * y);
}
__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
__host__ __device__ __forceinline__ L2() {}
__host__ __device__ __forceinline__ L2(const L2&) {}
};
}
@@ -470,8 +470,8 @@ namespace canny
return (uchar)(-(e >> 1));
}
__device__ __forceinline__ GetEdges() {}
__device__ __forceinline__ GetEdges(const GetEdges&) {}
__host__ __device__ __forceinline__ GetEdges() {}
__host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
};
}

View File

@@ -162,8 +162,8 @@ namespace arithm
return vadd4(a, b);
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4& other) {}
__host__ __device__ __forceinline__ VAdd4() {}
__host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
};
////////////////////////////////////
@@ -175,8 +175,8 @@ namespace arithm
return vadd2(a, b);
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2& other) {}
__host__ __device__ __forceinline__ VAdd2() {}
__host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
};
////////////////////////////////////
@@ -188,8 +188,8 @@ namespace arithm
return saturate_cast<D>(a + b);
}
__device__ __forceinline__ AddMat() {}
__device__ __forceinline__ AddMat(const AddMat& other) {}
__host__ __device__ __forceinline__ AddMat() {}
__host__ __device__ __forceinline__ AddMat(const AddMat&) {}
};
}
@@ -397,8 +397,8 @@ namespace arithm
return vsub4(a, b);
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4& other) {}
__host__ __device__ __forceinline__ VSub4() {}
__host__ __device__ __forceinline__ VSub4(const VSub4&) {}
};
////////////////////////////////////
@@ -410,8 +410,8 @@ namespace arithm
return vsub2(a, b);
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2& other) {}
__host__ __device__ __forceinline__ VSub2() {}
__host__ __device__ __forceinline__ VSub2(const VSub2&) {}
};
////////////////////////////////////
@@ -423,8 +423,8 @@ namespace arithm
return saturate_cast<D>(a - b);
}
__device__ __forceinline__ SubMat() {}
__device__ __forceinline__ SubMat(const SubMat& other) {}
__host__ __device__ __forceinline__ SubMat() {}
__host__ __device__ __forceinline__ SubMat(const SubMat&) {}
};
}
@@ -617,8 +617,8 @@ namespace arithm
return res;
}
__device__ __forceinline__ Mul_8uc4_32f() {}
__device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
__host__ __device__ __forceinline__ Mul_8uc4_32f() {}
__host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
};
struct Mul_16sc4_32f : binary_function<short4, float, short4>
@@ -629,8 +629,8 @@ namespace arithm
saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
}
__device__ __forceinline__ Mul_16sc4_32f() {}
__device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
__host__ __device__ __forceinline__ Mul_16sc4_32f() {}
__host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
};
template <typename T, typename D> struct Mul : binary_function<T, T, D>
@@ -640,8 +640,8 @@ namespace arithm
return saturate_cast<D>(a * b);
}
__device__ __forceinline__ Mul() {}
__device__ __forceinline__ Mul(const Mul& other) {}
__host__ __device__ __forceinline__ Mul() {}
__host__ __device__ __forceinline__ Mul(const Mul&) {}
};
template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
@@ -888,8 +888,8 @@ namespace arithm
return b != 0 ? saturate_cast<D>(a / b) : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, float> : binary_function<T, T, float>
{
@@ -898,8 +898,8 @@ namespace arithm
return b != 0 ? static_cast<float>(a) / b : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, double> : binary_function<T, T, double>
{
@@ -908,8 +908,8 @@ namespace arithm
return b != 0 ? static_cast<double>(a) / b : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
@@ -1196,8 +1196,8 @@ namespace arithm
return vabsdiff4(a, b);
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
__host__ __device__ __forceinline__ VAbsDiff4() {}
__host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
};
////////////////////////////////////
@@ -1209,8 +1209,8 @@ namespace arithm
return vabsdiff2(a, b);
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
__host__ __device__ __forceinline__ VAbsDiff2() {}
__host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
};
////////////////////////////////////
@@ -1235,8 +1235,8 @@ namespace arithm
return saturate_cast<T>(_abs(a - b));
}
__device__ __forceinline__ AbsDiffMat() {}
__device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
__host__ __device__ __forceinline__ AbsDiffMat() {}
__host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
};
}
@@ -1370,8 +1370,8 @@ namespace arithm
return saturate_cast<T>(x * x);
}
__device__ __forceinline__ Sqr() {}
__device__ __forceinline__ Sqr(const Sqr& other) {}
__host__ __device__ __forceinline__ Sqr() {}
__host__ __device__ __forceinline__ Sqr(const Sqr&) {}
};
}
@@ -1466,8 +1466,8 @@ namespace arithm
return saturate_cast<T>(f(x));
}
__device__ __forceinline__ Exp() {}
__device__ __forceinline__ Exp(const Exp& other) {}
__host__ __device__ __forceinline__ Exp() {}
__host__ __device__ __forceinline__ Exp(const Exp&) {}
};
}
@@ -1507,8 +1507,8 @@ namespace arithm
return vcmpeq4(a, b);
}
__device__ __forceinline__ VCmpEq4() {}
__device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
__host__ __device__ __forceinline__ VCmpEq4() {}
__host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
@@ -1517,8 +1517,8 @@ namespace arithm
return vcmpne4(a, b);
}
__device__ __forceinline__ VCmpNe4() {}
__device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
__host__ __device__ __forceinline__ VCmpNe4() {}
__host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
@@ -1527,8 +1527,8 @@ namespace arithm
return vcmplt4(a, b);
}
__device__ __forceinline__ VCmpLt4() {}
__device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
__host__ __device__ __forceinline__ VCmpLt4() {}
__host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
@@ -1537,8 +1537,8 @@ namespace arithm
return vcmple4(a, b);
}
__device__ __forceinline__ VCmpLe4() {}
__device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
__host__ __device__ __forceinline__ VCmpLe4() {}
__host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
};
////////////////////////////////////
@@ -2008,8 +2008,8 @@ namespace arithm
return vmin4(a, b);
}
__device__ __forceinline__ VMin4() {}
__device__ __forceinline__ VMin4(const VMin4& other) {}
__host__ __device__ __forceinline__ VMin4() {}
__host__ __device__ __forceinline__ VMin4(const VMin4&) {}
};
////////////////////////////////////
@@ -2021,8 +2021,8 @@ namespace arithm
return vmin2(a, b);
}
__device__ __forceinline__ VMin2() {}
__device__ __forceinline__ VMin2(const VMin2& other) {}
__host__ __device__ __forceinline__ VMin2() {}
__host__ __device__ __forceinline__ VMin2(const VMin2&) {}
};
}
@@ -2100,8 +2100,8 @@ namespace arithm
return vmax4(a, b);
}
__device__ __forceinline__ VMax4() {}
__device__ __forceinline__ VMax4(const VMax4& other) {}
__host__ __device__ __forceinline__ VMax4() {}
__host__ __device__ __forceinline__ VMax4(const VMax4&) {}
};
////////////////////////////////////
@@ -2113,8 +2113,8 @@ namespace arithm
return vmax2(a, b);
}
__device__ __forceinline__ VMax2() {}
__device__ __forceinline__ VMax2(const VMax2& other) {}
__host__ __device__ __forceinline__ VMax2() {}
__host__ __device__ __forceinline__ VMax2(const VMax2&) {}
};
}

View File

@@ -188,10 +188,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal);
#if CUDA_VERSION < 5050
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
#else
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
#endif
NppiSize sz;
sz.width = src1.cols;
@@ -203,7 +213,16 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
DeviceBuffer dbuf;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
#if CUDA_VERSION < 5050
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
#else
int bufSize;
buf_size_funcs[funcIdx](sz, &bufSize);
GpuMat buf(1, bufSize, CV_8UC1);
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
#endif
cudaSafeCall( cudaDeviceSynchronize() );

View File

@@ -352,7 +352,7 @@ GPU_TEST_P(Add_Scalar, WithOutMask)
cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
cv::add(mat, val, dst_gold, cv::noArray(), depth.second);
EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
}
}
@@ -383,7 +383,7 @@ GPU_TEST_P(Add_Scalar, WithMask)
cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
cv::add(mat, val, dst_gold, mask, depth.second);
EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
}
}
@@ -567,7 +567,7 @@ GPU_TEST_P(Subtract_Scalar, WithOutMask)
cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
cv::subtract(mat, val, dst_gold, cv::noArray(), depth.second);
EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
}
}
@@ -598,7 +598,7 @@ GPU_TEST_P(Subtract_Scalar, WithMask)
cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
cv::subtract(mat, val, dst_gold, mask, depth.second);
EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
}
}
@@ -2148,7 +2148,7 @@ GPU_TEST_P(Min, Scalar)
cv::Mat dst_gold = cv::min(src, val);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
}
}
@@ -2231,7 +2231,7 @@ GPU_TEST_P(Max, Scalar)
cv::Mat dst_gold = cv::max(src, val);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
}
}

View File

@@ -113,6 +113,7 @@ If you use ``cvtColor`` with 8-bit images, the conversion will have some informa
The function can do the following transformations:
*
RGB :math:`\leftrightarrow` GRAY ( ``CV_BGR2GRAY, CV_RGB2GRAY, CV_GRAY2BGR, CV_GRAY2RGB`` )
Transformations within RGB space like adding/removing the alpha channel, reversing the channel order, conversion to/from 16-bit RGB color (R5:G6:B5 or R5:G5:B5), as well as conversion to/from grayscale using:
.. math::
@@ -755,7 +756,7 @@ Runs the GrabCut algorithm.
* **GC_PR_BGD** defines a possible background pixel.
* **GC_PR_BGD** defines a possible foreground pixel.
* **GC_PR_FGD** defines a possible foreground pixel.
:param rect: ROI containing a segmented object. The pixels outside of the ROI are marked as "obvious background". The parameter is only used when ``mode==GC_INIT_WITH_RECT`` .

View File

@@ -0,0 +1,334 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
// ----------------------------------------------------------------------
// CLAHE
namespace
{
class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
{
public:
CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
{
}
void operator ()(const cv::Range& range) const;
private:
cv::Mat src_;
mutable cv::Mat lut_;
cv::Size tileSize_;
int tilesX_;
int tilesY_;
int clipLimit_;
float lutScale_;
};
void CLAHE_CalcLut_Body::operator ()(const cv::Range& range) const
{
const int histSize = 256;
uchar* tileLut = lut_.ptr(range.start);
const size_t lut_step = lut_.step;
for (int k = range.start; k < range.end; ++k, tileLut += lut_step)
{
const int ty = k / tilesX_;
const int tx = k % tilesX_;
// retrieve tile submatrix
cv::Rect tileROI;
tileROI.x = tx * tileSize_.width;
tileROI.y = ty * tileSize_.height;
tileROI.width = tileSize_.width;
tileROI.height = tileSize_.height;
const cv::Mat tile = src_(tileROI);
// calc histogram
int tileHist[histSize] = {0, };
int height = tileROI.height;
const size_t sstep = tile.step;
for (const uchar* ptr = tile.ptr<uchar>(0); height--; ptr += sstep)
{
int x = 0;
for (; x <= tileROI.width - 4; x += 4)
{
int t0 = ptr[x], t1 = ptr[x+1];
tileHist[t0]++; tileHist[t1]++;
t0 = ptr[x+2]; t1 = ptr[x+3];
tileHist[t0]++; tileHist[t1]++;
}
for (; x < tileROI.width; ++x)
tileHist[ptr[x]]++;
}
// clip histogram
if (clipLimit_ > 0)
{
// how many pixels were clipped
int clipped = 0;
for (int i = 0; i < histSize; ++i)
{
if (tileHist[i] > clipLimit_)
{
clipped += tileHist[i] - clipLimit_;
tileHist[i] = clipLimit_;
}
}
// redistribute clipped pixels
int redistBatch = clipped / histSize;
int residual = clipped - redistBatch * histSize;
for (int i = 0; i < histSize; ++i)
tileHist[i] += redistBatch;
for (int i = 0; i < residual; ++i)
tileHist[i]++;
}
// calc Lut
int sum = 0;
for (int i = 0; i < histSize; ++i)
{
sum += tileHist[i];
tileLut[i] = cv::saturate_cast<uchar>(sum * lutScale_);
}
}
}
class CLAHE_Interpolation_Body : public cv::ParallelLoopBody
{
public:
CLAHE_Interpolation_Body(const cv::Mat& src, cv::Mat& dst, const cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY) :
src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY)
{
}
void operator ()(const cv::Range& range) const;
private:
cv::Mat src_;
mutable cv::Mat dst_;
cv::Mat lut_;
cv::Size tileSize_;
int tilesX_;
int tilesY_;
};
void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const
{
const size_t lut_step = lut_.step;
for (int y = range.start; y < range.end; ++y)
{
const uchar* srcRow = src_.ptr<uchar>(y);
uchar* dstRow = dst_.ptr<uchar>(y);
const float tyf = (static_cast<float>(y) / tileSize_.height) - 0.5f;
int ty1 = cvFloor(tyf);
int ty2 = ty1 + 1;
const float ya = tyf - ty1;
ty1 = std::max(ty1, 0);
ty2 = std::min(ty2, tilesY_ - 1);
const uchar* lutPlane1 = lut_.ptr(ty1 * tilesX_);
const uchar* lutPlane2 = lut_.ptr(ty2 * tilesX_);
for (int x = 0; x < src_.cols; ++x)
{
const float txf = (static_cast<float>(x) / tileSize_.width) - 0.5f;
int tx1 = cvFloor(txf);
int tx2 = tx1 + 1;
const float xa = txf - tx1;
tx1 = std::max(tx1, 0);
tx2 = std::min(tx2, tilesX_ - 1);
const int srcVal = srcRow[x];
const size_t ind1 = tx1 * lut_step + srcVal;
const size_t ind2 = tx2 * lut_step + srcVal;
float res = 0;
res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya));
res += lutPlane1[ind2] * ((xa) * (1.0f - ya));
res += lutPlane2[ind1] * ((1.0f - xa) * (ya));
res += lutPlane2[ind2] * ((xa) * (ya));
dstRow[x] = cv::saturate_cast<uchar>(res);
}
}
}
class CLAHE_Impl : public cv::CLAHE
{
public:
CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
cv::AlgorithmInfo* info() const;
void apply(cv::InputArray src, cv::OutputArray dst);
void setClipLimit(double clipLimit);
double getClipLimit() const;
void setTilesGridSize(cv::Size tileGridSize);
cv::Size getTilesGridSize() const;
void collectGarbage();
private:
double clipLimit_;
int tilesX_;
int tilesY_;
cv::Mat srcExt_;
cv::Mat lut_;
};
CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
{
}
CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE",
obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
obj.info()->addParam(obj, "tilesX", obj.tilesX_);
obj.info()->addParam(obj, "tilesY", obj.tilesY_))
void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
{
cv::Mat src = _src.getMat();
CV_Assert( src.type() == CV_8UC1 );
_dst.create( src.size(), src.type() );
cv::Mat dst = _dst.getMat();
const int histSize = 256;
lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
cv::Size tileSize;
cv::Mat srcForLut;
if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
{
tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
srcForLut = src;
}
else
{
cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
srcForLut = srcExt_;
}
const int tileSizeTotal = tileSize.area();
const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
int clipLimit = 0;
if (clipLimit_ > 0.0)
{
clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
clipLimit = std::max(clipLimit, 1);
}
CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
cv::parallel_for_(cv::Range(0, src.rows), interpolationBody);
}
void CLAHE_Impl::setClipLimit(double clipLimit)
{
clipLimit_ = clipLimit;
}
double CLAHE_Impl::getClipLimit() const
{
return clipLimit_;
}
void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
{
tilesX_ = tileGridSize.width;
tilesY_ = tileGridSize.height;
}
cv::Size CLAHE_Impl::getTilesGridSize() const
{
return cv::Size(tilesX_, tilesY_);
}
void CLAHE_Impl::collectGarbage()
{
srcExt_.release();
lut_.release();
}
}
cv::Ptr<cv::CLAHE> cv::createCLAHE(double clipLimit, cv::Size tileGridSize)
{
return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
}

View File

@@ -1815,7 +1815,7 @@ const int ITUR_BT_601_CGV = -385875;
const int ITUR_BT_601_CBV = -74448;
template<int bIdx, int uIdx>
struct YUV420sp2RGB888Invoker
struct YUV420sp2RGB888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* my1, *muv;
@@ -1824,10 +1824,10 @@ struct YUV420sp2RGB888Invoker
YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
: dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin() * 2;
int rangeEnd = range.end() * 2;
int rangeBegin = range.start * 2;
int rangeEnd = range.end * 2;
//R = 1.164(Y - 16) + 1.596(V - 128)
//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -1884,7 +1884,7 @@ struct YUV420sp2RGB888Invoker
};
template<int bIdx, int uIdx>
struct YUV420sp2RGBA8888Invoker
struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* my1, *muv;
@@ -1893,10 +1893,10 @@ struct YUV420sp2RGBA8888Invoker
YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
: dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin() * 2;
int rangeEnd = range.end() * 2;
int rangeBegin = range.start * 2;
int rangeEnd = range.end * 2;
//R = 1.164(Y - 16) + 1.596(V - 128)
//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -1957,7 +1957,7 @@ struct YUV420sp2RGBA8888Invoker
};
template<int bIdx>
struct YUV420p2RGB888Invoker
struct YUV420p2RGB888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* my1, *mu, *mv;
@@ -1967,19 +1967,19 @@ struct YUV420p2RGB888Invoker
YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
: dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
const int rangeBegin = range.begin() * 2;
const int rangeEnd = range.end() * 2;
const int rangeBegin = range.start * 2;
const int rangeEnd = range.end * 2;
size_t uvsteps[2] = {width/2, stride - width/2};
int usIdx = ustepIdx, vsIdx = vstepIdx;
const uchar* y1 = my1 + rangeBegin * stride;
const uchar* u1 = mu + (range.begin() / 2) * stride;
const uchar* v1 = mv + (range.begin() / 2) * stride;
const uchar* u1 = mu + (range.start / 2) * stride;
const uchar* v1 = mv + (range.start / 2) * stride;
if(range.begin() % 2 == 1)
if(range.start % 2 == 1)
{
u1 += uvsteps[(usIdx++) & 1];
v1 += uvsteps[(vsIdx++) & 1];
@@ -2025,7 +2025,7 @@ struct YUV420p2RGB888Invoker
};
template<int bIdx>
struct YUV420p2RGBA8888Invoker
struct YUV420p2RGBA8888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* my1, *mu, *mv;
@@ -2035,19 +2035,19 @@ struct YUV420p2RGBA8888Invoker
YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
: dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin() * 2;
int rangeEnd = range.end() * 2;
int rangeBegin = range.start * 2;
int rangeEnd = range.end * 2;
size_t uvsteps[2] = {width/2, stride - width/2};
int usIdx = ustepIdx, vsIdx = vstepIdx;
const uchar* y1 = my1 + rangeBegin * stride;
const uchar* u1 = mu + (range.begin() / 2) * stride;
const uchar* v1 = mv + (range.begin() / 2) * stride;
const uchar* u1 = mu + (range.start / 2) * stride;
const uchar* v1 = mv + (range.start / 2) * stride;
if(range.begin() % 2 == 1)
if(range.start % 2 == 1)
{
u1 += uvsteps[(usIdx++) & 1];
v1 += uvsteps[(vsIdx++) & 1];
@@ -2102,48 +2102,40 @@ template<int bIdx, int uIdx>
inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
{
YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1, _uv);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows/2), converter);
parallel_for_(Range(0, _dst.rows/2), converter);
else
#endif
converter(BlockedRange(0, _dst.rows/2));
converter(Range(0, _dst.rows/2));
}
template<int bIdx, int uIdx>
inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
{
YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1, _uv);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows/2), converter);
parallel_for_(Range(0, _dst.rows/2), converter);
else
#endif
converter(BlockedRange(0, _dst.rows/2));
converter(Range(0, _dst.rows/2));
}
template<int bIdx>
inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
{
YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1, _u, _v, ustepIdx, vstepIdx);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows/2), converter);
parallel_for_(Range(0, _dst.rows/2), converter);
else
#endif
converter(BlockedRange(0, _dst.rows/2));
converter(Range(0, _dst.rows/2));
}
template<int bIdx>
inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
{
YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1, _u, _v, ustepIdx, vstepIdx);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows/2), converter);
parallel_for_(Range(0, _dst.rows/2), converter);
else
#endif
converter(BlockedRange(0, _dst.rows/2));
converter(Range(0, _dst.rows/2));
}
///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
@@ -2227,7 +2219,7 @@ static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
template<int bIdx, int uIdx, int yIdx>
struct YUV422toRGB888Invoker
struct YUV422toRGB888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* src;
@@ -2236,10 +2228,10 @@ struct YUV422toRGB888Invoker
YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
: dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin();
int rangeEnd = range.end();
int rangeBegin = range.start;
int rangeEnd = range.end;
const int uidx = 1 - yIdx + uIdx * 2;
const int vidx = (2 + uidx) % 4;
@@ -2273,7 +2265,7 @@ struct YUV422toRGB888Invoker
};
template<int bIdx, int uIdx, int yIdx>
struct YUV422toRGBA8888Invoker
struct YUV422toRGBA8888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* src;
@@ -2282,10 +2274,10 @@ struct YUV422toRGBA8888Invoker
YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
: dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin();
int rangeEnd = range.end();
int rangeBegin = range.start;
int rangeEnd = range.end;
const int uidx = 1 - yIdx + uIdx * 2;
const int vidx = (2 + uidx) % 4;
@@ -2326,24 +2318,20 @@ template<int bIdx, int uIdx, int yIdx>
inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
{
YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows), converter);
parallel_for_(Range(0, _dst.rows), converter);
else
#endif
converter(BlockedRange(0, _dst.rows));
converter(Range(0, _dst.rows));
}
template<int bIdx, int uIdx, int yIdx>
inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
{
YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows), converter);
parallel_for_(Range(0, _dst.rows), converter);
else
#endif
converter(BlockedRange(0, _dst.rows));
converter(Range(0, _dst.rows));
}
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////

View File

@@ -442,7 +442,7 @@ static void getDistanceTransformMask( int maskType, float *metrics )
}
}
struct DTColumnInvoker
struct DTColumnInvoker : ParallelLoopBody
{
DTColumnInvoker( const Mat* _src, Mat* _dst, const int* _sat_tab, const float* _sqr_tab)
{
@@ -452,9 +452,9 @@ struct DTColumnInvoker
sqr_tab = _sqr_tab;
}
void operator()( const BlockedRange& range ) const
void operator()( const Range& range ) const
{
int i, i1 = range.begin(), i2 = range.end();
int i, i1 = range.start, i2 = range.end;
int m = src->rows;
size_t sstep = src->step, dstep = dst->step/sizeof(float);
AutoBuffer<int> _d(m);
@@ -489,7 +489,7 @@ struct DTColumnInvoker
};
struct DTRowInvoker
struct DTRowInvoker : ParallelLoopBody
{
DTRowInvoker( Mat* _dst, const float* _sqr_tab, const float* _inv_tab )
{
@@ -498,10 +498,10 @@ struct DTRowInvoker
inv_tab = _inv_tab;
}
void operator()( const BlockedRange& range ) const
void operator()( const Range& range ) const
{
const float inf = 1e15f;
int i, i1 = range.begin(), i2 = range.end();
int i, i1 = range.start, i2 = range.end;
int n = dst->cols;
AutoBuffer<uchar> _buf((n+2)*2*sizeof(float) + (n+2)*sizeof(int));
float* f = (float*)(uchar*)_buf;
@@ -578,7 +578,7 @@ trueDistTrans( const Mat& src, Mat& dst )
for( ; i <= m*3; i++ )
sat_tab[i] = i - shift;
cv::parallel_for(cv::BlockedRange(0, n), cv::DTColumnInvoker(&src, &dst, sat_tab, sqr_tab));
cv::parallel_for_(cv::Range(0, n), cv::DTColumnInvoker(&src, &dst, sat_tab, sqr_tab));
// stage 2: compute modified distance transform for each row
float* inv_tab = sqr_tab + n;
@@ -590,7 +590,7 @@ trueDistTrans( const Mat& src, Mat& dst )
sqr_tab[i] = (float)(i*i);
}
cv::parallel_for(cv::BlockedRange(0, m), cv::DTRowInvoker(&dst, sqr_tab, inv_tab));
cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(&dst, sqr_tab, inv_tab));
}
@@ -664,7 +664,7 @@ distanceATS_L1_8u( const Mat& src, Mat& dst )
// do right edge
a = lut[dbase[width-1+dststep]];
dbase[width-1] = (uchar)(MIN(a, dbase[width-1]));
for( x = width - 2; x >= 0; x-- )
{
int b = dbase[x+dststep];

View File

@@ -2985,29 +2985,23 @@ cvCalcProbDensity( const CvHistogram* hist, const CvHistogram* hist_mask,
}
}
class EqualizeHistCalcHist_Invoker
class EqualizeHistCalcHist_Invoker : public cv::ParallelLoopBody
{
public:
enum {HIST_SZ = 256};
#ifdef HAVE_TBB
typedef tbb::mutex* MutextPtr;
#else
typedef void* MutextPtr;
#endif
EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, MutextPtr histogramLock)
EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, cv::Mutex* histogramLock)
: src_(src), globalHistogram_(histogram), histogramLock_(histogramLock)
{ }
void operator()( const cv::BlockedRange& rowRange ) const
void operator()( const cv::Range& rowRange ) const
{
int localHistogram[HIST_SZ] = {0, };
const size_t sstep = src_.step;
int width = src_.cols;
int height = rowRange.end() - rowRange.begin();
int height = rowRange.end - rowRange.start;
if (src_.isContinuous())
{
@@ -3015,7 +3009,7 @@ public:
height = 1;
}
for (const uchar* ptr = src_.ptr<uchar>(rowRange.begin()); height--; ptr += sstep)
for (const uchar* ptr = src_.ptr<uchar>(rowRange.start); height--; ptr += sstep)
{
int x = 0;
for (; x <= width - 4; x += 4)
@@ -3030,9 +3024,7 @@ public:
localHistogram[ptr[x]]++;
}
#ifdef HAVE_TBB
tbb::mutex::scoped_lock lock(*histogramLock_);
#endif
cv::AutoLock lock(*histogramLock_);
for( int i = 0; i < HIST_SZ; i++ )
globalHistogram_[i] += localHistogram[i];
@@ -3040,12 +3032,7 @@ public:
static bool isWorthParallel( const cv::Mat& src )
{
#ifdef HAVE_TBB
return ( src.total() >= 640*480 );
#else
(void)src;
return false;
#endif
}
private:
@@ -3053,10 +3040,10 @@ private:
cv::Mat& src_;
int* globalHistogram_;
MutextPtr histogramLock_;
cv::Mutex* histogramLock_;
};
class EqualizeHistLut_Invoker
class EqualizeHistLut_Invoker : public cv::ParallelLoopBody
{
public:
EqualizeHistLut_Invoker( cv::Mat& src, cv::Mat& dst, int* lut )
@@ -3065,13 +3052,13 @@ public:
lut_(lut)
{ }
void operator()( const cv::BlockedRange& rowRange ) const
void operator()( const cv::Range& rowRange ) const
{
const size_t sstep = src_.step;
const size_t dstep = dst_.step;
int width = src_.cols;
int height = rowRange.end() - rowRange.begin();
int height = rowRange.end - rowRange.start;
int* lut = lut_;
if (src_.isContinuous() && dst_.isContinuous())
@@ -3080,8 +3067,8 @@ public:
height = 1;
}
const uchar* sptr = src_.ptr<uchar>(rowRange.begin());
uchar* dptr = dst_.ptr<uchar>(rowRange.begin());
const uchar* sptr = src_.ptr<uchar>(rowRange.start);
uchar* dptr = dst_.ptr<uchar>(rowRange.start);
for (; height--; sptr += sstep, dptr += dstep)
{
@@ -3110,12 +3097,7 @@ public:
static bool isWorthParallel( const cv::Mat& src )
{
#ifdef HAVE_TBB
return ( src.total() >= 640*480 );
#else
(void)src;
return false;
#endif
}
private:
@@ -3142,23 +3124,18 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
if(src.empty())
return;
#ifdef HAVE_TBB
tbb::mutex histogramLockInstance;
EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = &histogramLockInstance;
#else
EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = 0;
#endif
Mutex histogramLockInstance;
const int hist_sz = EqualizeHistCalcHist_Invoker::HIST_SZ;
int hist[hist_sz] = {0,};
int lut[hist_sz];
EqualizeHistCalcHist_Invoker calcBody(src, hist, histogramLock);
EqualizeHistCalcHist_Invoker calcBody(src, hist, &histogramLockInstance);
EqualizeHistLut_Invoker lutBody(src, dst, lut);
cv::BlockedRange heightRange(0, src.rows);
cv::Range heightRange(0, src.rows);
if(EqualizeHistCalcHist_Invoker::isWorthParallel(src))
parallel_for(heightRange, calcBody);
parallel_for_(heightRange, calcBody);
else
calcBody(heightRange);
@@ -3182,303 +3159,11 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
}
if(EqualizeHistLut_Invoker::isWorthParallel(src))
parallel_for(heightRange, lutBody);
parallel_for_(heightRange, lutBody);
else
lutBody(heightRange);
}
// ----------------------------------------------------------------------
// CLAHE
namespace
{
class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
{
public:
CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
{
}
void operator ()(const cv::Range& range) const;
private:
cv::Mat src_;
mutable cv::Mat lut_;
cv::Size tileSize_;
int tilesX_;
int tilesY_;
int clipLimit_;
float lutScale_;
};
void CLAHE_CalcLut_Body::operator ()(const cv::Range& range) const
{
const int histSize = 256;
uchar* tileLut = lut_.ptr(range.start);
const size_t lut_step = lut_.step;
for (int k = range.start; k < range.end; ++k, tileLut += lut_step)
{
const int ty = k / tilesX_;
const int tx = k % tilesX_;
// retrieve tile submatrix
cv::Rect tileROI;
tileROI.x = tx * tileSize_.width;
tileROI.y = ty * tileSize_.height;
tileROI.width = tileSize_.width;
tileROI.height = tileSize_.height;
const cv::Mat tile = src_(tileROI);
// calc histogram
int tileHist[histSize] = {0, };
int height = tileROI.height;
const size_t sstep = tile.step;
for (const uchar* ptr = tile.ptr<uchar>(0); height--; ptr += sstep)
{
int x = 0;
for (; x <= tileROI.width - 4; x += 4)
{
int t0 = ptr[x], t1 = ptr[x+1];
tileHist[t0]++; tileHist[t1]++;
t0 = ptr[x+2]; t1 = ptr[x+3];
tileHist[t0]++; tileHist[t1]++;
}
for (; x < tileROI.width; ++x)
tileHist[ptr[x]]++;
}
// clip histogram
if (clipLimit_ > 0)
{
// how many pixels were clipped
int clipped = 0;
for (int i = 0; i < histSize; ++i)
{
if (tileHist[i] > clipLimit_)
{
clipped += tileHist[i] - clipLimit_;
tileHist[i] = clipLimit_;
}
}
// redistribute clipped pixels
int redistBatch = clipped / histSize;
int residual = clipped - redistBatch * histSize;
for (int i = 0; i < histSize; ++i)
tileHist[i] += redistBatch;
for (int i = 0; i < residual; ++i)
tileHist[i]++;
}
// calc Lut
int sum = 0;
for (int i = 0; i < histSize; ++i)
{
sum += tileHist[i];
tileLut[i] = cv::saturate_cast<uchar>(sum * lutScale_);
}
}
}
class CLAHE_Interpolation_Body : public cv::ParallelLoopBody
{
public:
CLAHE_Interpolation_Body(const cv::Mat& src, cv::Mat& dst, const cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY) :
src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY)
{
}
void operator ()(const cv::Range& range) const;
private:
cv::Mat src_;
mutable cv::Mat dst_;
cv::Mat lut_;
cv::Size tileSize_;
int tilesX_;
int tilesY_;
};
void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const
{
const size_t lut_step = lut_.step;
for (int y = range.start; y < range.end; ++y)
{
const uchar* srcRow = src_.ptr<uchar>(y);
uchar* dstRow = dst_.ptr<uchar>(y);
const float tyf = (static_cast<float>(y) / tileSize_.height) - 0.5f;
int ty1 = cvFloor(tyf);
int ty2 = ty1 + 1;
const float ya = tyf - ty1;
ty1 = std::max(ty1, 0);
ty2 = std::min(ty2, tilesY_ - 1);
const uchar* lutPlane1 = lut_.ptr(ty1 * tilesX_);
const uchar* lutPlane2 = lut_.ptr(ty2 * tilesX_);
for (int x = 0; x < src_.cols; ++x)
{
const float txf = (static_cast<float>(x) / tileSize_.width) - 0.5f;
int tx1 = cvFloor(txf);
int tx2 = tx1 + 1;
const float xa = txf - tx1;
tx1 = std::max(tx1, 0);
tx2 = std::min(tx2, tilesX_ - 1);
const int srcVal = srcRow[x];
const size_t ind1 = tx1 * lut_step + srcVal;
const size_t ind2 = tx2 * lut_step + srcVal;
float res = 0;
res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya));
res += lutPlane1[ind2] * ((xa) * (1.0f - ya));
res += lutPlane2[ind1] * ((1.0f - xa) * (ya));
res += lutPlane2[ind2] * ((xa) * (ya));
dstRow[x] = cv::saturate_cast<uchar>(res);
}
}
}
class CLAHE_Impl : public cv::CLAHE
{
public:
CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
cv::AlgorithmInfo* info() const;
void apply(cv::InputArray src, cv::OutputArray dst);
void setClipLimit(double clipLimit);
double getClipLimit() const;
void setTilesGridSize(cv::Size tileGridSize);
cv::Size getTilesGridSize() const;
void collectGarbage();
private:
double clipLimit_;
int tilesX_;
int tilesY_;
cv::Mat srcExt_;
cv::Mat lut_;
};
CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
{
}
CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE",
obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
obj.info()->addParam(obj, "tilesX", obj.tilesX_);
obj.info()->addParam(obj, "tilesY", obj.tilesY_))
void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
{
cv::Mat src = _src.getMat();
CV_Assert( src.type() == CV_8UC1 );
_dst.create( src.size(), src.type() );
cv::Mat dst = _dst.getMat();
const int histSize = 256;
lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
cv::Size tileSize;
cv::Mat srcForLut;
if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
{
tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
srcForLut = src;
}
else
{
cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
srcForLut = srcExt_;
}
const int tileSizeTotal = tileSize.area();
const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
int clipLimit = 0;
if (clipLimit_ > 0.0)
{
clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
clipLimit = std::max(clipLimit, 1);
}
CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
cv::parallel_for_(cv::Range(0, src.rows), interpolationBody);
}
void CLAHE_Impl::setClipLimit(double clipLimit)
{
clipLimit_ = clipLimit;
}
double CLAHE_Impl::getClipLimit() const
{
return clipLimit_;
}
void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
{
tilesX_ = tileGridSize.width;
tilesY_ = tileGridSize.height;
}
cv::Size CLAHE_Impl::getTilesGridSize() const
{
return cv::Size(tilesX_, tilesY_);
}
void CLAHE_Impl::collectGarbage()
{
srcExt_.release();
lut_.release();
}
}
cv::Ptr<cv::CLAHE> cv::createCLAHE(double clipLimit, cv::Size tileGridSize)
{
return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
}
// ----------------------------------------------------------------------
/* Implementation of RTTI and Generic Functions for CvHistogram */

View File

@@ -1081,7 +1081,7 @@ cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor)
namespace cv
{
class MorphologyRunner
class MorphologyRunner : public ParallelLoopBody
{
public:
MorphologyRunner(Mat _src, Mat _dst, int _nStripes, int _iterations,
@@ -1102,14 +1102,14 @@ public:
columnBorderType = _columnBorderType;
}
void operator () ( const BlockedRange& range ) const
void operator () ( const Range& range ) const
{
int row0 = std::min(cvRound(range.begin() * src.rows / nStripes), src.rows);
int row1 = std::min(cvRound(range.end() * src.rows / nStripes), src.rows);
int row0 = std::min(cvRound(range.start * src.rows / nStripes), src.rows);
int row1 = std::min(cvRound(range.end * src.rows / nStripes), src.rows);
/*if(0)
printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n",
src.rows, src.cols, range.begin(), range.end(), row0, row1);*/
src.rows, src.cols, range.start, range.end, row0, row1);*/
Mat srcStripe = src.rowRange(row0, row1);
Mat dstStripe = dst.rowRange(row0, row1);
@@ -1173,15 +1173,15 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
}
int nStripes = 1;
#if defined HAVE_TBB && defined HAVE_TEGRA_OPTIMIZATION
#if defined HAVE_TEGRA_OPTIMIZATION
if (src.data != dst.data && iterations == 1 && //NOTE: threads are not used for inplace processing
(borderType & BORDER_ISOLATED) == 0 && //TODO: check border types
src.rows >= 64 ) //NOTE: just heuristics
nStripes = 4;
#endif
parallel_for(BlockedRange(0, nStripes),
MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
parallel_for_(Range(0, nStripes),
MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
//Ptr<FilterEngine> f = createMorphologyFilter(op, src.type(),
// kernel, anchor, borderType, borderType, borderValue );

View File

@@ -40,10 +40,6 @@
#include "precomp.hpp"
#ifdef HAVE_TBB
#include <tbb/tbb.h>
#endif
CvANN_MLP_TrainParams::CvANN_MLP_TrainParams()
{
term_crit = cvTermCriteria( CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 1000, 0.01 );
@@ -1022,7 +1018,7 @@ int CvANN_MLP::train_backprop( CvVectors x0, CvVectors u, const double* sw )
return iter;
}
struct rprop_loop {
struct rprop_loop : cv::ParallelLoopBody {
rprop_loop(const CvANN_MLP* _point, double**& _weights, int& _count, int& _ivcount, CvVectors* _x0,
int& _l_count, CvMat*& _layer_sizes, int& _ovcount, int& _max_count,
CvVectors* _u, const double*& _sw, double& _inv_count, CvMat*& _dEdw, int& _dcount0, double* _E, int _buf_sz)
@@ -1063,7 +1059,7 @@ struct rprop_loop {
int buf_sz;
void operator()( const cv::BlockedRange& range ) const
void operator()( const cv::Range& range ) const
{
double* buf_ptr;
double** x = 0;
@@ -1084,7 +1080,7 @@ struct rprop_loop {
buf_ptr += (df[i] - x[i])*2;
}
for(int si = range.begin(); si < range.end(); si++ )
for(int si = range.start; si < range.end; si++ )
{
if (si % dcount0 != 0) continue;
int n1, n2, k;
@@ -1170,36 +1166,33 @@ struct rprop_loop {
}
// backward pass, update dEdw
#ifdef HAVE_TBB
static tbb::spin_mutex mutex;
tbb::spin_mutex::scoped_lock lock;
#endif
static cv::Mutex mutex;
for(int i = l_count-1; i > 0; i-- )
{
n1 = layer_sizes->data.i[i-1]; n2 = layer_sizes->data.i[i];
cvInitMatHeader( &_df, dcount, n2, CV_64F, df[i] );
cvMul( grad1, &_df, grad1 );
#ifdef HAVE_TBB
lock.acquire(mutex);
#endif
cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );
// update bias part of dEdw
for( k = 0; k < dcount; k++ )
{
double* dst = _dEdw.data.db + n1*n2;
const double* src = grad1->data.db + k*n2;
for(int j = 0; j < n2; j++ )
dst[j] += src[j];
{
cv::AutoLock lock(mutex);
cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );
// update bias part of dEdw
for( k = 0; k < dcount; k++ )
{
double* dst = _dEdw.data.db + n1*n2;
const double* src = grad1->data.db + k*n2;
for(int j = 0; j < n2; j++ )
dst[j] += src[j];
}
if (i > 1)
cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
}
if (i > 1)
cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
#ifdef HAVE_TBB
lock.release();
#endif
cvInitMatHeader( grad2, dcount, n1, CV_64F, grad2->data.db );
if( i > 1 )
cvGEMM( grad1, &_w, 1, 0, 0, grad2, CV_GEMM_B_T );
@@ -1297,7 +1290,7 @@ int CvANN_MLP::train_rprop( CvVectors x0, CvVectors u, const double* sw )
double E = 0;
// first, iterate through all the samples and compute dEdw
cv::parallel_for(cv::BlockedRange(0, count),
cv::parallel_for_(cv::Range(0, count),
rprop_loop(this, weights, count, ivcount, &x0, l_count, layer_sizes,
ovcount, max_count, &u, sw, inv_count, dEdw, dcount0, &E, buf_sz)
);

View File

@@ -884,7 +884,7 @@ float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
}
class Tree_predictor
class Tree_predictor : public cv::ParallelLoopBody
{
private:
pCvSeq* weak;
@@ -894,9 +894,7 @@ private:
const CvMat* missing;
const float shrinkage;
#ifdef HAVE_TBB
static tbb::spin_mutex SumMutex;
#endif
static cv::Mutex SumMutex;
public:
@@ -915,14 +913,11 @@ public:
Tree_predictor& operator=( const Tree_predictor& )
{ return *this; }
virtual void operator()(const cv::BlockedRange& range) const
virtual void operator()(const cv::Range& range) const
{
#ifdef HAVE_TBB
tbb::spin_mutex::scoped_lock lock;
#endif
CvSeqReader reader;
int begin = range.begin();
int end = range.end();
int begin = range.start;
int end = range.end;
int weak_count = end - begin;
CvDTree* tree;
@@ -940,13 +935,11 @@ public:
tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
}
}
#ifdef HAVE_TBB
lock.acquire(SumMutex);
sum[i] += tmp_sum;
lock.release();
#else
sum[i] += tmp_sum;
#endif
{
cv::AutoLock lock(SumMutex);
sum[i] += tmp_sum;
}
}
} // Tree_predictor::operator()
@@ -954,11 +947,7 @@ public:
}; // class Tree_predictor
#ifdef HAVE_TBB
tbb::spin_mutex Tree_predictor::SumMutex;
#endif
cv::Mutex Tree_predictor::SumMutex;
float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
@@ -976,12 +965,7 @@ float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
params.shrinkage, _sample, _missing, sum);
//#ifdef HAVE_TBB
// tbb::parallel_for(cv::BlockedRange(begin, end), predictor,
// tbb::auto_partitioner());
//#else
cv::parallel_for(cv::BlockedRange(begin, end), predictor);
//#endif
cv::parallel_for_(cv::Range(begin, end), predictor);
for (int i=0; i<class_count; ++i)
sum[i] = sum[i] /** params.shrinkage*/ + base_value;
@@ -1210,7 +1194,7 @@ void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
//===========================================================================
class Sample_predictor
class Sample_predictor : public cv::ParallelLoopBody
{
private:
const CvGBTrees* gbt;
@@ -1240,10 +1224,10 @@ public:
{}
virtual void operator()(const cv::BlockedRange& range) const
virtual void operator()(const cv::Range& range) const
{
int begin = range.begin();
int end = range.end();
int begin = range.start;
int end = range.end;
CvMat x;
CvMat miss;
@@ -1299,11 +1283,7 @@ CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
_data->get_missing(), _sample_idx);
//#ifdef HAVE_TBB
// tbb::parallel_for(cv::BlockedRange(0,n), predictor, tbb::auto_partitioner());
//#else
cv::parallel_for(cv::BlockedRange(0,n), predictor);
//#endif
cv::parallel_for_(cv::Range(0,n), predictor);
int* sidx = _sample_idx ? _sample_idx->data.i : 0;
int r_step = CV_IS_MAT_CONT(response->type) ?

View File

@@ -306,7 +306,7 @@ float CvKNearest::write_results( int k, int k1, int start, int end,
return result;
}
struct P1 {
struct P1 : cv::ParallelLoopBody {
P1(const CvKNearest* _pointer, int _buf_sz, int _k, const CvMat* __samples, const float** __neighbors,
int _k1, CvMat* __results, CvMat* __neighbor_responses, CvMat* __dist, float* _result)
{
@@ -333,10 +333,10 @@ struct P1 {
float* result;
int buf_sz;
void operator()( const cv::BlockedRange& range ) const
void operator()( const cv::Range& range ) const
{
cv::AutoBuffer<float> buf(buf_sz);
for(int i = range.begin(); i < range.end(); i += 1 )
for(int i = range.start; i < range.end; i += 1 )
{
float* neighbor_responses = &buf[0];
float* dist = neighbor_responses + 1*k;
@@ -410,8 +410,8 @@ float CvKNearest::find_nearest( const CvMat* _samples, int k, CvMat* _results,
int k1 = get_sample_count();
k1 = MIN( k1, k );
cv::parallel_for(cv::BlockedRange(0, count), P1(this, buf_sz, k, _samples, _neighbors, k1,
_results, _neighbor_responses, _dist, &result)
cv::parallel_for_(cv::Range(0, count), P1(this, buf_sz, k, _samples, _neighbors, k1,
_results, _neighbor_responses, _dist, &result)
);
return result;

View File

@@ -277,7 +277,7 @@ bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _res
return result;
}
struct predict_body {
struct predict_body : cv::ParallelLoopBody {
predict_body(CvMat* _c, CvMat** _cov_rotate_mats, CvMat** _inv_eigen_values, CvMat** _avg,
const CvMat* _samples, const int* _vidx, CvMat* _cls_labels,
CvMat* _results, float* _value, int _var_count1
@@ -307,7 +307,7 @@ struct predict_body {
float* value;
int var_count1;
void operator()( const cv::BlockedRange& range ) const
void operator()( const cv::Range& range ) const
{
int cls = -1;
@@ -324,7 +324,7 @@ struct predict_body {
cv::AutoBuffer<double> buffer(nclasses + var_count1);
CvMat diff = cvMat( 1, var_count1, CV_64FC1, &buffer[0] );
for(int k = range.begin(); k < range.end(); k += 1 )
for(int k = range.start; k < range.end; k += 1 )
{
int ival;
double opt = FLT_MAX;
@@ -397,9 +397,9 @@ float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) c
const int* vidx = var_idx ? var_idx->data.i : 0;
cv::parallel_for(cv::BlockedRange(0, samples->rows), predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
vidx, cls_labels, results, &value, var_count
));
cv::parallel_for_(cv::Range(0, samples->rows),
predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
vidx, cls_labels, results, &value, var_count));
return value;
}

View File

@@ -2192,7 +2192,7 @@ float CvSVM::predict( const CvMat* sample, bool returnDFVal ) const
return result;
}
struct predict_body_svm {
struct predict_body_svm : ParallelLoopBody {
predict_body_svm(const CvSVM* _pointer, float* _result, const CvMat* _samples, CvMat* _results)
{
pointer = _pointer;
@@ -2206,9 +2206,9 @@ struct predict_body_svm {
const CvMat* samples;
CvMat* results;
void operator()( const cv::BlockedRange& range ) const
void operator()( const cv::Range& range ) const
{
for(int i = range.begin(); i < range.end(); i++ )
for(int i = range.start; i < range.end; i++ )
{
CvMat sample;
cvGetRow( samples, &sample, i );
@@ -2224,7 +2224,7 @@ struct predict_body_svm {
float CvSVM::predict(const CvMat* samples, CV_OUT CvMat* results) const
{
float result = 0;
cv::parallel_for(cv::BlockedRange(0, samples->rows),
cv::parallel_for_(cv::Range(0, samples->rows),
predict_body_svm(this, &result, samples, results)
);
return result;

View File

@@ -258,7 +258,7 @@ interpolateKeypoint( float N9[3][9], int dx, int dy, int ds, KeyPoint& kpt )
}
// Multi-threaded construction of the scale-space pyramid
struct SURFBuildInvoker
struct SURFBuildInvoker : ParallelLoopBody
{
SURFBuildInvoker( const Mat& _sum, const std::vector<int>& _sizes,
const std::vector<int>& _sampleSteps,
@@ -271,9 +271,9 @@ struct SURFBuildInvoker
traces = &_traces;
}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
for( int i=range.begin(); i<range.end(); i++ )
for( int i=range.start; i<range.end; i++ )
calcLayerDetAndTrace( *sum, (*sizes)[i], (*sampleSteps)[i], (*dets)[i], (*traces)[i] );
}
@@ -285,7 +285,7 @@ struct SURFBuildInvoker
};
// Multi-threaded search of the scale-space pyramid for keypoints
struct SURFFindInvoker
struct SURFFindInvoker : ParallelLoopBody
{
SURFFindInvoker( const Mat& _sum, const Mat& _mask_sum,
const std::vector<Mat>& _dets, const std::vector<Mat>& _traces,
@@ -310,9 +310,9 @@ struct SURFFindInvoker
const std::vector<int>& sizes, std::vector<KeyPoint>& keypoints,
int octave, int layer, float hessianThreshold, int sampleStep );
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
for( int i=range.begin(); i<range.end(); i++ )
for( int i=range.start; i<range.end; i++ )
{
int layer = (*middleIndices)[i];
int octave = i / nOctaveLayers;
@@ -333,14 +333,10 @@ struct SURFFindInvoker
int nOctaveLayers;
float hessianThreshold;
#ifdef HAVE_TBB
static tbb::mutex findMaximaInLayer_m;
#endif
static Mutex findMaximaInLayer_m;
};
#ifdef HAVE_TBB
tbb::mutex SURFFindInvoker::findMaximaInLayer_m;
#endif
Mutex SURFFindInvoker::findMaximaInLayer_m;
/*
@@ -437,9 +433,7 @@ void SURFFindInvoker::findMaximaInLayer( const Mat& sum, const Mat& mask_sum,
if( interp_ok )
{
/*printf( "KeyPoint %f %f %d\n", point.pt.x, point.pt.y, point.size );*/
#ifdef HAVE_TBB
tbb::mutex::scoped_lock lock(findMaximaInLayer_m);
#endif
cv::AutoLock lock(findMaximaInLayer_m);
keypoints.push_back(kpt);
}
}
@@ -505,20 +499,20 @@ static void fastHessianDetector( const Mat& sum, const Mat& mask_sum, std::vecto
}
// Calculate hessian determinant and trace samples in each layer
parallel_for( BlockedRange(0, nTotalLayers),
SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );
parallel_for_( Range(0, nTotalLayers),
SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );
// Find maxima in the determinant of the hessian
parallel_for( BlockedRange(0, nMiddleLayers),
SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
sampleSteps, middleIndices, keypoints,
nOctaveLayers, hessianThreshold) );
parallel_for_( Range(0, nMiddleLayers),
SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
sampleSteps, middleIndices, keypoints,
nOctaveLayers, hessianThreshold) );
std::sort(keypoints.begin(), keypoints.end(), KeypointGreater());
}
struct SURFInvoker
struct SURFInvoker : ParallelLoopBody
{
enum { ORI_RADIUS = 6, ORI_WIN = 60, PATCH_SZ = 20 };
@@ -566,7 +560,7 @@ struct SURFInvoker
}
}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
/* X and Y gradient wavelet data */
const int NX=2, NY=2;
@@ -584,7 +578,7 @@ struct SURFInvoker
int dsize = extended ? 128 : 64;
int k, k1 = range.begin(), k2 = range.end();
int k, k1 = range.start, k2 = range.end;
float maxSize = 0;
for( k = k1; k < k2; k++ )
{
@@ -952,7 +946,7 @@ void SURF::operator()(InputArray _img, InputArray _mask,
// we call SURFInvoker in any case, even if we do not need descriptors,
// since it computes orientation of each feature.
parallel_for(BlockedRange(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );
parallel_for_(Range(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );
// remove keypoints that were marked for deletion
for( i = j = 0; i < N; i++ )

View File

@@ -1164,15 +1164,10 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o
int stripCount, stripSize;
#ifdef HAVE_TBB
const int PTS_PER_THREAD = 1000;
stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
stripCount = std::min(std::max(stripCount, 1), 100);
stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;
#else
stripCount = 1;
stripSize = processingRectSize.height;
#endif
if( !detectSingleScale( scaledImage, stripCount, processingRectSize, stripSize, yStep, factor, candidates,
rejectLevels, levelWeights, outputRejectLevels ) )

View File

@@ -582,7 +582,6 @@ int searchObjectThresholdSomeComponents(const CvLSVMFeaturePyramid *H,
// For each component perform searching
for (i = 0; i < kComponents; i++)
{
#ifdef HAVE_TBB
int error = searchObjectThreshold(H, &(filters[componentIndex]), kPartFilters[i],
b[i], maxXBorder, maxYBorder, scoreThreshold,
&(pointsArr[i]), &(levelsArr[i]), &(kPointsArr[i]),
@@ -598,13 +597,6 @@ int searchObjectThresholdSomeComponents(const CvLSVMFeaturePyramid *H,
free(partsDisplacementArr);
return LATENT_SVM_SEARCH_OBJECT_FAILED;
}
#else
(void)numThreads;
searchObjectThreshold(H, &(filters[componentIndex]), kPartFilters[i],
b[i], maxXBorder, maxYBorder, scoreThreshold,
&(pointsArr[i]), &(levelsArr[i]), &(kPointsArr[i]),
&(scoreArr[i]), &(partsDisplacementArr[i]));
#endif
estimateBoxes(pointsArr[i], levelsArr[i], kPointsArr[i],
filters[componentIndex]->sizeX, filters[componentIndex]->sizeY, &(oppPointsArr[i]));
componentIndex += (kPartFilters[i] + 1);

View File

@@ -121,8 +121,9 @@ namespace cv
CV_EXPORTS void setBinpath(const char *path);
//The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue
//returns cl_context *
CV_EXPORTS void* getoclContext();
//returns cl_command_queue *
CV_EXPORTS void* getoclCommandQueue();
//explicit call clFinish. The global command queue will be used.
@@ -460,6 +461,7 @@ namespace cv
// support all C1 types
CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);
//! finds global minimum and maximum array elements and returns their values with locations
// support all C1 types
@@ -808,7 +810,11 @@ namespace cv
CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum);
CV_EXPORTS void integral(const oclMat &src, oclMat &sum);
CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
@@ -826,13 +832,14 @@ namespace cv
};
#endif
#if 0
class CV_EXPORTS OclCascadeClassifierBuf : public cv::CascadeClassifier
{
public:
OclCascadeClassifierBuf() :
m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {}
~OclCascadeClassifierBuf() {}
~OclCascadeClassifierBuf() { release(); }
void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
@@ -864,6 +871,7 @@ namespace cv
oclMat gimg1, gsum, gsqsum;
void * buffers;
};
#endif
/////////////////////////////// Pyramid /////////////////////////////////////
CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
@@ -1388,6 +1396,51 @@ namespace cv
explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {}
};
class CV_EXPORTS GoodFeaturesToTrackDetector_OCL
{
public:
explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
//! return 1 rows matrix with CV_32FC2 type
void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
//! download points of type Point2f to a vector. the vector's content will be erased
void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
int maxCorners;
double qualityLevel;
double minDistance;
int blockSize;
bool useHarrisDetector;
double harrisK;
void releaseMemory()
{
Dx_.release();
Dy_.release();
eig_.release();
minMaxbuf_.release();
tmpCorners_.release();
}
private:
oclMat Dx_;
oclMat Dy_;
oclMat eig_;
oclMat minMaxbuf_;
oclMat tmpCorners_;
};
inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
int blockSize_, bool useHarrisDetector_, double harrisK_)
{
maxCorners = maxCorners_;
qualityLevel = qualityLevel_;
minDistance = minDistance_;
blockSize = blockSize_;
useHarrisDetector = useHarrisDetector_;
harrisK = harrisK_;
}
/////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
class CV_EXPORTS PyrLKOpticalFlow

View File

@@ -47,7 +47,7 @@
#define __OPENCV_OCL_PRIVATE_UTIL__
#if defined __APPLE__
#include <OpenCL/OpenCL.h>
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
@@ -121,6 +121,33 @@ namespace cv
cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
void CV_EXPORTS releaseTexture(cl_mem& texture);
//Represents an image texture object
class CV_EXPORTS TextureCL
{
public:
TextureCL(cl_mem tex, int r, int c, int t)
: tex_(tex), rows(r), cols(c), type(t) {}
~TextureCL()
{
openCLFree(tex_);
}
operator cl_mem()
{
return tex_;
}
cl_mem const tex_;
const int rows;
const int cols;
const int type;
private:
//disable assignment
void operator=(const TextureCL&);
};
// bind oclMat to OpenCL image textures and retunrs an TextureCL object
// note:
// for faster clamping, there is no buffer padding for the constructed texture
Ptr<TextureCL> CV_EXPORTS bindTexturePtr(const oclMat &mat);
// returns whether the current context supports image2d_t format or not
bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
@@ -132,7 +159,7 @@ namespace cv
};
template<DEVICE_INFO _it, typename _ty>
_ty queryDeviceInfo(cl_kernel kernel = NULL);
//info should have been pre-allocated
template<>
int CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel);
template<>

View File

@@ -48,7 +48,7 @@
///////////// Lut ////////////////////////
PERFTEST(lut)
{
Mat src, lut, dst;
Mat src, lut, dst, ocl_dst;
ocl::oclMat d_src, d_lut, d_dst;
int all_type[] = {CV_8UC1, CV_8UC3};
@@ -77,11 +77,6 @@ PERFTEST(lut)
ocl::LUT(d_src, d_lut, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0));
GPU_ON;
ocl::LUT(d_src, d_lut, d_dst);
GPU_OFF;
@@ -90,9 +85,10 @@ PERFTEST(lut)
d_src.upload(src);
d_lut.upload(lut);
ocl::LUT(d_src, d_lut, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0);
}
}
@@ -101,7 +97,7 @@ PERFTEST(lut)
///////////// Exp ////////////////////////
PERFTEST(Exp)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -121,11 +117,6 @@ PERFTEST(Exp)
ocl::exp(d_src, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 2));
GPU_ON;
ocl::exp(d_src, d_dst);
GPU_OFF;
@@ -133,15 +124,17 @@ PERFTEST(Exp)
GPU_FULL_ON;
d_src.upload(src);
ocl::exp(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 2);
}
}
///////////// LOG ////////////////////////
PERFTEST(Log)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -161,11 +154,6 @@ PERFTEST(Log)
ocl::log(d_src, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1));
GPU_ON;
ocl::log(d_src, d_dst);
GPU_OFF;
@@ -173,15 +161,17 @@ PERFTEST(Log)
GPU_FULL_ON;
d_src.upload(src);
ocl::log(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
}
}
///////////// Add ////////////////////////
PERFTEST(Add)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_32FC1};
@@ -201,6 +191,7 @@ PERFTEST(Add)
CPU_ON;
add(src1, src2, dst);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -208,11 +199,6 @@ PERFTEST(Add)
ocl::add(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::add(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -221,8 +207,10 @@ PERFTEST(Add)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::add(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -231,7 +219,7 @@ PERFTEST(Add)
///////////// Mul ////////////////////////
PERFTEST(Mul)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -260,11 +248,6 @@ PERFTEST(Mul)
ocl::multiply(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::multiply(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -273,8 +256,10 @@ PERFTEST(Mul)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::multiply(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -283,7 +268,7 @@ PERFTEST(Mul)
///////////// Div ////////////////////////
PERFTEST(Div)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -304,6 +289,7 @@ PERFTEST(Div)
CPU_ON;
divide(src1, src2, dst);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -311,11 +297,6 @@ PERFTEST(Div)
ocl::divide(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1));
GPU_ON;
ocl::divide(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -324,8 +305,10 @@ PERFTEST(Div)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::divide(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
}
}
@@ -334,7 +317,7 @@ PERFTEST(Div)
///////////// Absdiff ////////////////////////
PERFTEST(Absdiff)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -355,6 +338,7 @@ PERFTEST(Absdiff)
CPU_ON;
absdiff(src1, src2, dst);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -362,11 +346,6 @@ PERFTEST(Absdiff)
ocl::absdiff(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::absdiff(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -375,8 +354,10 @@ PERFTEST(Absdiff)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::absdiff(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -385,7 +366,7 @@ PERFTEST(Absdiff)
///////////// CartToPolar ////////////////////////
PERFTEST(CartToPolar)
{
Mat src1, src2, dst, dst1;
Mat src1, src2, dst, dst1, ocl_dst, ocl_dst1;
ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
int all_type[] = {CV_32FC1};
@@ -408,6 +389,7 @@ PERFTEST(CartToPolar)
CPU_ON;
cartToPolar(src1, src2, dst, dst1, 1);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -415,14 +397,6 @@ PERFTEST(CartToPolar)
ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
cv::Mat ocl_mat_dst1;
d_dst1.download(ocl_mat_dst1);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst1, dst1, 0.5)&&ExpectedMatNear(ocl_mat_dst, dst, 0.5));
GPU_ON;
ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
GPU_OFF;
@@ -431,9 +405,15 @@ PERFTEST(CartToPolar)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
d_dst.download(dst);
d_dst1.download(dst1);
d_dst.download(ocl_dst);
d_dst1.download(ocl_dst1);
GPU_FULL_OFF;
double diff1 = checkNorm(ocl_dst1, dst1);
double diff2 = checkNorm(ocl_dst, dst);
double max_diff = max(diff1, diff2);
TestSystem::instance().setAccurate(max_diff<=.5?1:0, max_diff);
}
}
@@ -442,7 +422,7 @@ PERFTEST(CartToPolar)
///////////// PolarToCart ////////////////////////
PERFTEST(PolarToCart)
{
Mat src1, src2, dst, dst1;
Mat src1, src2, dst, dst1, ocl_dst, ocl_dst1;
ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
int all_type[] = {CV_32FC1};
@@ -472,14 +452,6 @@ PERFTEST(PolarToCart)
ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
cv::Mat ocl_mat_dst1;
d_dst1.download(ocl_mat_dst1);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst1, dst1, 0.5)&&ExpectedMatNear(ocl_mat_dst, dst, 0.5));
GPU_ON;
ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
GPU_OFF;
@@ -488,9 +460,15 @@ PERFTEST(PolarToCart)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
d_dst.download(dst);
d_dst1.download(dst1);
d_dst.download(ocl_dst);
d_dst1.download(ocl_dst1);
GPU_FULL_OFF;
double diff1 = checkNorm(ocl_dst1, dst1);
double diff2 = checkNorm(ocl_dst, dst);
double max_diff = max(diff1, diff2);
TestSystem::instance().setAccurate(max_diff<=.5?1:0, max_diff);
}
}
@@ -499,7 +477,7 @@ PERFTEST(PolarToCart)
///////////// Magnitude ////////////////////////
PERFTEST(magnitude)
{
Mat x, y, mag;
Mat x, y, mag, ocl_mag;
ocl::oclMat d_x, d_y, d_mag;
int all_type[] = {CV_32FC1};
@@ -526,11 +504,6 @@ PERFTEST(magnitude)
ocl::magnitude(d_x, d_y, d_mag);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_mag.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, mag, 1e-5));
GPU_ON;
ocl::magnitude(d_x, d_y, d_mag);
GPU_OFF;
@@ -539,8 +512,10 @@ PERFTEST(magnitude)
d_x.upload(x);
d_y.upload(y);
ocl::magnitude(d_x, d_y, d_mag);
d_mag.download(mag);
d_mag.download(ocl_mag);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_mag, mag, 1e-5);
}
}
@@ -549,7 +524,7 @@ PERFTEST(magnitude)
///////////// Transpose ////////////////////////
PERFTEST(Transpose)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -575,11 +550,6 @@ PERFTEST(Transpose)
ocl::transpose(d_src, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
GPU_ON;
ocl::transpose(d_src, d_dst);
GPU_OFF;
@@ -587,8 +557,10 @@ PERFTEST(Transpose)
GPU_FULL_ON;
d_src.upload(src);
ocl::transpose(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}
@@ -597,7 +569,7 @@ PERFTEST(Transpose)
///////////// Flip ////////////////////////
PERFTEST(Flip)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -623,11 +595,6 @@ PERFTEST(Flip)
ocl::flip(d_src, d_dst, 0);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
GPU_ON;
ocl::flip(d_src, d_dst, 0);
GPU_OFF;
@@ -635,8 +602,10 @@ PERFTEST(Flip)
GPU_FULL_ON;
d_src.upload(src);
ocl::flip(d_src, d_dst, 0);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}
@@ -671,7 +640,10 @@ PERFTEST(minMax)
ocl::minMax(d_src, &min_val_, &max_val_);
WARMUP_OFF;
TestSystem::instance().setAccurate(EeceptDoubleEQ<double>(max_val_, max_val)&&EeceptDoubleEQ<double>(min_val_, min_val));
if(EeceptDoubleEQ<double>(max_val_, max_val) && EeceptDoubleEQ<double>(min_val_, min_val))
TestSystem::instance().setAccurate(1, max(fabs(max_val_-max_val), fabs(min_val_-min_val)));
else
TestSystem::instance().setAccurate(0, max(fabs(max_val_-max_val), fabs(min_val_-min_val)));
GPU_ON;
ocl::minMax(d_src, &min_val, &max_val);
@@ -724,8 +696,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<unsigned char>(min_loc_);
maxlocVal = src.at<unsigned char>(max_loc);
maxlocVal_ = src.at<unsigned char>(max_loc_);
error0 = ::abs(src.at<unsigned char>(min_loc_) - src.at<unsigned char>(min_loc));
error1 = ::abs(src.at<unsigned char>(max_loc_) - src.at<unsigned char>(max_loc));
}
if(src.depth() == 1)
{
@@ -733,8 +703,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<signed char>(min_loc_);
maxlocVal = src.at<signed char>(max_loc);
maxlocVal_ = src.at<signed char>(max_loc_);
error0 = ::abs(src.at<signed char>(min_loc_) - src.at<signed char>(min_loc));
error1 = ::abs(src.at<signed char>(max_loc_) - src.at<signed char>(max_loc));
}
if(src.depth() == 2)
{
@@ -742,8 +710,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<unsigned short>(min_loc_);
maxlocVal = src.at<unsigned short>(max_loc);
maxlocVal_ = src.at<unsigned short>(max_loc_);
error0 = ::abs(src.at<unsigned short>(min_loc_) - src.at<unsigned short>(min_loc));
error1 = ::abs(src.at<unsigned short>(max_loc_) - src.at<unsigned short>(max_loc));
}
if(src.depth() == 3)
{
@@ -751,8 +717,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<signed short>(min_loc_);
maxlocVal = src.at<signed short>(max_loc);
maxlocVal_ = src.at<signed short>(max_loc_);
error0 = ::abs(src.at<signed short>(min_loc_) - src.at<signed short>(min_loc));
error1 = ::abs(src.at<signed short>(max_loc_) - src.at<signed short>(max_loc));
}
if(src.depth() == 4)
{
@@ -760,8 +724,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<int>(min_loc_);
maxlocVal = src.at<int>(max_loc);
maxlocVal_ = src.at<int>(max_loc_);
error0 = ::abs(src.at<int>(min_loc_) - src.at<int>(min_loc));
error1 = ::abs(src.at<int>(max_loc_) - src.at<int>(max_loc));
}
if(src.depth() == 5)
{
@@ -769,8 +731,6 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<float>(min_loc_);
maxlocVal = src.at<float>(max_loc);
maxlocVal_ = src.at<float>(max_loc_);
error0 = ::abs(src.at<float>(min_loc_) - src.at<float>(min_loc));
error1 = ::abs(src.at<float>(max_loc_) - src.at<float>(max_loc));
}
if(src.depth() == 6)
{
@@ -778,16 +738,16 @@ PERFTEST(minMaxLoc)
minlocVal_ = src.at<double>(min_loc_);
maxlocVal = src.at<double>(max_loc);
maxlocVal_ = src.at<double>(max_loc_);
error0 = ::abs(src.at<double>(min_loc_) - src.at<double>(min_loc));
error1 = ::abs(src.at<double>(max_loc_) - src.at<double>(max_loc));
}
TestSystem::instance().setAccurate(EeceptDoubleEQ<double>(error1, 0.0)
&&EeceptDoubleEQ<double>(error0, 0.0)
&&EeceptDoubleEQ<double>(maxlocVal_, maxlocVal)
error0 = ::abs(minlocVal_ - minlocVal);
error1 = ::abs(maxlocVal_ - maxlocVal);
if( EeceptDoubleEQ<double>(maxlocVal_, maxlocVal)
&&EeceptDoubleEQ<double>(minlocVal_, minlocVal)
&&EeceptDoubleEQ<double>(max_val_, max_val)
&&EeceptDoubleEQ<double>(min_val_, min_val));
&&EeceptDoubleEQ<double>(min_val_, min_val))
TestSystem::instance().setAccurate(1, 0.);
else
TestSystem::instance().setAccurate(0, max(error0, error1));
GPU_ON;
ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
@@ -831,11 +791,13 @@ PERFTEST(Sum)
gpures = ocl::sum(d_src);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExceptDoubleNear(cpures[3], gpures[3], 0.1)
&&ExceptDoubleNear(cpures[2], gpures[2], 0.1)
&&ExceptDoubleNear(cpures[1], gpures[1], 0.1)
&&ExceptDoubleNear(cpures[0], gpures[0], 0.1));
vector<double> diffs(4);
diffs[3] = fabs(cpures[3] - gpures[3]);
diffs[2] = fabs(cpures[2] - gpures[2]);
diffs[1] = fabs(cpures[1] - gpures[1]);
diffs[0] = fabs(cpures[0] - gpures[0]);
double max_diff = *max_element(diffs.begin(), diffs.end());
TestSystem::instance().setAccurate(max_diff<0.1?1:0, max_diff);
GPU_ON;
gpures = ocl::sum(d_src);
@@ -879,7 +841,11 @@ PERFTEST(countNonZero)
gpures = ocl::countNonZero(d_src);
WARMUP_OFF;
TestSystem::instance().setAccurate((EeceptDoubleEQ<double>((double)cpures, (double)gpures)));
int diff = abs(cpures - gpures);
if(diff == 0)
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, diff);
GPU_ON;
ocl::countNonZero(d_src);
@@ -897,7 +863,7 @@ PERFTEST(countNonZero)
///////////// Phase ////////////////////////
PERFTEST(Phase)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_32FC1};
@@ -913,12 +879,12 @@ PERFTEST(Phase)
gen(src2, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
phase(src1, src2, dst, 1);
CPU_ON;
phase(src1, src2, dst, 1);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -926,11 +892,6 @@ PERFTEST(Phase)
ocl::phase(d_src1, d_src2, d_dst, 1);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-2));
GPU_ON;
ocl::phase(d_src1, d_src2, d_dst, 1);
GPU_OFF;
@@ -939,8 +900,10 @@ PERFTEST(Phase)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::phase(d_src1, d_src2, d_dst, 1);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-2);
}
}
@@ -949,7 +912,7 @@ PERFTEST(Phase)
///////////// bitwise_and////////////////////////
PERFTEST(bitwise_and)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_8UC1, CV_32SC1};
@@ -965,7 +928,6 @@ PERFTEST(bitwise_and)
gen(src2, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
bitwise_and(src1, src2, dst);
CPU_ON;
@@ -978,11 +940,6 @@ PERFTEST(bitwise_and)
ocl::bitwise_and(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::bitwise_and(d_src1, d_src2, d_dst);
GPU_OFF;
@@ -991,8 +948,10 @@ PERFTEST(bitwise_and)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::bitwise_and(d_src1, d_src2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -1001,7 +960,7 @@ PERFTEST(bitwise_and)
///////////// bitwise_not////////////////////////
PERFTEST(bitwise_not)
{
Mat src1, dst;
Mat src1, dst, ocl_dst;
ocl::oclMat d_src1, d_dst;
int all_type[] = {CV_8UC1, CV_32SC1};
@@ -1016,7 +975,6 @@ PERFTEST(bitwise_not)
gen(src1, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
bitwise_not(src1, dst);
CPU_ON;
@@ -1028,11 +986,6 @@ PERFTEST(bitwise_not)
ocl::bitwise_not(d_src1, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::bitwise_not(d_src1, d_dst);
GPU_OFF;
@@ -1040,8 +993,10 @@ PERFTEST(bitwise_not)
GPU_FULL_ON;
d_src1.upload(src1);
ocl::bitwise_not(d_src1, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -1050,7 +1005,7 @@ PERFTEST(bitwise_not)
///////////// compare////////////////////////
PERFTEST(compare)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int CMP_EQ = 0;
@@ -1067,12 +1022,12 @@ PERFTEST(compare)
gen(src2, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
compare(src1, src2, dst, CMP_EQ);
CPU_ON;
compare(src1, src2, dst, CMP_EQ);
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
@@ -1080,11 +1035,6 @@ PERFTEST(compare)
ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
GPU_ON;
ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
GPU_OFF;
@@ -1093,8 +1043,10 @@ PERFTEST(compare)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
}
}
@@ -1103,7 +1055,7 @@ PERFTEST(compare)
///////////// pow ////////////////////////
PERFTEST(pow)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_32FC1};
@@ -1129,11 +1081,6 @@ PERFTEST(pow)
ocl::pow(d_src, -2.0, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1.0));
GPU_ON;
ocl::pow(d_src, -2.0, d_dst);
GPU_OFF;
@@ -1141,8 +1088,10 @@ PERFTEST(pow)
GPU_FULL_ON;
d_src.upload(src);
ocl::pow(d_src, -2.0, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
}
}
@@ -1151,7 +1100,7 @@ PERFTEST(pow)
///////////// MagnitudeSqr////////////////////////
PERFTEST(MagnitudeSqr)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
int all_type[] = {CV_32FC1};
@@ -1167,53 +1116,36 @@ PERFTEST(MagnitudeSqr)
gen(src2, size, size, all_type[t], 0, 256);
gen(dst, size, size, all_type[t], 0, 256);
CPU_ON;
for (int i = 0; i < src1.rows; ++i)
for (int j = 0; j < src1.cols; ++j)
{
float val1 = src1.at<float>(i, j);
float val2 = src2.at<float>(i, j);
((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
}
CPU_OFF;
CPU_ON;
d_src1.upload(src1);
d_src2.upload(src2);
for (int i = 0; i < src1.rows; ++i)
for (int j = 0; j < src1.cols; ++j)
{
float val1 = src1.at<float>(i, j);
float val2 = src2.at<float>(i, j);
WARMUP_ON;
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
WARMUP_OFF;
((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
GPU_ON;
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
GPU_OFF;
}
GPU_FULL_ON;
d_src1.upload(src1);
d_src2.upload(src2);
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
CPU_OFF;
d_src1.upload(src1);
d_src2.upload(src2);
WARMUP_ON;
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1.0));
GPU_ON;
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
GPU_OFF;
GPU_FULL_ON;
d_src1.upload(src1);
d_src2.upload(src2);
ocl::magnitudeSqr(d_src1, d_src2, d_dst);
d_dst.download(dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
}
}
@@ -1222,7 +1154,7 @@ PERFTEST(MagnitudeSqr)
///////////// AddWeighted////////////////////////
PERFTEST(AddWeighted)
{
Mat src1, src2, dst;
Mat src1, src2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_dst;
double alpha = 2.0, beta = 1.0, gama = 3.0;
@@ -1252,11 +1184,6 @@ PERFTEST(AddWeighted)
ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat_dst;
d_dst.download(ocl_mat_dst);
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
GPU_ON;
ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
GPU_OFF;
@@ -1265,8 +1192,10 @@ PERFTEST(AddWeighted)
d_src1.upload(src1);
d_src2.upload(src2);
ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}

View File

@@ -71,7 +71,7 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we
}
PERFTEST(blend)
{
Mat src1, src2, weights1, weights2, dst;
Mat src1, src2, weights1, weights2, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -103,10 +103,6 @@ PERFTEST(blend)
ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat;
d_dst.download(ocl_mat);
TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 1.f));
GPU_ON;
ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
GPU_OFF;
@@ -117,8 +113,10 @@ PERFTEST(blend)
d_weights1.upload(weights1);
d_weights2.upload(weights2);
ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.f);
}
}
}

View File

@@ -88,9 +88,6 @@ PERFTEST(BruteForceMatcher)
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
WARMUP_OFF;
d_matcher.match(d_query, d_train, d_matches[0]);
TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
GPU_ON;
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
GPU_OFF;
@@ -98,9 +95,15 @@ PERFTEST(BruteForceMatcher)
GPU_FULL_ON;
d_query.upload(query);
d_train.upload(train);
d_matcher.match(d_query, d_train, matches[0]);
d_matcher.match(d_query, d_train, d_matches[0]);
GPU_FULL_OFF;
int diff = abs((int)d_matches[0].size() - (int)matches[0].size());
if(diff == 0)
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, diff);
SUBTEST << size << "; knnMatch";
matcher.knnMatch(query, train, matches, 2);
@@ -123,7 +126,11 @@ PERFTEST(BruteForceMatcher)
d_matcher.knnMatch(d_query, d_train, d_matches, 2);
GPU_FULL_OFF;
TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
diff = abs((int)d_matches[0].size() - (int)matches[0].size());
if(diff == 0)
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, diff);
SUBTEST << size << "; radiusMatch";
@@ -151,6 +158,10 @@ PERFTEST(BruteForceMatcher)
d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
GPU_FULL_OFF;
TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
diff = abs((int)d_matches[0].size() - (int)matches[0].size());
if(diff == 0)
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, diff);
}
}

View File

@@ -57,7 +57,7 @@ PERFTEST(Canny)
SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";
Mat edges(img.size(), CV_8UC1);
Mat edges(img.size(), CV_8UC1), ocl_edges;
CPU_ON;
Canny(img, edges, 50.0, 100.0);
@@ -71,8 +71,6 @@ PERFTEST(Canny)
ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExceptedMatSimilar(edges, d_edges, 2e-2));
GPU_ON;
ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
GPU_OFF;
@@ -80,6 +78,8 @@ PERFTEST(Canny)
GPU_FULL_ON;
d_img.upload(img);
ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
d_edges.download(edges);
d_edges.download(ocl_edges);
GPU_FULL_OFF;
TestSystem::instance().ExceptedMatSimilar(edges, ocl_edges, 2e-2);
}

View File

@@ -48,7 +48,7 @@
///////////// cvtColor////////////////////////
PERFTEST(cvtColor)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC4};
@@ -73,10 +73,6 @@ PERFTEST(cvtColor)
ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
WARMUP_OFF;
cv::Mat ocl_mat;
d_dst.download(ocl_mat);
TestSystem::instance().setAccurate(ExceptedMatSimilar(dst, ocl_mat, 1e-5));
GPU_ON;
ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
GPU_OFF;
@@ -84,8 +80,10 @@ PERFTEST(cvtColor)
GPU_FULL_ON;
d_src.upload(src);
ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExceptedMatSimilar(dst, ocl_dst, 1e-5);
}

View File

@@ -48,7 +48,7 @@
///////////// columnSum////////////////////////
PERFTEST(columnSum)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -63,23 +63,16 @@ PERFTEST(columnSum)
dst.at<float>(0, j) = src.at<float>(0, j);
for (int i = 1; i < src.rows; ++i)
{for (int j = 0; j < src.cols; ++j)
{
for (int j = 0; j < src.cols; ++j)
dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
}
}
CPU_OFF;
d_src.upload(src);
WARMUP_ON;
ocl::columnSum(d_src, d_dst);
WARMUP_OFF;
cv::Mat ocl_mat;
d_dst.download(ocl_mat);
TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 5e-1));
GPU_ON;
ocl::columnSum(d_src, d_dst);
GPU_OFF;
@@ -87,7 +80,9 @@ PERFTEST(columnSum)
GPU_FULL_ON;
d_src.upload(src);
ocl::columnSum(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 5e-1);
}
}

View File

@@ -48,7 +48,7 @@
///////////// dft ////////////////////////
PERFTEST(dft)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_32FC2};
@@ -74,8 +74,6 @@ PERFTEST(dft)
ocl::dft(d_src, d_dst, Size(size, size));
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), src.size().area() * 1e-4));
GPU_ON;
ocl::dft(d_src, d_dst, Size(size, size));
GPU_OFF;
@@ -83,8 +81,10 @@ PERFTEST(dft)
GPU_FULL_ON;
d_src.upload(src);
ocl::dft(d_src, d_dst, Size(size, size));
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, src.size().area() * 1e-4);
}
}

View File

@@ -48,7 +48,7 @@
///////////// Blur////////////////////////
PERFTEST(Blur)
{
Mat src1, dst;
Mat src1, dst, ocl_dst;
ocl::oclMat d_src1, d_dst;
Size ksize = Size(3, 3);
@@ -65,7 +65,6 @@ PERFTEST(Blur)
gen(src1, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
blur(src1, dst, ksize, Point(-1, -1), bordertype);
CPU_ON;
@@ -78,8 +77,6 @@ PERFTEST(Blur)
ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1.0));
GPU_ON;
ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
GPU_OFF;
@@ -87,8 +84,10 @@ PERFTEST(Blur)
GPU_FULL_ON;
d_src1.upload(src1);
ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
}
}
@@ -96,7 +95,7 @@ PERFTEST(Blur)
///////////// Laplacian////////////////////////
PERFTEST(Laplacian)
{
Mat src1, dst;
Mat src1, dst, ocl_dst;
ocl::oclMat d_src1, d_dst;
int ksize = 3;
@@ -112,7 +111,6 @@ PERFTEST(Laplacian)
gen(src1, size, size, all_type[j], 0, 256);
gen(dst, size, size, all_type[j], 0, 256);
Laplacian(src1, dst, -1, ksize, 1);
CPU_ON;
@@ -125,8 +123,6 @@ PERFTEST(Laplacian)
ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));
GPU_ON;
ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
GPU_OFF;
@@ -134,8 +130,10 @@ PERFTEST(Laplacian)
GPU_FULL_ON;
d_src1.upload(src1);
ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}
@@ -144,7 +142,7 @@ PERFTEST(Laplacian)
///////////// Erode ////////////////////
PERFTEST(Erode)
{
Mat src, dst, ker;
Mat src, dst, ker, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
@@ -171,8 +169,6 @@ PERFTEST(Erode)
ocl::erode(d_src, d_dst, ker);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));
GPU_ON;
ocl::erode(d_src, d_dst, ker);
GPU_OFF;
@@ -180,8 +176,10 @@ PERFTEST(Erode)
GPU_FULL_ON;
d_src.upload(src);
ocl::erode(d_src, d_dst, ker);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}
@@ -190,7 +188,7 @@ PERFTEST(Erode)
///////////// Sobel ////////////////////////
PERFTEST(Sobel)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int dx = 1;
@@ -218,8 +216,6 @@ PERFTEST(Sobel)
ocl::Sobel(d_src, d_dst, -1, dx, dy);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1));
GPU_ON;
ocl::Sobel(d_src, d_dst, -1, dx, dy);
GPU_OFF;
@@ -227,8 +223,10 @@ PERFTEST(Sobel)
GPU_FULL_ON;
d_src.upload(src);
ocl::Sobel(d_src, d_dst, -1, dx, dy);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
}
}
@@ -236,7 +234,7 @@ PERFTEST(Sobel)
///////////// Scharr ////////////////////////
PERFTEST(Scharr)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int dx = 1;
@@ -264,8 +262,6 @@ PERFTEST(Scharr)
ocl::Scharr(d_src, d_dst, -1, dx, dy);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1));
GPU_ON;
ocl::Scharr(d_src, d_dst, -1, dx, dy);
GPU_OFF;
@@ -273,8 +269,10 @@ PERFTEST(Scharr)
GPU_FULL_ON;
d_src.upload(src);
ocl::Scharr(d_src, d_dst, -1, dx, dy);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
}
}
@@ -283,7 +281,7 @@ PERFTEST(Scharr)
///////////// GaussianBlur ////////////////////////
PERFTEST(GaussianBlur)
{
Mat src, dst;
Mat src, dst, ocl_dst;
int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
@@ -311,9 +309,6 @@ PERFTEST(GaussianBlur)
ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1.0));
GPU_ON;
ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
GPU_OFF;
@@ -321,8 +316,10 @@ PERFTEST(GaussianBlur)
GPU_FULL_ON;
d_src.upload(src);
ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
}
}
@@ -349,7 +346,7 @@ PERFTEST(filter2D)
Mat kernel;
gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
Mat dst(src);
Mat dst, ocl_dst;
dst.setTo(0);
cv::filter2D(src, dst, -1, kernel);
@@ -357,17 +354,12 @@ PERFTEST(filter2D)
cv::filter2D(src, dst, -1, kernel);
CPU_OFF;
ocl::oclMat d_src(src);
ocl::oclMat d_dst(d_src);
d_dst.setTo(0);
ocl::oclMat d_src(src), d_dst;
WARMUP_ON;
ocl::filter2D(d_src, d_dst, -1, kernel);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));
GPU_ON;
ocl::filter2D(d_src, d_dst, -1, kernel);
GPU_OFF;
@@ -375,8 +367,10 @@ PERFTEST(filter2D)
GPU_FULL_ON;
d_src.upload(src);
ocl::filter2D(d_src, d_dst, -1, kernel);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
}
}

View File

@@ -48,7 +48,7 @@
///////////// gemm ////////////////////////
PERFTEST(gemm)
{
Mat src1, src2, src3, dst;
Mat src1, src2, src3, dst, ocl_dst;
ocl::oclMat d_src1, d_src2, d_src3, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -72,7 +72,6 @@ PERFTEST(gemm)
WARMUP_ON;
ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, src1.cols * src1.rows * 1e-4));
GPU_ON;
ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
@@ -83,7 +82,9 @@ PERFTEST(gemm)
d_src2.upload(src2);
d_src3.upload(src3);
ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(ocl_dst, dst, src1.cols * src1.rows * 1e-4);
}
}

View File

@@ -125,8 +125,10 @@ PERFTEST(Haar)
1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
WARMUP_OFF;
//Testing whether the expected is equal to the actual.
TestSystem::instance().setAccurate(ExpectedEQ<vector<Rect>::size_type, vector<Rect>::size_type>(faces.size(), oclfaces.size()));
if(faces.size() == oclfaces.size())
TestSystem::instance().setAccurate(1, 0);
else
TestSystem::instance().setAccurate(0, abs((int)faces.size() - (int)oclfaces.size()));
faces.clear();

View File

@@ -146,10 +146,8 @@ PERFTEST(HOG)
}
}
cv::Mat ocl_mat;
ocl_mat = cv::Mat(d_comp);
ocl_mat.convertTo(ocl_mat, cv::Mat(comp).type());
TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat, cv::Mat(comp), 3));
cv::Mat gpu_rst(d_comp), cpu_rst(comp);
TestSystem::instance().ExpectedMatNear(gpu_rst, cpu_rst, 3);
GPU_ON;
ocl_hog.detectMultiScale(d_src, found_locations);

View File

@@ -48,7 +48,7 @@
///////////// equalizeHist ////////////////////////
PERFTEST(equalizeHist)
{
Mat src, dst;
Mat src, dst, ocl_dst;
int all_type[] = {CV_8UC1};
std::string type_name[] = {"CV_8UC1"};
@@ -75,9 +75,6 @@ PERFTEST(equalizeHist)
ocl::equalizeHist(d_src, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.1));
GPU_ON;
ocl::equalizeHist(d_src, d_dst);
GPU_OFF;
@@ -85,8 +82,10 @@ PERFTEST(equalizeHist)
GPU_FULL_ON;
d_src.upload(src);
ocl::equalizeHist(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.1);
}
}
@@ -94,7 +93,7 @@ PERFTEST(equalizeHist)
/////////// CopyMakeBorder //////////////////////
PERFTEST(CopyMakeBorder)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_dst;
int bordertype = BORDER_CONSTANT;
@@ -122,9 +121,6 @@ PERFTEST(CopyMakeBorder)
ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
GPU_ON;
ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
GPU_OFF;
@@ -132,8 +128,10 @@ PERFTEST(CopyMakeBorder)
GPU_FULL_ON;
d_src.upload(src);
ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
@@ -141,7 +139,7 @@ PERFTEST(CopyMakeBorder)
///////////// cornerMinEigenVal ////////////////////////
PERFTEST(cornerMinEigenVal)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_dst;
int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
@@ -155,7 +153,6 @@ PERFTEST(cornerMinEigenVal)
{
SUBTEST << size << 'x' << size << "; " << type_name[j] ;
gen(src, size, size, all_type[j], 0, 256);
cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
@@ -170,9 +167,6 @@ PERFTEST(cornerMinEigenVal)
ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
GPU_OFF;
@@ -180,8 +174,10 @@ PERFTEST(cornerMinEigenVal)
GPU_FULL_ON;
d_src.upload(src);
ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -189,7 +185,7 @@ PERFTEST(cornerMinEigenVal)
///////////// cornerHarris ////////////////////////
PERFTEST(cornerHarris)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_32FC1};
@@ -215,8 +211,6 @@ PERFTEST(cornerHarris)
ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
GPU_OFF;
@@ -224,8 +218,10 @@ PERFTEST(cornerHarris)
GPU_FULL_ON;
d_src.upload(src);
ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
@@ -234,7 +230,7 @@ PERFTEST(cornerHarris)
///////////// integral ////////////////////////
PERFTEST(integral)
{
Mat src, sum;
Mat src, sum, ocl_sum;
ocl::oclMat d_src, d_sum, d_buf;
int all_type[] = {CV_8UC1};
@@ -260,12 +256,6 @@ PERFTEST(integral)
ocl::integral(d_src, d_sum);
WARMUP_OFF;
cv::Mat ocl_mat;
d_sum.download(ocl_mat);
if(sum.type() == ocl_mat.type()) //we won't test accuracy when cpu function overlow
TestSystem::instance().setAccurate(ExpectedMatNear(sum, ocl_mat, 0.0));
GPU_ON;
ocl::integral(d_src, d_sum);
GPU_OFF;
@@ -273,8 +263,12 @@ PERFTEST(integral)
GPU_FULL_ON;
d_src.upload(src);
ocl::integral(d_src, d_sum);
d_sum.download(sum);
d_sum.download(ocl_sum);
GPU_FULL_OFF;
if(sum.type() == ocl_sum.type()) //we won't test accuracy when cpu function overlow
TestSystem::instance().ExpectedMatNear(sum, ocl_sum, 0.0);
}
}
@@ -282,7 +276,7 @@ PERFTEST(integral)
///////////// WarpAffine ////////////////////////
PERFTEST(WarpAffine)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
static const double coeffs[2][3] =
@@ -319,8 +313,6 @@ PERFTEST(WarpAffine)
ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
GPU_OFF;
@@ -328,8 +320,10 @@ PERFTEST(WarpAffine)
GPU_FULL_ON;
d_src.upload(src);
ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -337,7 +331,7 @@ PERFTEST(WarpAffine)
///////////// WarpPerspective ////////////////////////
PERFTEST(WarpPerspective)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
static const double coeffs[3][3] =
@@ -374,8 +368,6 @@ PERFTEST(WarpPerspective)
ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
GPU_OFF;
@@ -383,8 +375,10 @@ PERFTEST(WarpPerspective)
GPU_FULL_ON;
d_src.upload(src);
ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -393,7 +387,7 @@ PERFTEST(WarpPerspective)
///////////// resize ////////////////////////
PERFTEST(resize)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
@@ -420,9 +414,6 @@ PERFTEST(resize)
ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
GPU_OFF;
@@ -430,8 +421,10 @@ PERFTEST(resize)
GPU_FULL_ON;
d_src.upload(src);
ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -456,8 +449,6 @@ PERFTEST(resize)
ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
GPU_OFF;
@@ -465,8 +456,10 @@ PERFTEST(resize)
GPU_FULL_ON;
d_src.upload(src);
ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
@@ -474,10 +467,9 @@ PERFTEST(resize)
///////////// threshold////////////////////////
PERFTEST(threshold)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
{
SUBTEST << size << 'x' << size << "; 8UC1; THRESH_BINARY";
@@ -496,9 +488,6 @@ PERFTEST(threshold)
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
GPU_OFF;
@@ -506,9 +495,10 @@ PERFTEST(threshold)
GPU_FULL_ON;
d_src.upload(src);
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -529,8 +519,6 @@ PERFTEST(threshold)
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
GPU_ON;
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
GPU_OFF;
@@ -538,8 +526,10 @@ PERFTEST(threshold)
GPU_FULL_ON;
d_src.upload(src);
ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
}
}
///////////// meanShiftFiltering////////////////////////
@@ -726,7 +716,7 @@ void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::T
PERFTEST(meanShiftFiltering)
{
int sp = 5, sr = 6;
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
@@ -753,11 +743,6 @@ PERFTEST(meanShiftFiltering)
ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
WARMUP_OFF;
cv::Mat ocl_mat;
d_dst.download(ocl_mat);
TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 0.0));
GPU_ON;
ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
GPU_OFF;
@@ -765,8 +750,10 @@ PERFTEST(meanShiftFiltering)
GPU_FULL_ON;
d_src.upload(src);
ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
///////////// meanShiftProc////////////////////////
@@ -1010,8 +997,9 @@ void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp,
}
PERFTEST(meanShiftProc)
{
Mat src, dst, dstCoor_roi;
ocl::oclMat d_src, d_dst, d_dstCoor_roi;
Mat src;
vector<Mat> dst(2), ocl_dst(2);
ocl::oclMat d_src, d_dst, d_dstCoor;
TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);
@@ -1020,42 +1008,41 @@ PERFTEST(meanShiftProc)
SUBTEST << size << 'x' << size << "; 8UC4 and CV_16SC2 ";
gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
gen(dst, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
gen(dstCoor_roi, size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
gen(dst[0], size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
gen(dst[1], size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);
CPU_ON;
meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);
CPU_OFF;
d_src.upload(src);
WARMUP_ON;
ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dstCoor_roi, cv::Mat(d_dstCoor_roi), 0.0)
&&ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
GPU_ON;
ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
GPU_OFF;
GPU_FULL_ON;
d_src.upload(src);
ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
d_dst.download(dst);
d_dstCoor_roi.download(dstCoor_roi);
ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
d_dst.download(ocl_dst[0]);
d_dstCoor.download(ocl_dst[1]);
GPU_FULL_OFF;
vector<double> eps(2, 0.);
TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
}
}
///////////// remap////////////////////////
PERFTEST(remap)
{
Mat src, dst, xmap, ymap;
Mat src, dst, xmap, ymap, ocl_dst;
ocl::oclMat d_src, d_dst, d_xmap, d_ymap;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -1088,7 +1075,6 @@ PERFTEST(remap)
}
}
remap(src, dst, xmap, ymap, interpolation, borderMode);
CPU_ON;
@@ -1104,12 +1090,6 @@ PERFTEST(remap)
ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
WARMUP_OFF;
if(interpolation == 0)
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
else
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 2.0));
GPU_ON;
ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
GPU_OFF;
@@ -1117,8 +1097,10 @@ PERFTEST(remap)
GPU_FULL_ON;
d_src.upload(src);
ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 2.0);
}
}

View File

@@ -56,11 +56,9 @@
PERFTEST(matchTemplate)
{
//InitMatchTemplate();
Mat src, templ, dst;
Mat src, templ, dst, ocl_dst;
int templ_size = 5;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
{
int all_type[] = {CV_32FC1, CV_32FC4};
@@ -82,16 +80,12 @@ PERFTEST(matchTemplate)
matchTemplate(src, templ, dst, TM_CCORR);
CPU_OFF;
ocl::oclMat d_src(src), d_templ, d_dst;
d_templ.upload(templ);
ocl::oclMat d_src(src), d_templ(templ), d_dst;
WARMUP_ON;
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), templ.rows * templ.cols * 1e-1));
GPU_ON;
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
GPU_OFF;
@@ -100,8 +94,10 @@ PERFTEST(matchTemplate)
d_src.upload(src);
d_templ.upload(templ);
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
}
}
@@ -131,8 +127,6 @@ PERFTEST(matchTemplate)
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), templ.rows * templ.cols * 1e-1));
GPU_ON;
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
GPU_OFF;
@@ -141,8 +135,10 @@ PERFTEST(matchTemplate)
d_src.upload(src);
d_templ.upload(templ);
ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
}
}
}

View File

@@ -48,7 +48,7 @@
///////////// ConvertTo////////////////////////
PERFTEST(ConvertTo)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -77,9 +77,6 @@ PERFTEST(ConvertTo)
d_src.convertTo(d_dst, CV_32FC1);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
GPU_ON;
d_src.convertTo(d_dst, CV_32FC1);
GPU_OFF;
@@ -87,8 +84,10 @@ PERFTEST(ConvertTo)
GPU_FULL_ON;
d_src.upload(src);
d_src.convertTo(d_dst, CV_32FC1);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
@@ -96,7 +95,7 @@ PERFTEST(ConvertTo)
///////////// copyTo////////////////////////
PERFTEST(copyTo)
{
Mat src, dst;
Mat src, dst, ocl_dst;
ocl::oclMat d_src, d_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
@@ -125,9 +124,6 @@ PERFTEST(copyTo)
d_src.copyTo(d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
GPU_ON;
d_src.copyTo(d_dst);
GPU_OFF;
@@ -135,8 +131,10 @@ PERFTEST(copyTo)
GPU_FULL_ON;
d_src.upload(src);
d_src.copyTo(d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
@@ -144,9 +142,9 @@ PERFTEST(copyTo)
///////////// setTo////////////////////////
PERFTEST(setTo)
{
Mat src, dst;
Mat src, ocl_src;
Scalar val(1, 2, 3, 4);
ocl::oclMat d_src, d_dst;
ocl::oclMat d_src;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -171,10 +169,10 @@ PERFTEST(setTo)
d_src.setTo(val);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(src, cv::Mat(d_src), 1.0));
d_src.download(ocl_src);
TestSystem::instance().ExpectedMatNear(src, ocl_src, 1.0);
GPU_ON;
GPU_ON;;
d_src.setTo(val);
GPU_OFF;

View File

@@ -48,39 +48,40 @@
///////////// norm////////////////////////
PERFTEST(norm)
{
Mat src, buf;
ocl::oclMat d_src, d_buf;
Mat src1, src2, ocl_src1;
ocl::oclMat d_src1, d_src2;
for (int size = Min_Size; size <= Max_Size; size *= Multiple)
{
SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
gen(src1, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
gen(src2, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
norm(src, NORM_INF);
norm(src1, src2, NORM_INF);
CPU_ON;
norm(src, NORM_INF);
norm(src1, src2, NORM_INF);
CPU_OFF;
d_src.upload(src);
d_buf.upload(buf);
d_src1.upload(src1);
d_src2.upload(src2);
WARMUP_ON;
ocl::norm(d_src, d_buf, NORM_INF);
ocl::norm(d_src1, d_src2, NORM_INF);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(src, cv::Mat(d_buf), .5));
d_src1.download(ocl_src1);
TestSystem::instance().ExpectedMatNear(src1, ocl_src1, .5);
GPU_ON;
ocl::norm(d_src, d_buf, NORM_INF);
ocl::norm(d_src1, d_src2, NORM_INF);
GPU_OFF;
GPU_FULL_ON;
d_src.upload(src);
ocl::norm(d_src, d_buf, NORM_INF);
d_src1.upload(src1);
d_src2.upload(src2);
ocl::norm(d_src1, d_src2, NORM_INF);
GPU_FULL_OFF;
}
}

View File

@@ -82,8 +82,8 @@ PERFTEST(PyrLKOpticalFlow)
SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
else
SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
Mat nextPts_cpu;
Mat status_cpu;
Mat ocl_nextPts;
Mat ocl_status;
vector<Point2f> pts;
goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
@@ -116,12 +116,6 @@ PERFTEST(PyrLKOpticalFlow)
d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
WARMUP_OFF;
std::vector<cv::Point2f> ocl_nextPts(d_nextPts.cols);
std::vector<unsigned char> ocl_status(d_status.cols);
TestSystem::instance().setAccurate(AssertEQ<size_t>(nextPts.size(), ocl_nextPts.size()));
TestSystem::instance().setAccurate(AssertEQ<size_t>(status.size(), ocl_status.size()));
GPU_ON;
d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
GPU_OFF;
@@ -133,17 +127,102 @@ PERFTEST(PyrLKOpticalFlow)
d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
if (!d_nextPts.empty())
{
d_nextPts.download(nextPts_cpu);
}
d_nextPts.download(ocl_nextPts);
if (!d_status.empty())
{
d_status.download(status_cpu);
}
d_status.download(ocl_status);
GPU_FULL_OFF;
size_t mismatch = 0;
for (int i = 0; i < (int)nextPts.size(); ++i)
{
if(status[i] != ocl_status.at<unsigned char>(0, i)){
mismatch++;
continue;
}
if(status[i]){
Point2f gpu_rst = ocl_nextPts.at<Point2f>(0, i);
Point2f cpu_rst = nextPts[i];
if(fabs(gpu_rst.x - cpu_rst.x) >= 1. || fabs(gpu_rst.y - cpu_rst.y) >= 1.)
mismatch++;
}
}
double ratio = (double)mismatch / (double)nextPts.size();
if(ratio < .02)
TestSystem::instance().setAccurate(1, ratio);
else
TestSystem::instance().setAccurate(0, ratio);
}
}
}
PERFTEST(tvl1flow)
{
cv::Mat frame0 = imread("rubberwhale1.png", cv::IMREAD_GRAYSCALE);
assert(!frame0.empty());
cv::Mat frame1 = imread("rubberwhale2.png", cv::IMREAD_GRAYSCALE);
assert(!frame1.empty());
cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
cv::ocl::oclMat d_flowx(frame0.size(), CV_32FC1);
cv::ocl::oclMat d_flowy(frame1.size(), CV_32FC1);
cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
cv::Mat flow;
SUBTEST << frame0.cols << 'x' << frame0.rows << "; rubberwhale1.png; "<<frame1.cols<<'x'<<frame1.rows<<"; rubberwhale2.png";
alg->calc(frame0, frame1, flow);
CPU_ON;
alg->calc(frame0, frame1, flow);
CPU_OFF;
cv::Mat gold[2];
cv::split(flow, gold);
cv::ocl::oclMat d0(frame0.size(), CV_32FC1);
d0.upload(frame0);
cv::ocl::oclMat d1(frame1.size(), CV_32FC1);
d1.upload(frame1);
WARMUP_ON;
d_alg(d0, d1, d_flowx, d_flowy);
WARMUP_OFF;
/*
double diff1 = 0.0, diff2 = 0.0;
if(ExceptedMatSimilar(gold[0], cv::Mat(d_flowx), 3e-3, diff1) == 1
&&ExceptedMatSimilar(gold[1], cv::Mat(d_flowy), 3e-3, diff2) == 1)
TestSystem::instance().setAccurate(1);
else
TestSystem::instance().setAccurate(0);
TestSystem::instance().setDiff(diff1);
TestSystem::instance().setDiff(diff2);
*/
GPU_ON;
d_alg(d0, d1, d_flowx, d_flowy);
d_alg.collectGarbage();
GPU_OFF;
cv::Mat flowx, flowy;
GPU_FULL_ON;
d0.upload(frame0);
d1.upload(frame1);
d_alg(d0, d1, d_flowx, d_flowy);
d_alg.collectGarbage();
d_flowx.download(flowx);
d_flowy.download(flowy);
GPU_FULL_OFF;
TestSystem::instance().ExceptedMatSimilar(gold[0], flowx, 3e-3);
TestSystem::instance().ExceptedMatSimilar(gold[1], flowy, 3e-3);
}

View File

@@ -48,7 +48,7 @@
///////////// pyrDown //////////////////////
PERFTEST(pyrDown)
{
Mat src, dst;
Mat src, dst, ocl_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -73,9 +73,6 @@ PERFTEST(pyrDown)
ocl::pyrDown(d_src, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), dst.depth() == CV_32F ? 1e-4f : 1.0f));
GPU_ON;
ocl::pyrDown(d_src, d_dst);
GPU_OFF;
@@ -83,8 +80,53 @@ PERFTEST(pyrDown)
GPU_FULL_ON;
d_src.upload(src);
ocl::pyrDown(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, dst.depth() == CV_32F ? 1e-4f : 1.0f);
}
}
}
///////////// pyrUp ////////////////////////
PERFTEST(pyrUp)
{
Mat src, dst, ocl_dst;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
for (int size = 500; size <= 2000; size *= 2)
{
for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
{
SUBTEST << size << 'x' << size << "; " << type_name[j] ;
gen(src, size, size, all_type[j], 0, 256);
pyrUp(src, dst);
CPU_ON;
pyrUp(src, dst);
CPU_OFF;
ocl::oclMat d_src(src);
ocl::oclMat d_dst;
WARMUP_ON;
ocl::pyrUp(d_src, d_dst);
WARMUP_OFF;
GPU_ON;
ocl::pyrUp(d_src, d_dst);
GPU_OFF;
GPU_FULL_ON;
d_src.upload(src);
ocl::pyrUp(d_src, d_dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, (src.depth() == CV_32F ? 1e-4f : 1.0));
}
}
}

View File

@@ -1,89 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Fangfang Bai, fangfang@multicorewareinc.com
// Jin Ma, jin@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
///////////// pyrUp ////////////////////////
PERFTEST(pyrUp)
{
Mat src, dst;
int all_type[] = {CV_8UC1, CV_8UC4};
std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
for (int size = 500; size <= 2000; size *= 2)
{
for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
{
SUBTEST << size << 'x' << size << "; " << type_name[j] ;
gen(src, size, size, all_type[j], 0, 256);
pyrUp(src, dst);
CPU_ON;
pyrUp(src, dst);
CPU_OFF;
ocl::oclMat d_src(src);
ocl::oclMat d_dst;
WARMUP_ON;
ocl::pyrUp(d_src, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), (src.depth() == CV_32F ? 1e-4f : 1.0)));
GPU_ON;
ocl::pyrUp(d_src, d_dst);
GPU_OFF;
GPU_FULL_ON;
d_src.upload(src);
ocl::pyrUp(d_src, d_dst);
d_dst.download(dst);
GPU_FULL_OFF;
}
}
}

View File

@@ -48,7 +48,7 @@
///////////// Merge////////////////////////
PERFTEST(Merge)
{
Mat dst;
Mat dst, ocl_dst;
ocl::oclMat d_dst;
int channels = 4;
@@ -85,22 +85,20 @@ PERFTEST(Merge)
ocl::merge(d_src, d_dst);
WARMUP_OFF;
TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(dst), cv::Mat(d_dst), 0.0));
GPU_ON;
ocl::merge(d_src, d_dst);
GPU_OFF;
GPU_FULL_ON;
for (int i = 0; i < channels; ++i)
{
d_src[i] = ocl::oclMat(size1, CV_8U, cv::Scalar::all(i));
d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
}
ocl::merge(d_src, d_dst);
d_dst.download(dst);
d_dst.download(ocl_dst);
GPU_FULL_OFF;
TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
}
}
@@ -122,7 +120,7 @@ PERFTEST(Split)
Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
std::vector<cv::Mat> dst;
std::vector<cv::Mat> dst, ocl_dst(4);
split(src, dst);
@@ -135,22 +133,7 @@ PERFTEST(Split)
WARMUP_ON;
ocl::split(d_src, d_dst);
WARMUP_OFF;
if(d_dst.size() == dst.size())
{
TestSystem::instance().setAccurate(1);
for(size_t i = 0; i < dst.size(); i++)
{
if(ExpectedMatNear(dst[i], cv::Mat(d_dst[i]), 0.0) == 0)
{
TestSystem::instance().setAccurate(0);
break;
}
}
}else
TestSystem::instance().setAccurate(0);
WARMUP_OFF;
GPU_ON;
ocl::split(d_src, d_dst);
@@ -159,7 +142,12 @@ PERFTEST(Split)
GPU_FULL_ON;
d_src.upload(src);
ocl::split(d_src, d_dst);
for(size_t i = 0; i < dst.size(); i++)
d_dst[i].download(ocl_dst[i]);
GPU_FULL_OFF;
vector<double> eps(4, 0.);
TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
}
}

View File

@@ -114,7 +114,6 @@ void TestSystem::finishCurrentSubtest()
return;
}
int is_accurate = is_accurate_;
double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
@@ -171,8 +170,8 @@ void TestSystem::finishCurrentSubtest()
deviation = std::sqrt(sum / gpu_times_.size());
}
printMetrics(is_accurate, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
writeMetrics(is_accurate, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
printMetrics(is_accurate_, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
num_subtests_called_++;
resetCurrentSubtest();
@@ -219,7 +218,7 @@ void TestSystem::writeHeading()
}
}
fprintf(record_, "NAME,DESCRIPTION,ACCURACY,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
fprintf(record_, "NAME,DESCRIPTION,ACCURACY,DIFFERENCE,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
fflush(record_);
}
@@ -392,7 +391,7 @@ void TestSystem::printMetrics(int is_accurate, double cpu_time, double gpu_time,
#endif
}
void TestSystem::writeMetrics(int is_accurate, double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
{
if (!record_)
{
@@ -402,21 +401,24 @@ void TestSystem::writeMetrics(int is_accurate, double cpu_time, double gpu_time,
string _is_accurate_;
if(is_accurate == 1)
if(is_accurate_ == 1)
_is_accurate_ = "Pass";
else if(is_accurate == 0)
else if(is_accurate_ == 0)
_is_accurate_ = "Fail";
else if(is_accurate == -1)
else if(is_accurate_ == -1)
_is_accurate_ = " ";
else
{
std::cout<<"is_accurate errer: "<<is_accurate<<"\n";
std::cout<<"is_accurate errer: "<<is_accurate_<<"\n";
exit(-1);
}
fprintf(record_, "%s,%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
fprintf(record_, "%s,%s,%s,%.2f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n",
itname_changed_ ? itname_.c_str() : "",
cur_subtest_description_.str().c_str(),
_is_accurate_.c_str(), cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
_is_accurate_.c_str(),
accurate_diff_,
cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
gpu_min, gpu_max, std_dev);
if (itname_changed_)
@@ -469,134 +471,6 @@ void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
RNG rng(0);
rng.fill(mat, RNG::UNIFORM, low, high);
}
#if 0
void gen(Mat &mat, int rows, int cols, int type, int low, int high, int n)
{
assert(n > 0&&n <= cols * rows);
assert(type == CV_8UC1||type == CV_8UC3||type == CV_8UC4
||type == CV_32FC1||type == CV_32FC3||type == CV_32FC4);
RNG rng;
//generate random position without duplication
std::vector<int> pos;
for(int i = 0; i < cols * rows; i++)
{
pos.push_back(i);
}
for(int i = 0; i < cols * rows; i++)
{
int temp = i + rng.uniform(0, cols * rows - 1 - i);
int temp1 = pos[temp];
pos[temp]= pos[i];
pos[i] = temp1;
}
std::vector<int> selected_pos;
for(int i = 0; i < n; i++)
{
selected_pos.push_back(pos[i]);
}
pos.clear();
//end of generating random y without duplication
if(type == CV_8UC1)
{
typedef struct coorStruct_
{
int x;
int y;
uchar xy;
}coorStruct;
coorStruct coor_struct;
std::vector<coorStruct> coor;
for(int i = 0; i < n; i++)
{
coor_struct.x = -1;
coor_struct.y = -1;
coor_struct.xy = (uchar)rng.uniform(low, high);
coor.push_back(coor_struct);
}
for(int i = 0; i < n; i++)
{
coor[i].y = selected_pos[i]/cols;
coor[i].x = selected_pos[i]%cols;
}
selected_pos.clear();
mat.create(rows, cols, type);
mat.setTo(0);
for(int i = 0; i < n; i++)
{
mat.at<unsigned char>(coor[i].y, coor[i].x) = coor[i].xy;
}
}
if(type == CV_8UC4 || type == CV_8UC3)
{
mat.create(rows, cols, type);
mat.setTo(0);
typedef struct Coor
{
int x;
int y;
uchar r;
uchar g;
uchar b;
uchar alpha;
}coor;
std::vector<coor> coor_vect;
coor xy_coor;
for(int i = 0; i < n; i++)
{
xy_coor.r = (uchar)rng.uniform(low, high);
xy_coor.g = (uchar)rng.uniform(low, high);
xy_coor.b = (uchar)rng.uniform(low, high);
if(type == CV_8UC4)
xy_coor.alpha = (uchar)rng.uniform(low, high);
coor_vect.push_back(xy_coor);
}
for(int i = 0; i < n; i++)
{
coor_vect[i].y = selected_pos[i]/((int)mat.step1()/mat.elemSize());
coor_vect[i].x = selected_pos[i]%((int)mat.step1()/mat.elemSize());
//printf("coor_vect[%d] = (%d, %d)\n", i, coor_vect[i].y, coor_vect[i].x);
}
if(type == CV_8UC4)
{
for(int i = 0; i < n; i++)
{
mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x) = coor_vect[i].r;
mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 1) = coor_vect[i].g;
mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 2) = coor_vect[i].b;
mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 3) = coor_vect[i].alpha;
}
}else if(type == CV_8UC3)
{
for(int i = 0; i < n; i++)
{
mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x) = coor_vect[i].r;
mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x + 1) = coor_vect[i].g;
mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x + 2) = coor_vect[i].b;
}
}
}
}
#endif
string abspath(const string &relpath)
{
@@ -619,31 +493,3 @@ double checkSimilarity(const Mat &m1, const Mat &m2)
matchTemplate(m1, m2, diff, TM_CCORR_NORMED);
return std::abs(diff.at<float>(0, 0) - 1.f);
}
int ExpectedMatNear(cv::Mat dst, cv::Mat cpu_dst, double eps)
{
assert(dst.type() == cpu_dst.type());
assert(dst.size() == cpu_dst.size());
if(checkNorm(cv::Mat(dst), cv::Mat(cpu_dst)) < eps ||checkNorm(cv::Mat(dst), cv::Mat(cpu_dst)) == eps)
return 1;
return 0;
}
int ExceptDoubleNear(double val1, double val2, double abs_error)
{
const double diff = fabs(val1 - val2);
if (diff <= abs_error)
return 1;
return 0;
}
int ExceptedMatSimilar(cv::Mat dst, cv::Mat cpu_dst, double eps)
{
assert(dst.type() == cpu_dst.type());
assert(dst.size() == cpu_dst.size());
if(checkSimilarity(cv::Mat(cpu_dst), cv::Mat(dst)) <= eps)
return 1;
return 0;
}

View File

@@ -322,9 +322,46 @@ public:
itname_changed_ = true;
}
void setAccurate(int is_accurate = -1)
void setAccurate(int accurate, double diff)
{
is_accurate_ = is_accurate;
is_accurate_ = accurate;
accurate_diff_ = diff;
}
void ExpectMatsNear(vector<Mat>& dst, vector<Mat>& cpu_dst, vector<double>& eps)
{
assert(dst.size() == cpu_dst.size());
assert(cpu_dst.size() == eps.size());
is_accurate_ = 1;
for(size_t i=0; i<dst.size(); i++)
{
double cur_diff = checkNorm(dst[i], cpu_dst[i]);
accurate_diff_ = max(accurate_diff_, cur_diff);
if(cur_diff > eps[i])
is_accurate_ = 0;
}
}
void ExpectedMatNear(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
{
assert(dst.type() == cpu_dst.type());
assert(dst.size() == cpu_dst.size());
accurate_diff_ = checkNorm(dst, cpu_dst);
if(accurate_diff_ <= eps)
is_accurate_ = 1;
else
is_accurate_ = 0;
}
void ExceptedMatSimilar(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
{
assert(dst.type() == cpu_dst.type());
assert(dst.size() == cpu_dst.size());
accurate_diff_ = checkSimilarity(cpu_dst, dst);
if(accurate_diff_ <= eps)
is_accurate_ = 1;
else
is_accurate_ = 0;
}
std::stringstream &getCurSubtestDescription()
@@ -342,7 +379,7 @@ private:
num_iters_(10), cpu_num_iters_(2),
gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
record_(0), recordname_("performance"), itname_changed_(true),
is_accurate_(-1)
is_accurate_(-1), accurate_diff_(0.)
{
cpu_times_.reserve(num_iters_);
gpu_times_.reserve(num_iters_);
@@ -363,6 +400,7 @@ private:
gpu_times_.clear();
gpu_full_times_.clear();
is_accurate_ = -1;
accurate_diff_ = 0.;
}
double meanTime(const std::vector<int64> &samples);
@@ -373,7 +411,7 @@ private:
void writeHeading();
void writeSummary();
void writeMetrics(int is_accurate, double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
void writeMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
double speedup = 0.0f, double fullspeedup = 0.0f,
double gpu_min = 0.0f, double gpu_max = 0.0f, double std_dev = 0.0f);
@@ -425,6 +463,7 @@ private:
bool itname_changed_;
int is_accurate_;
double accurate_diff_;
};

View File

@@ -412,11 +412,11 @@ static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, String kernelN
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
float f_scalar = (float)scalar;
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
args.push_back( std::make_pair( sizeof(cl_double), (void *)&scalar ));
else
{
float f_scalar = (float)scalar;
args.push_back( std::make_pair( sizeof(cl_float), (void *)&f_scalar));
}
@@ -783,45 +783,55 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
}
}
template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
const oclMat &mask, oclMat &buf)
{
size_t groupnum = src.clCxt->computeUnits();
CV_Assert(groupnum != 0);
groupnum = groupnum * 2;
int vlen = 8;
int dbsize = groupnum * 2 * vlen * sizeof(T) ;
Context *clCxt = src.clCxt;
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
*minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
if (mask.empty())
{
arithmetic_minMax_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax");
arithmetic_minMax_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax");
}
else
{
arithmetic_minMax_mask_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax_mask");
arithmetic_minMax_mask_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax_mask");
}
T *p = new T[groupnum * vlen * 2];
memset(p, 0, dbsize);
openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
if(minVal != NULL){
Mat matbuf = Mat(buf);
T *p = matbuf.ptr<T>();
if(minVal != NULL)
{
*minVal = std::numeric_limits<double>::max();
for(int i = 0; i < vlen * (int)groupnum; i++)
{
*minVal = *minVal < p[i] ? *minVal : p[i];
}
}
if(maxVal != NULL){
if(maxVal != NULL)
{
*maxVal = -std::numeric_limits<double>::max();
for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
{
*maxVal = *maxVal > p[i] ? *maxVal : p[i];
}
}
delete[] p;
openCLFree(dstBuffer);
}
typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask);
typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf);
void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
{
oclMat buf;
minMax_buf(src, minVal, maxVal, mask, buf);
}
void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
{
CV_Assert(src.oclchannels() == 1);
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
@@ -841,7 +851,7 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
};
minMaxFunc func;
func = functab[src.depth()];
func(src, minVal, maxVal, mask);
func(src, minVal, maxVal, mask, buf);
}
//////////////////////////////////////////////////////////////////////////////
@@ -1688,10 +1698,11 @@ void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String ker
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
T scalar;
if(_scalar != NULL)
{
double scalar1 = *((double *)_scalar);
T scalar = (T)scalar1;
scalar = (T)scalar1;
args.push_back( std::make_pair( sizeof(T), (void *)&scalar ));
}
@@ -2308,9 +2319,9 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, String
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
float pf = p;
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
{
float pf = p;
args.push_back( std::make_pair( sizeof(cl_float), (void *)&pf ));
}
else

View File

@@ -244,11 +244,12 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, const oclM
{
const oclMat zeroMask;
const oclMat &tempMask = mask.data ? mask : zeroMask;
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (query.cols <= 64)
{
matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
}
else if (query.cols <= 128)
else if (query.cols <= 128 && !is_cpu)
{
matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx, distance, distType);
}
@@ -263,11 +264,12 @@ static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, co
{
const oclMat zeroMask;
const oclMat &tempMask = mask.data ? mask : zeroMask;
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (query.cols <= 64)
{
matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
}
else if (query.cols <= 128)
else if (query.cols <= 128 && !is_cpu)
{
matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
}
@@ -283,11 +285,12 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, float maxD
{
const oclMat zeroMask;
const oclMat &tempMask = mask.data ? mask : zeroMask;
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (query.cols <= 64)
{
matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
}
else if (query.cols <= 128)
else if (query.cols <= 128 && !is_cpu)
{
matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
}
@@ -465,11 +468,12 @@ static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, con
static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
const oclMat &trainIdx, const oclMat &distance, int distType)
{
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (query.cols <= 64)
{
knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
}
else if (query.cols <= 128)
else if (query.cols <= 128 && !is_cpu)
{
knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType);
}

View File

@@ -239,7 +239,7 @@ void canny::calcSobelRowPass_gpu(const oclMat &src, oclMat &dx_buf, oclMat &dy_b
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat &dx, oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
@@ -269,12 +269,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
char build_options [15] = "";
if(L2Grad)
{
strcat(build_options, "-D L2GRAD");
}
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
const char * build_options = L2Grad ? "-D L2GRAD":"";
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
}
void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
{
@@ -297,12 +293,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, i
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
char build_options [15] = "";
if(L2Grad)
{
strcat(build_options, "-D L2GRAD");
}
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
const char * build_options = L2Grad ? "-D L2GRAD":"";
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
}
void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh)
@@ -333,7 +325,7 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro
String kernelName = "calcMap";
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols)
@@ -353,7 +345,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
@@ -383,7 +375,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
std::swap(st1, st2);
}
@@ -408,5 +400,5 @@ void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols)
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}

View File

@@ -356,8 +356,7 @@ static void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
char compile_option[128];
sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s",
anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
rectKernel?"-D RECTKERNEL":"",
s);
s, rectKernel?"-D RECTKERNEL":"");
std::vector< std::pair<size_t, const void *> > args;
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));

349
modules/ocl/src/gfft.cpp Normal file
View File

@@ -0,0 +1,349 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <iomanip>
#include "precomp.hpp"
using namespace cv;
using namespace cv::ocl;
static bool use_cpu_sorter = true;
namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *imgproc_gfft;
}
}
namespace
{
enum SortMethod
{
CPU_STL,
BITONIC,
SELECTION
};
const int GROUP_SIZE = 256;
template<SortMethod method>
struct Sorter
{
//typedef EigType;
};
//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
template<>
struct Sorter<CPU_STL>
{
typedef oclMat EigType;
static cv::Mutex cs;
static Mat mat_eig;
//prototype
static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
{
float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
return v1 > v2;
}
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
cv::AutoLock lock(cs);
//temporarily use STL's sort function
Mat mat_corners = corners;
mat_eig = eig_tex;
std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
corners = mat_corners;
}
};
cv::Mutex Sorter<CPU_STL>::cs;
cv::Mat Sorter<CPU_STL>::mat_eig;
template<>
struct Sorter<BITONIC>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count / 2, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
// 2^numStages should be equal to count or the output is invalid
int numStages = 0;
for(int i = count; i > 1; i >>= 1)
{
++numStages;
}
const int argc = 5;
std::vector< std::pair<size_t, const void *> > args(argc);
String kernelname = "sortCorners_bitonicSort";
args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
for(int stage = 0; stage < numStages; ++stage)
{
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
{
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
}
}
}
};
template<>
struct Sorter<SELECTION>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
std::vector< std::pair<size_t, const void *> > args;
//local
String kernelname = "sortCorners_selectionSortLocal";
int lds_size = GROUP_SIZE * sizeof(cl_float2);
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
args.push_back( std::make_pair( lds_size, (void*)NULL) );
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
//final
kernelname = "sortCorners_selectionSortFinal";
args.pop_back();
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
}
};
int findCorners_caller(
const TextureCL& eig,
const float threshold,
const oclMat& mask,
oclMat& corners,
const int max_count)
{
std::vector<int> k;
Context * cxt = Context::getContext();
std::vector< std::pair<size_t, const void*> > args;
String kernelname = "findCorners";
const int mask_strip = mask.step / mask.elemSize1();
oclMat g_counter(1, 1, CV_32SC1);
g_counter.setTo(0);
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eig ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&mask.data ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&corners.data ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&mask_strip));
args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
size_t globalThreads[3] = {eig.cols, eig.rows, 1};
size_t localThreads[3] = {16, 16, 1};
const char * opt = mask.empty() ? "" : "-D WITH_MASK";
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1, opt);
return std::min(Mat(g_counter).at<int>(0), max_count);
}
}//unnamed namespace
void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
{
CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
CV_DbgAssert(support_image2d());
ensureSizeIsEnough(image.size(), CV_32F, eig_);
if (useHarrisDetector)
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
else
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
double maxVal = 0;
minMax_buf(eig_, 0, &maxVal, oclMat(), minMaxbuf_);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
int total = findCorners_caller(
*eig_tex,
static_cast<float>(maxVal * qualityLevel),
mask,
tmpCorners_,
tmpCorners_.cols);
if (total == 0)
{
corners.release();
return;
}
if(use_cpu_sorter)
{
Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
}
else
{
//if total is power of 2
if(((total - 1) & (total)) == 0)
{
Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
else
{
Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
}
if (minDistance < 1)
{
corners = tmpCorners_(Rect(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1));
}
else
{
std::vector<Point2f> tmp(total);
downloadPoints(tmpCorners_, tmp);
std::vector<Point2f> tmp2;
tmp2.reserve(total);
const int cell_size = cvRound(minDistance);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
for (int i = 0; i < total; ++i)
{
Point2f p = tmp[i];
bool good = true;
int x_cell = static_cast<int>(p.x / cell_size);
int y_cell = static_cast<int>(p.y / cell_size);
int x1 = x_cell - 1;
int y1 = y_cell - 1;
int x2 = x_cell + 1;
int y2 = y_cell + 1;
// boundary check
x1 = std::max(0, x1);
y1 = std::max(0, y1);
x2 = std::min(grid_width - 1, x2);
y2 = std::min(grid_height - 1, y2);
for (int yy = y1; yy <= y2; yy++)
{
for (int xx = x1; xx <= x2; xx++)
{
std::vector<Point2f>& m = grid[yy * grid_width + xx];
if (!m.empty())
{
for(size_t j = 0; j < m.size(); j++)
{
float dx = p.x - m[j].x;
float dy = p.y - m[j].y;
if (dx * dx + dy * dy < minDistance * minDistance)
{
good = false;
goto break_out;
}
}
}
}
}
break_out:
if(good)
{
grid[y_cell * grid_width + x_cell].push_back(p);
tmp2.push_back(p);
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
break;
}
}
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
}
void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector<Point2f> &points_v)
{
CV_DbgAssert(points.type() == CV_32FC2);
points_v.resize(points.cols);
openCLSafeCall(clEnqueueReadBuffer(
*reinterpret_cast<cl_command_queue*>(getoclCommandQueue()),
reinterpret_cast<cl_mem>(points.data),
CL_TRUE,
0,
points.cols * sizeof(Point2f),
&points_v[0],
0,
NULL,
NULL));
}

View File

@@ -136,47 +136,22 @@ struct CvHidHaarClassifierCascade
};
typedef struct
{
//int rows;
//int ystep;
int width_height;
//int height;
int grpnumperline_totalgrp;
//int totalgrp;
int imgoff;
float factor;
} detect_piramid_info;
#if defined WIN32 && !defined __MINGW__ && !defined __MINGW32__
#ifdef WIN32
#define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT))
typedef _ALIGNED_ON(128) struct GpuHidHaarFeature
{
_ALIGNED_ON(32) struct
{
_ALIGNED_ON(4) int p0 ;
_ALIGNED_ON(4) int p1 ;
_ALIGNED_ON(4) int p2 ;
_ALIGNED_ON(4) int p3 ;
_ALIGNED_ON(4) float weight ;
}
/*_ALIGNED_ON(32)*/ rect[CV_HAAR_FEATURE_MAX] ;
}
GpuHidHaarFeature;
typedef _ALIGNED_ON(128) struct GpuHidHaarTreeNode
{
_ALIGNED_ON(64) int p[CV_HAAR_FEATURE_MAX][4];
//_ALIGNED_ON(16) int p1[CV_HAAR_FEATURE_MAX] ;
//_ALIGNED_ON(16) int p2[CV_HAAR_FEATURE_MAX] ;
//_ALIGNED_ON(16) int p3[CV_HAAR_FEATURE_MAX] ;
/*_ALIGNED_ON(16)*/
float weight[CV_HAAR_FEATURE_MAX] ;
/*_ALIGNED_ON(4)*/
float threshold ;
_ALIGNED_ON(8) float alpha[2] ;
_ALIGNED_ON(16) float alpha[3] ;
_ALIGNED_ON(4) int left ;
_ALIGNED_ON(4) int right ;
// GpuHidHaarFeature feature __attribute__((aligned (128)));
}
GpuHidHaarTreeNode;
@@ -184,7 +159,6 @@ GpuHidHaarTreeNode;
typedef _ALIGNED_ON(32) struct GpuHidHaarClassifier
{
_ALIGNED_ON(4) int count;
//CvHaarFeature* orig_feature;
_ALIGNED_ON(8) GpuHidHaarTreeNode *node ;
_ALIGNED_ON(8) float *alpha ;
}
@@ -219,32 +193,16 @@ typedef _ALIGNED_ON(64) struct GpuHidHaarClassifierCascade
_ALIGNED_ON(4) int p2 ;
_ALIGNED_ON(4) int p3 ;
_ALIGNED_ON(4) float inv_window_area ;
// GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
} GpuHidHaarClassifierCascade;
#else
#define _ALIGNED_ON(_ALIGNMENT) __attribute__((aligned(_ALIGNMENT) ))
typedef struct _ALIGNED_ON(128) GpuHidHaarFeature
{
struct _ALIGNED_ON(32)
{
int p0 _ALIGNED_ON(4);
int p1 _ALIGNED_ON(4);
int p2 _ALIGNED_ON(4);
int p3 _ALIGNED_ON(4);
float weight _ALIGNED_ON(4);
}
rect[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(32);
}
GpuHidHaarFeature;
typedef struct _ALIGNED_ON(128) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] _ALIGNED_ON(64);
float weight[CV_HAAR_FEATURE_MAX];// _ALIGNED_ON(16);
float threshold;// _ALIGNED_ON(4);
float alpha[2] _ALIGNED_ON(8);
float alpha[3] _ALIGNED_ON(16);
int left _ALIGNED_ON(4);
int right _ALIGNED_ON(4);
}
@@ -287,7 +245,6 @@ typedef struct _ALIGNED_ON(64) GpuHidHaarClassifierCascade
int p2 _ALIGNED_ON(4);
int p3 _ALIGNED_ON(4);
float inv_window_area _ALIGNED_ON(4);
// GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
} GpuHidHaarClassifierCascade;
#endif
@@ -295,36 +252,6 @@ const int icv_object_win_border = 1;
const float icv_stage_threshold_bias = 0.0001f;
double globaltime = 0;
// static CvHaarClassifierCascade * gpuCreateHaarClassifierCascade( int stage_count )
// {
// CvHaarClassifierCascade *cascade = 0;
// int block_size = sizeof(*cascade) + stage_count * sizeof(*cascade->stage_classifier);
// if( stage_count <= 0 )
// CV_Error( CV_StsOutOfRange, "Number of stages should be positive" );
// cascade = (CvHaarClassifierCascade *)cvAlloc( block_size );
// memset( cascade, 0, block_size );
// cascade->stage_classifier = (CvHaarStageClassifier *)(cascade + 1);
// cascade->flags = CV_HAAR_MAGIC_VAL;
// cascade->count = stage_count;
// return cascade;
// }
//static int globalcounter = 0;
// static void gpuReleaseHidHaarClassifierCascade( GpuHidHaarClassifierCascade **_cascade )
// {
// if( _cascade && *_cascade )
// {
// cvFree( _cascade );
// }
// }
/* create more efficient internal representation of haar classifier cascade */
static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarClassifierCascade *cascade, int *size, int *totalclassifier)
{
@@ -440,24 +367,12 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
hid_stage_classifier->two_rects = 1;
haar_classifier_ptr += stage_classifier->count;
/*
hid_stage_classifier->parent = (stage_classifier->parent == -1)
? NULL : stage_classifier_ptr + stage_classifier->parent;
hid_stage_classifier->next = (stage_classifier->next == -1)
? NULL : stage_classifier_ptr + stage_classifier->next;
hid_stage_classifier->child = (stage_classifier->child == -1)
? NULL : stage_classifier_ptr + stage_classifier->child;
out->is_tree |= hid_stage_classifier->next != NULL;
*/
for( j = 0; j < stage_classifier->count; j++ )
{
CvHaarClassifier *classifier = stage_classifier->classifier + j;
GpuHidHaarClassifier *hid_classifier = hid_stage_classifier->classifier + j;
int node_count = classifier->count;
// float* alpha_ptr = (float*)(haar_node_ptr + node_count);
float *alpha_ptr = &haar_node_ptr->alpha[0];
hid_classifier->count = node_count;
@@ -484,16 +399,12 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
node->p[2][3] = 0;
node->weight[2] = 0;
}
// memset( &(node->feature.rect[2]), 0, sizeof(node->feature.rect[2]) );
else
hid_stage_classifier->two_rects = 0;
memcpy( node->alpha, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
haar_node_ptr = haar_node_ptr + 1;
}
memcpy( alpha_ptr, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
haar_node_ptr = haar_node_ptr + 1;
// (GpuHidHaarTreeNode*)cvAlignPtr(alpha_ptr+node_count+1, sizeof(void*));
// (GpuHidHaarTreeNode*)(alpha_ptr+node_count+1);
out->is_stump_based &= node_count == 1;
}
}
@@ -506,25 +417,19 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
#define sum_elem_ptr(sum,row,col) \
((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
#define sqsum_elem_ptr(sqsum,row,col) \
((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
#define calc_sum(rect,offset) \
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_cascade,
/* const CvArr* _sum,
const CvArr* _sqsum,
const CvArr* _tilted_sum,*/
double scale,
int step)
{
// CvMat sum_stub, *sum = (CvMat*)_sum;
// CvMat sqsum_stub, *sqsum = (CvMat*)_sqsum;
// CvMat tilted_stub, *tilted = (CvMat*)_tilted_sum;
GpuHidHaarClassifierCascade *cascade;
int coi0 = 0, coi1 = 0;
int i;
@@ -540,61 +445,25 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
if( scale <= 0 )
CV_Error( CV_StsOutOfRange, "Scale must be positive" );
// sum = cvGetMat( sum, &sum_stub, &coi0 );
// sqsum = cvGetMat( sqsum, &sqsum_stub, &coi1 );
if( coi0 || coi1 )
CV_Error( CV_BadCOI, "COI is not supported" );
// if( !CV_ARE_SIZES_EQ( sum, sqsum ))
// CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
// if( CV_MAT_TYPE(sqsum->type) != CV_64FC1 ||
// CV_MAT_TYPE(sum->type) != CV_32SC1 )
// CV_Error( CV_StsUnsupportedFormat,
// "Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
if( !_cascade->hid_cascade )
gpuCreateHidHaarClassifierCascade(_cascade, &datasize, &total);
cascade = (GpuHidHaarClassifierCascade *) _cascade->hid_cascade;
stage_classifier = (GpuHidHaarStageClassifier *) (cascade + 1);
if( cascade->has_tilted_features )
{
// tilted = cvGetMat( tilted, &tilted_stub, &coi1 );
// if( CV_MAT_TYPE(tilted->type) != CV_32SC1 )
// CV_Error( CV_StsUnsupportedFormat,
// "Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
// if( sum->step != tilted->step )
// CV_Error( CV_StsUnmatchedSizes,
// "Sum and tilted_sum must have the same stride (step, widthStep)" );
// if( !CV_ARE_SIZES_EQ( sum, tilted ))
// CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
// cascade->tilted = *tilted;
}
_cascade->scale = scale;
_cascade->real_window_size.width = cvRound( _cascade->orig_window_size.width * scale );
_cascade->real_window_size.height = cvRound( _cascade->orig_window_size.height * scale );
//cascade->sum = *sum;
//cascade->sqsum = *sqsum;
equRect.x = equRect.y = cvRound(scale);
equRect.width = cvRound((_cascade->orig_window_size.width - 2) * scale);
equRect.height = cvRound((_cascade->orig_window_size.height - 2) * scale);
weight_scale = 1. / (equRect.width * equRect.height);
cascade->inv_window_area = weight_scale;
// cascade->pq0 = equRect.y * step + equRect.x;
// cascade->pq1 = equRect.y * step + equRect.x + equRect.width ;
// cascade->pq2 = (equRect.y + equRect.height)*step + equRect.x;
// cascade->pq3 = (equRect.y + equRect.height)*step + equRect.x + equRect.width ;
cascade->pq0 = equRect.x;
cascade->pq1 = equRect.y;
cascade->pq2 = equRect.x + equRect.width;
@@ -617,10 +486,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
{
CvHaarFeature *feature =
&_cascade->stage_classifier[i].classifier[j].haar_feature[l];
/* GpuHidHaarClassifier* classifier =
cascade->stage_classifier[i].classifier + j; */
//GpuHidHaarFeature* hidfeature =
// &cascade->stage_classifier[i].classifier[j].node[l].feature;
GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
double sum0 = 0, area0 = 0;
CvRect r[3];
@@ -635,8 +500,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
/* align blocks */
for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
{
//if( !hidfeature->rect[k].p0 )
// break;
if(!hidnode->p[k][0])
break;
r[k] = feature->rect[k].r;
@@ -716,15 +579,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
if( !feature->tilted )
{
/* hidfeature->rect[k].p0 = tr.y * sum->cols + tr.x;
hidfeature->rect[k].p1 = tr.y * sum->cols + tr.x + tr.width;
hidfeature->rect[k].p2 = (tr.y + tr.height) * sum->cols + tr.x;
hidfeature->rect[k].p3 = (tr.y + tr.height) * sum->cols + tr.x + tr.width;
*/
/*hidnode->p0[k] = tr.y * step + tr.x;
hidnode->p1[k] = tr.y * step + tr.x + tr.width;
hidnode->p2[k] = (tr.y + tr.height) * step + tr.x;
hidnode->p3[k] = (tr.y + tr.height) * step + tr.x + tr.width;*/
hidnode->p[k][0] = tr.x;
hidnode->p[k][1] = tr.y;
hidnode->p[k][2] = tr.x + tr.width;
@@ -732,37 +586,24 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
}
else
{
/* hidfeature->rect[k].p2 = (tr.y + tr.width) * tilted->cols + tr.x + tr.width;
hidfeature->rect[k].p3 = (tr.y + tr.width + tr.height) * tilted->cols + tr.x + tr.width - tr.height;
hidfeature->rect[k].p0 = tr.y * tilted->cols + tr.x;
hidfeature->rect[k].p1 = (tr.y + tr.height) * tilted->cols + tr.x - tr.height;
*/
hidnode->p[k][2] = (tr.y + tr.width) * step + tr.x + tr.width;
hidnode->p[k][3] = (tr.y + tr.width + tr.height) * step + tr.x + tr.width - tr.height;
hidnode->p[k][0] = tr.y * step + tr.x;
hidnode->p[k][1] = (tr.y + tr.height) * step + tr.x - tr.height;
}
//hidfeature->rect[k].weight = (float)(feature->rect[k].weight * correction_ratio);
hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
if( k == 0 )
area0 = tr.width * tr.height;
else
//sum0 += hidfeature->rect[k].weight * tr.width * tr.height;
sum0 += hidnode->weight[k] * tr.width * tr.height;
}
// hidfeature->rect[0].weight = (float)(-sum0/area0);
hidnode->weight[0] = (float)(-sum0 / area0);
} /* l */
} /* j */
}
}
static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
/*double scale=0.0,*/
/*int step*/)
static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade)
{
GpuHidHaarClassifierCascade *cascade;
int i;
@@ -816,11 +657,7 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
if(!hidnode->p[k][0])
break;
r[k] = feature->rect[k].r;
// base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].width-1) );
// base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].x - r[0].x-1) );
// base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].height-1) );
// base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].y - r[0].y-1) );
}
}
nr = k;
for( k = 0; k < nr; k++ )
@@ -838,7 +675,6 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
hidnode->p[k][3] = tr.height;
hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
}
//hidnode->weight[0]=(float)(-sum0/area0);
} /* l */
} /* j */
}
@@ -851,7 +687,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
const double GROUP_EPS = 0.2;
CvSeq *result_seq = 0;
cv::Ptr<CvMemStorage> temp_storage;
cv::ConcurrentRectVector allCandidates;
std::vector<cv::Rect> rectList;
@@ -909,6 +744,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if( gimg.cols < minSize.width || gimg.rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
if( (flags & CV_HAAR_SCALE_IMAGE) )
{
CvSize winSize0 = cascade->orig_window_size;
@@ -951,7 +787,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t blocksize = 8;
size_t localThreads[3] = { blocksize, blocksize , 1 };
size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0],
size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->computeUnits()) *localThreads[0],
localThreads[1], 1
};
int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -996,7 +832,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
@@ -1043,7 +878,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq ));
args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
@@ -1058,6 +895,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(nodebuffer));
openCLSafeCall(clReleaseMemObject(candidatebuffer));
}
else
{
@@ -1115,7 +953,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
nodenum * sizeof(GpuHidHaarTreeNode));
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
@@ -1157,7 +994,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args1.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnodenum ));
size_t globalThreads2[3] = {nodenum, 1, 1};
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
}
@@ -1193,7 +1029,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&nodenum ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
@@ -1281,7 +1117,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
int blocksize = 8;
int grp_per_CU = 12;
size_t localThreads[3] = { blocksize, blocksize, 1 };
size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0],
size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->computeUnits() *localThreads[0],
localThreads[1],
1 };
int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -1297,8 +1133,6 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
CvHaarClassifierCascade *cascade = oldCascade;
GpuHidHaarClassifierCascade *gcascade;
GpuHidHaarStageClassifier *stage;
GpuHidHaarClassifier *classifier;
GpuHidHaarTreeNode *node;
if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
@@ -1311,7 +1145,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
}
int *candidate;
cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
if( (flags & CV_HAAR_SCALE_IMAGE) )
{
int indexy = 0;
@@ -1337,19 +1171,6 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
sizeof(GpuHidHaarStageClassifier) * gcascade->count,
stage, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
m_nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
int startstage = 0;
int endstage = gcascade->count;
@@ -1386,17 +1207,23 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
memset(candidate, 0, 4 * sizeof(int) * outputsz);
openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
for(int i = 0; i < outputsz; i++)
{
if(candidate[4 * i + 2] != 0)
{
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
candidate[4 * i + 2], candidate[4 * i + 3]));
}
}
free((void *)candidate);
candidate = NULL;
}
@@ -1404,6 +1231,132 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
{
cv::ocl::integral(gimg, gsum, gsqsum);
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
int step = gsum.step / 4;
int startnode = 0;
int splitstage = 3;
int startstage = 0;
int endstage = gcascade->count;
vector<pair<size_t, const void *> > args;
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
for(int i = 0; i < outputsz; i++)
{
if(candidate[4 * i + 2] != 0)
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
candidate[4 * i + 2], candidate[4 * i + 3]));
}
clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
}
rectList.resize(allCandidates.size());
if(!allCandidates.empty())
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
if( minNeighbors != 0 || findBiggestObject )
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
else
rweights.resize(rectList.size(), 0);
GenResult(faces, rectList, rweights);
}
void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
double scaleFactor, int flags,
const int outputsz, const size_t localThreads[],
CvSize minSize, CvSize maxSize)
{
if(initialized)
{
return; // we only allow one time initialization
}
CvHaarClassifierCascade *cascade = oldCascade;
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
if( scaleFactor <= 1 )
CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
if( cols < minSize.width || rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
int datasize=0;
int totalclassifier=0;
if( !cascade->hid_cascade )
{
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
}
if( maxSize.height == 0 || maxSize.width == 0 )
{
maxSize.height = rows;
maxSize.width = cols;
}
findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
if( findBiggestObject )
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
m_scaleFactor = scaleFactor;
m_rows = rows;
m_cols = cols;
m_flags = flags;
m_minSize = minSize;
m_maxSize = maxSize;
// initialize nodes
GpuHidHaarClassifierCascade *gcascade;
GpuHidHaarStageClassifier *stage;
GpuHidHaarClassifier *classifier;
GpuHidHaarTreeNode *node;
cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
if( (flags & CV_HAAR_SCALE_IMAGE) )
{
gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
sizeof(GpuHidHaarStageClassifier) * gcascade->count,
stage, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
m_nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
}
else
{
gpuSetHaarClassifierCascade(cascade);
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
@@ -1411,15 +1364,12 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
m_nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
m_nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount);
float *correction = (float *)malloc(sizeof(float) * m_loopcount);
int startstage = 0;
int endstage = gcascade->count;
double factor;
for(int i = 0; i < m_loopcount; i++)
{
@@ -1445,105 +1395,15 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
size_t globalThreads2[3] = {m_nodenum, 1, 1};
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
openCLExecuteKernel(Context::getContext(), &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
}
int step = gsum.step / 4;
int startnode = 0;
int splitstage = 3;
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL));
vector<pair<size_t, const void *> > args;
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
for(int i = 0; i < outputsz; i++)
{
if(candidate[4 * i + 2] != 0)
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
candidate[4 * i + 2], candidate[4 * i + 3]));
}
free(p);
free(correction);
clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
}
rectList.resize(allCandidates.size());
if(!allCandidates.empty())
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
if( minNeighbors != 0 || findBiggestObject )
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
else
rweights.resize(rectList.size(), 0);
GenResult(faces, rectList, rweights);
}
void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
double scaleFactor, int flags,
const int outputsz, const size_t localThreads[],
CvSize minSize, CvSize maxSize)
{
CvHaarClassifierCascade *cascade = oldCascade;
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
if( scaleFactor <= 1 )
CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
if( cols < minSize.width || rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
int datasize=0;
int totalclassifier=0;
if( !cascade->hid_cascade )
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
if( maxSize.height == 0 || maxSize.width == 0 )
{
maxSize.height = rows;
maxSize.width = cols;
}
findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
if( findBiggestObject )
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
m_scaleFactor = scaleFactor;
m_rows = rows;
m_cols = cols;
m_flags = flags;
m_minSize = minSize;
m_maxSize = maxSize;
initialized = true;
}
@@ -1642,6 +1502,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
CvSize sz;
CvSize winSize0 = oldCascade->orig_window_size;
detect_piramid_info *scaleinfo;
cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
if (flags & CV_HAAR_SCALE_IMAGE)
{
for(factor = 1.f;; factor *= scaleFactor)
@@ -1743,7 +1604,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
}
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
sizeof(detect_piramid_info)*loopcount,
scaleinfo, 0, NULL, NULL));
free(scaleinfo);
@@ -1755,7 +1616,8 @@ void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& f
const std::vector<cv::Rect> &rectList,
const std::vector<int> &rweights)
{
CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) );
MemStorage tempStorage(cvCreateMemStorage(0));
CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), tempStorage );
if( findBiggestObject && rectList.size() )
{
@@ -1791,168 +1653,32 @@ void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& f
void cv::ocl::OclCascadeClassifierBuf::release()
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
if( (m_flags & CV_HAAR_SCALE_IMAGE) )
if(initialized)
{
cvFree(&oldCascade->hid_cascade);
}
else
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
free(buffers);
buffers = NULL;
if( (m_flags & CV_HAAR_SCALE_IMAGE) )
{
cvFree(&oldCascade->hid_cascade);
}
else
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
free(buffers);
buffers = NULL;
initialized = false;
}
}
#ifndef _MAX_PATH
#define _MAX_PATH 1024
#endif
/****************************************************************************************\
* Persistence functions *
\****************************************************************************************/
/* field names */
#define ICV_HAAR_SIZE_NAME "size"
#define ICV_HAAR_STAGES_NAME "stages"
#define ICV_HAAR_TREES_NAME "trees"
#define ICV_HAAR_FEATURE_NAME "feature"
#define ICV_HAAR_RECTS_NAME "rects"
#define ICV_HAAR_TILTED_NAME "tilted"
#define ICV_HAAR_THRESHOLD_NAME "threshold"
#define ICV_HAAR_LEFT_NODE_NAME "left_node"
#define ICV_HAAR_LEFT_VAL_NAME "left_val"
#define ICV_HAAR_RIGHT_NODE_NAME "right_node"
#define ICV_HAAR_RIGHT_VAL_NAME "right_val"
#define ICV_HAAR_STAGE_THRESHOLD_NAME "stage_threshold"
#define ICV_HAAR_PARENT_NAME "parent"
#define ICV_HAAR_NEXT_NAME "next"
static int gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade, CvPoint pt, int start_stage */)
{
return 1;
}
namespace cv
{
namespace ocl
{
struct gpuHaarDetectObjects_ScaleImage_Invoker
{
gpuHaarDetectObjects_ScaleImage_Invoker( const CvHaarClassifierCascade *_cascade,
int _stripSize, double _factor,
const Mat &_sum1, const Mat &_sqsum1, Mat *_norm1,
Mat *_mask1, Rect _equRect, ConcurrentRectVector &_vec )
{
cascade = _cascade;
stripSize = _stripSize;
factor = _factor;
sum1 = _sum1;
sqsum1 = _sqsum1;
norm1 = _norm1;
mask1 = _mask1;
equRect = _equRect;
vec = &_vec;
}
void operator()( const BlockedRange &range ) const
{
Size winSize0 = cascade->orig_window_size;
Size winSize(cvRound(winSize0.width * factor), cvRound(winSize0.height * factor));
int y1 = range.begin() * stripSize, y2 = std::min(range.end() * stripSize, sum1.rows - 1 - winSize0.height);
Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1);
int x, y, ystep = factor > 2 ? 1 : 2;
for( y = y1; y < y2; y += ystep )
for( x = 0; x < ssz.width; x += ystep )
{
if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 )
vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor),
winSize.width, winSize.height));
}
}
const CvHaarClassifierCascade *cascade;
int stripSize;
double factor;
Mat sum1, sqsum1, *norm1, *mask1;
Rect equRect;
ConcurrentRectVector *vec;
};
struct gpuHaarDetectObjects_ScaleCascade_Invoker
{
gpuHaarDetectObjects_ScaleCascade_Invoker( const CvHaarClassifierCascade *_cascade,
Size _winsize, const Range &_xrange, double _ystep,
size_t _sumstep, const int **_p, const int **_pq,
ConcurrentRectVector &_vec )
{
cascade = _cascade;
winsize = _winsize;
xrange = _xrange;
ystep = _ystep;
sumstep = _sumstep;
p = _p;
pq = _pq;
vec = &_vec;
}
void operator()( const BlockedRange &range ) const
{
int iy, startY = range.begin(), endY = range.end();
const int *p0 = p[0], *p1 = p[1], *p2 = p[2], *p3 = p[3];
const int *pq0 = pq[0], *pq1 = pq[1], *pq2 = pq[2], *pq3 = pq[3];
bool doCannyPruning = p0 != 0;
int sstep = (int)(sumstep / sizeof(p0[0]));
for( iy = startY; iy < endY; iy++ )
{
int ix, y = cvRound(iy * ystep), ixstep = 1;
for( ix = xrange.start; ix < xrange.end; ix += ixstep )
{
int x = cvRound(ix * ystep); // it should really be ystep, not ixstep
if( doCannyPruning )
{
int offset = y * sstep + x;
int s = p0[offset] - p1[offset] - p2[offset] + p3[offset];
int sq = pq0[offset] - pq1[offset] - pq2[offset] + pq3[offset];
if( s < 100 || sq < 20 )
{
ixstep = 2;
continue;
}
}
int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */);
if( result > 0 )
vec->push_back(Rect(x, y, winsize.width, winsize.height));
ixstep = result != 0 ? 1 : 2;
}
}
}
const CvHaarClassifierCascade *cascade;
double ystep;
size_t sumstep;
Size winsize;
Range xrange;
const int **p;
const int **pq;
ConcurrentRectVector *vec;
};
}
}
#endif

View File

@@ -270,7 +270,7 @@ namespace cv
size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
std::vector< std::pair<size_t, const void *> > args;
if(map1.channels() == 2)
{
@@ -292,7 +292,7 @@ namespace cv
args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
{
args.push_back( std::make_pair(sizeof(cl_double4), (void *)&borderValue));
}
@@ -326,7 +326,6 @@ namespace cv
}
else
{
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
args.push_back( std::make_pair(sizeof(cl_float4), (void *)&borderFloat));
}
}
@@ -1210,30 +1209,41 @@ namespace cv
void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
double k, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(Error::GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
oclMat Dx, Dy;
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), Dx, Dy, dst, borderType);
oclMat dx, dy;
cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
}
void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
double k, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(Error::GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
oclMat Dx, Dy;
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
extractCovData(src, dx, dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, Dx, Dy, dst, borderType);
corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
}
void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
{
oclMat dx, dy;
cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
}
void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(Error::GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, dx, dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
}
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)

View File

@@ -43,9 +43,28 @@
//
//M*/
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#include "precomp.hpp"
#ifdef __GNUC__
#if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402
#define GCC_DIAG_STR(s) #s
#define GCC_DIAG_JOINSTR(x,y) GCC_DIAG_STR(x ## y)
# define GCC_DIAG_DO_PRAGMA(x) _Pragma (#x)
# define GCC_DIAG_PRAGMA(x) GCC_DIAG_DO_PRAGMA(GCC diagnostic x)
# if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 406
# define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(push) \
GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
# define GCC_DIAG_ON(x) GCC_DIAG_PRAGMA(pop)
# else
# define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
# define GCC_DIAG_ON(x) GCC_DIAG_PRAGMA(warning GCC_DIAG_JOINSTR(-W,x))
# endif
#else
# define GCC_DIAG_OFF(x)
# define GCC_DIAG_ON(x)
#endif
#endif /* __GNUC__ */
using namespace std;
namespace cv
@@ -121,6 +140,9 @@ namespace cv
build_options, finish_mode);
}
#ifdef __GNUC__
GCC_DIAG_OFF(deprecated-declarations)
#endif
cl_mem bindTexture(const oclMat &mat)
{
cl_mem texture;
@@ -156,7 +178,7 @@ namespace cv
format.image_channel_order = CL_RGBA;
break;
default:
CV_Error(-1, "Image forma is not supported");
CV_Error(-1, "Image format is not supported");
break;
}
#ifdef CL_VERSION_1_2
@@ -180,10 +202,6 @@ namespace cv
else
#endif
{
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
texture = clCreateImage2D(
(cl_context)mat.clCxt->oclContext(),
CL_MEM_READ_WRITE,
@@ -193,9 +211,6 @@ namespace cv
0,
NULL,
&err);
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
}
size_t origin[] = { 0, 0, 0 };
size_t region[] = { mat.cols, mat.rows, 1 };
@@ -225,6 +240,14 @@ namespace cv
openCLSafeCall(err);
return texture;
}
#ifdef __GNUC__
GCC_DIAG_ON(deprecated-declarations)
#endif
Ptr<TextureCL> bindTexturePtr(const oclMat &mat)
{
return Ptr<TextureCL>(new TextureCL(bindTexture(mat), mat.rows, mat.cols, mat.type()));
}
void releaseTexture(cl_mem& texture)
{
openCLFree(texture);

View File

@@ -127,7 +127,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
#define dst_align ((dst_offset / 2) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -165,7 +165,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
#define dst_align ((dst_offset / 2) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -335,7 +335,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -375,7 +375,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -507,7 +507,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

View File

@@ -126,7 +126,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -164,7 +164,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -288,7 +288,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

View File

@@ -120,7 +120,7 @@ __kernel void morph_C1_D0(__global const uchar * restrict src,
int gidy = get_global_id(1);
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3)==0)
if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
{
*(__global uchar4*)&dst[out_addr] = res;
}

View File

@@ -10,6 +10,7 @@
// Wang Weiyan, wangweiyanster@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
@@ -45,27 +46,16 @@
typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
#ifndef STUMP_BASED
#define STUMP_BASED 1
#endif
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
float weight[CV_HAAR_FEATURE_MAX];
float threshold;
float alpha[3] __attribute__((aligned (16)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
@@ -111,7 +101,6 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
float inv_window_area __attribute__((aligned (4)));
} GpuHidHaarClassifierCascade;
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
@@ -234,7 +223,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
for(int nodeloop = 0; nodeloop < stageinfo.x; )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
@@ -242,7 +231,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=lcl_off;
@@ -261,8 +251,34 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += passThres ? alpha3.z : alpha3.y;
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (stage_sum >= stagethreshold);
@@ -301,18 +317,20 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
if(lcl_compute_win_id < queuecount)
{
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
const int stump_factor = STUMP_BASED ? 1 : 2;
int root_offset = 0;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
__global GpuHidHaarTreeNode* currentnodeptr =
nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
@@ -332,8 +350,34 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter +=lcl_compute_win;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
part_sum += passThres ? alpha3.y : alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
#else
if(root_offset == 0)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
root_offset = 1;
}
else
{
part_sum += alpha3.x;
tempnodecounter += lcl_compute_win;
lcl_loop++;
}
}
else
{
part_sum += passThres ? alpha3.z : alpha3.y;
tempnodecounter += lcl_compute_win;
lcl_loop++;
root_offset = 0;
}
#endif
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
@@ -377,155 +421,3 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}
/*
if(stagecascade->two_rects)
{
#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t1 = *(node + counter);
t = t1.threshold * variance_norm_factor;
classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
classsum += calc_sum1(t1, p_offset,1) * t1.weight[1];
stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
counter++;
}
}
else
{
#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t = node[counter].threshold*variance_norm_factor;
classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
if( node[counter].p0[2] )
classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
counter++;
}
}
*/
/*
__kernel void gpuRunHaarClassifierCascade_ScaleWindow(
constant GpuHidHaarClassifierCascade * _cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
//global GpuHidHaarClassifier * classifierptr,
global GpuHidHaarTreeNode * nodeptr,
global int * sum,
global float * sqsum,
global int * _candidate,
int pixel_step,
int cols,
int rows,
int start_stage,
int end_stage,
//int counts,
int nodenum,
int ystep,
int detect_width,
//int detect_height,
int loopcount,
int outputstep)
//float scalefactor)
{
unsigned int x1 = get_global_id(0);
unsigned int y1 = get_global_id(1);
int p_offset;
int m, n;
int result;
int counter;
float mean, variance_norm_factor;
for(int i=0;i<loopcount;i++)
{
constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
global int * candidate = _candidate + i*outputstep;
int window_width = cascade->p1 - cascade->p0;
int window_height = window_width;
result = 1;
counter = 0;
unsigned int x = mul24(x1,ystep);
unsigned int y = mul24(y1,ystep);
if((x < cols - window_width - 1) && (y < rows - window_height -1))
{
global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
//global GpuHidHaarClassifier *classifier = classifierptr;
global GpuHidHaarTreeNode *node = nodeptr + nodenum*i;
p_offset = mad24(y, pixel_step, x);// modify
mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
*(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
*cascade->inv_window_area;
variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
*(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
// if( cascade->is_stump_based )
//{
for( m = start_stage; m < end_stage; m++ )
{
float stage_sum = 0.f;
float t, classsum;
GpuHidHaarTreeNode t1;
//#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t1 = *(node + counter);
t = t1.threshold * variance_norm_factor;
classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1];
if((t1.p0[2]) && (!stagecascade->two_rects))
classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2];
stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
counter++;
}
if (stage_sum < stagecascade->threshold)
{
result = 0;
break;
}
stagecascade++;
}
if(result)
{
candidate[4 * (y1 * detect_width + x1)] = x;
candidate[4 * (y1 * detect_width + x1) + 1] = y;
candidate[4 * (y1 * detect_width + x1)+2] = window_width;
candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
}
//}
}
}
}
*/

View File

@@ -17,7 +17,7 @@
// @Authors
// Wu Xinglong, wxl370@126.com
// Sen Liu, swjtuls1987@126.com
//
// Peng Xiao, pengxiao@outlook.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
@@ -49,25 +49,13 @@
#define CV_HAAR_FEATURE_MAX 3
typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned(128))) GpuHidHaarFeature
{
struct __attribute__((aligned(32)))
{
int p0 __attribute__((aligned(4)));
int p1 __attribute__((aligned(4)));
int p2 __attribute__((aligned(4)));
int p3 __attribute__((aligned(4)));
float weight __attribute__((aligned(4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned(8)));
float alpha[3] __attribute__((aligned(16)));
int left __attribute__((aligned(4)));
int right __attribute__((aligned(4)));
}
@@ -174,45 +162,83 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
const int p_offset = mad24(y, step, x);
cascadeinfo.x += p_offset;
cascadeinfo.z += p_offset;
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
* correction_t;
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
bool result = true;
nodecounter = startnode + nodecount * scalei;
for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
{
float stage_sum = 0.f;
int stagecount = stagecascadeptr[stageloop].count;
for (int nodeloop = 0; nodeloop < stagecount; nodeloop++)
for (int nodeloop = 0; nodeloop < stagecount;)
{
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x += p_offset;
info1.z += p_offset;
info2.x += p_offset;
info2.z += p_offset;
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
info3.x += p_offset;
info3.z += p_offset;
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
- sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
+ sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
- sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
+ sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
- sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
+ sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
bool passThres = classsum >= nodethreshold;
#if STUMP_BASED
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += (passThres ? alpha3.z : alpha3.y);
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold);
result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -222,7 +248,6 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex] = (y << 16) | x;
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
@@ -277,5 +302,6 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
newnode[counter].threshold = t1.threshold;
newnode[counter].alpha[0] = t1.alpha[0];
newnode[counter].alpha[1] = t1.alpha[1];
newnode[counter].alpha[2] = t1.alpha[2];
}

View File

@@ -297,6 +297,9 @@ calcMap
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
mag += mag_offset;
map += map_offset;
__local float smem[18][18];
int gidx = get_global_id(0);
@@ -389,7 +392,7 @@ edgesHysteresisLocal
(
__global int * map,
__global ushort2 * st,
volatile __global unsigned int * counter,
__global unsigned int * counter,
int rows,
int cols,
int map_step,
@@ -399,6 +402,8 @@ edgesHysteresisLocal
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
map += map_offset;
__local int smem[18][18];
int gidx = get_global_id(0);
@@ -416,12 +421,12 @@ edgesHysteresisLocal
if(ly < 14)
{
smem[ly][lx] =
map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step + map_offset];
map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
}
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
{
smem[ly + 14][lx] =
map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step + map_offset];
map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -482,14 +487,17 @@ edgesHysteresisLocal
__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
#define stack_size 512
__kernel
void edgesHysteresisGlobal
void
__attribute__((reqd_work_group_size(128,1,1)))
edgesHysteresisGlobal
(
__global int * map,
__global ushort2 * st1,
__global ushort2 * st2,
volatile __global int * counter,
__global int * counter,
int rows,
int cols,
int count,
@@ -501,6 +509,8 @@ void edgesHysteresisGlobal
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
map += map_offset;
int gidx = get_global_id(0);
int gidy = get_global_id(1);
@@ -510,7 +520,7 @@ void edgesHysteresisGlobal
int grp_idx = get_group_id(0);
int grp_idy = get_group_id(1);
volatile __local unsigned int s_counter;
__local unsigned int s_counter;
__local unsigned int s_ind;
__local ushort2 s_st[stack_size];
@@ -564,9 +574,9 @@ void edgesHysteresisGlobal
pos.x += c_dx[lidx & 7];
pos.y += c_dy[lidx & 7];
if (map[pos.x + map_offset + pos.y * map_step] == 1)
if (map[pos.x + pos.y * map_step] == 1)
{
map[pos.x + map_offset + pos.y * map_step] = 2;
map[pos.x + pos.y * map_step] = 2;
ind = atomic_inc(&s_counter);
@@ -621,6 +631,6 @@ void getEdges
if(gidy < rows && gidx < cols)
{
dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] >> 1));
dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
}
}

View File

@@ -0,0 +1,276 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef WITH_MASK
#define WITH_MASK 0
#endif
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
{
return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
}
inline float ELEM_FLT2(image2d_t _eig, float2 pt)
{
return read_imagef(_eig, sampler, pt).x;
}
__kernel
void findCorners
(
image2d_t eig,
__global const char * mask,
__global float2 * corners,
const int mask_strip,// in pixels
const float threshold,
const int rows,
const int cols,
const int max_count,
__global int * g_counter
)
{
const int j = get_global_id(0);
const int i = get_global_id(1);
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1
#if WITH_MASK
&& mask[i * mask_strip + j] != 0
#endif
)
{
const float val = ELEM_INT2(eig, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = atomic_inc(g_counter);
if (ind < max_count)
corners[ind] = (float2)(j, i);
}
}
}
}
//bitonic sort
__kernel
void sortCorners_bitonicSort
(
image2d_t eig,
__global float2 * corners,
const int count,
const int stage,
const int passOfStage
)
{
const int threadId = get_global_id(0);
if(threadId >= count / 2)
{
return;
}
const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
const int pairDistance = 1 << (stage - passOfStage);
const int blockWidth = 2 * pairDistance;
const int leftId = min( (threadId % pairDistance)
+ (threadId / pairDistance) * blockWidth, count );
const int rightId = min( leftId + pairDistance, count );
const float2 leftPt = corners[leftId];
const float2 rightPt = corners[rightId];
const float leftVal = ELEM_FLT2(eig, leftPt);
const float rightVal = ELEM_FLT2(eig, rightPt);
const bool compareResult = leftVal > rightVal;
float2 greater = compareResult ? leftPt:rightPt;
float2 lesser = compareResult ? rightPt:leftPt;
corners[leftId] = sortOrder ? lesser : greater;
corners[rightId] = sortOrder ? greater : lesser;
}
//selection sort for gfft
//kernel is ported from Bolt library:
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
// Local sort will firstly sort elements of each workgroup using selection sort
// its performance is O(n)
__kernel
void sortCorners_selectionSortLocal
(
image2d_t eig,
__global float2 * corners,
const int count,
__local float2 * scratch
)
{
int i = get_local_id(0); // index in workgroup
int numOfGroups = get_num_groups(0); // index in workgroup
int groupID = get_group_id(0);
int wg = get_local_size(0); // workgroup size = block size
int n; // number of elements to be processed for this work group
int offset = groupID * wg;
int same = 0;
corners += offset;
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
float2 pt1, pt2;
pt1 = corners[min(i, n)];
scratch[i] = pt1;
barrier(CLK_LOCAL_MEM_FENCE);
if(i >= n)
{
return;
}
float val1 = ELEM_FLT2(eig, pt1);
float val2;
int pos = 0;
for (int j=0;j<n;++j)
{
pt2 = scratch[j];
val2 = ELEM_FLT2(eig, pt2);
if(val2 > val1)
pos++;//calculate the rank of this element in this work group
else
{
if(val1 > val2)
continue;
else
{
// val1 and val2 are same
same++;
}
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}
__kernel
void sortCorners_selectionSortFinal
(
image2d_t eig,
__global float2 * corners,
const int count
)
{
const int i = get_local_id(0); // index in workgroup
const int numOfGroups = get_num_groups(0); // index in workgroup
const int groupID = get_group_id(0);
const int wg = get_local_size(0); // workgroup size = block size
int pos = 0, same = 0;
const int offset = get_group_id(0) * wg;
const int remainder = count - wg*(numOfGroups-1);
if((offset + i ) >= count)
return;
float2 pt1, pt2;
pt1 = corners[groupID*wg + i];
float val1 = ELEM_FLT2(eig, pt1);
float val2;
for(int j=0; j<numOfGroups-1; j++ )
{
for(int k=0; k<wg; k++)
{
pt2 = corners[j*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Increment only if the value is not the same.
if( val2 > val1 )
pos++;
else
same++;
}
}
}
for(int k=0; k<remainder; k++)
{
pt2 = corners[(numOfGroups-1)*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Don't increment if the value is the same.
//Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false
if(val2 > val1)
pos++;
else
same++;
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}

View File

@@ -143,7 +143,7 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != 0 ? ddata : dVal;
ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;

View File

@@ -46,145 +46,10 @@
//#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (y < rows && x < cols * cn)
{
const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
const uchar src_val1 = (src + y * srcStep)[x];
const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
}
__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (y < rows && x < cols * cn)
{
const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
const uchar src_val1 = (src + y * srcStep)[x];
const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
}
}
__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int colsn = cols * cn;
if (y < rows && x < colsn)
{
__global const short* dx_buf_row = dx_buf + y * dx_bufStep;
__global const short* dy_buf_row = dy_buf + y * dy_bufStep;
const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
const int xl = x - cn >= 0 ? x - cn : cn + x;
((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
}
}
__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
const int colsn = cols * cn;
if (y < rows && x < colsn)
{
__global const short* dx_buf_row = dx_buf + y * dx_bufStep;
__global const short* dy_buf_row = dy_buf + y * dy_bufStep;
const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
const int xl = x - cn >= 0 ? x - cn : cn + x;
((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
}
}
#define W_BITS 14
#define W_BITS1 14
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y)
{
int2 ipt;
ipt.x = convert_int_sat_rtn(pt.x);
ipt.y = convert_int_sat_rtn(pt.y);
float a = pt.x - ipt.x;
float b = pt.y - ipt.y;
int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
__global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
__global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5);
}
int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y)
{
int2 ipt;
ipt.x = convert_int_sat_rtn(pt.x);
ipt.y = convert_int_sat_rtn(pt.y);
float a = pt.x - ipt.x;
float b = pt.y - ipt.y;
int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
__global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
__global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1);
}
float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y)
{
int2 ipt;
ipt.x = convert_int_sat_rtn(pt.x);
ipt.y = convert_int_sat_rtn(pt.y);
float a = pt.x - ipt.x;
float b = pt.y - ipt.y;
float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS));
float iw01 = (a * (1.0f - b) * (1 << W_BITS));
float iw10 = ((1.0f - a) * b * (1 << W_BITS));
float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
__global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn;
__global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn;
return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5;
}
#define BUFFER 64
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
#endif
#ifdef CPU
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
{
@@ -193,71 +58,51 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
smem3[tid] = val3 += smem3[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = val1 += smem1[tid + 32];
smem2[tid] = val2 += smem2[tid + 32];
smem3[tid] = val3 += smem3[tid + 32];
smem1[tid] += smem1[tid + 32];
smem2[tid] += smem2[tid + 32];
smem3[tid] += smem3[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = val1 += smem1[tid + 16];
smem2[tid] = val2 += smem2[tid + 16];
smem3[tid] = val3 += smem3[tid + 16];
smem1[tid] += smem1[tid + 16];
smem2[tid] += smem2[tid + 16];
smem3[tid] += smem3[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = val1 += smem1[tid + 8];
smem2[tid] = val2 += smem2[tid + 8];
smem3[tid] = val3 += smem3[tid + 8];
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
smem3[tid] += smem3[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = val1 += smem1[tid + 4];
smem2[tid] = val2 += smem2[tid + 4];
smem3[tid] = val3 += smem3[tid + 4];
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
smem3[tid] += smem3[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = val1 += smem1[tid + 2];
smem2[tid] = val2 += smem2[tid + 2];
smem3[tid] = val3 += smem3[tid + 2];
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
smem3[tid] += smem3[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = val1 += smem1[tid + 1];
smem2[BUFFER] = val2 += smem2[tid + 1];
smem3[BUFFER] = val3 += smem3[tid + 1];
smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -268,63 +113,45 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = (val1 += smem1[tid + 128]);
smem2[tid] = (val2 += smem2[tid + 128]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = (val1 += smem1[tid + 64]);
smem2[tid] = (val2 += smem2[tid + 64]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = (val1 += smem1[tid + 32]);
smem2[tid] = (val2 += smem2[tid + 32]);
smem1[tid] += smem1[tid + 32];
smem2[tid] += smem2[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = (val1 += smem1[tid + 16]);
smem2[tid] = (val2 += smem2[tid + 16]);
smem1[tid] += smem1[tid + 16];
smem2[tid] += smem2[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = (val1 += smem1[tid + 8]);
smem2[tid] = (val2 += smem2[tid + 8]);
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = (val1 += smem1[tid + 4]);
smem2[tid] = (val2 += smem2[tid + 4]);
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = (val1 += smem1[tid + 2]);
smem2[tid] = (val2 += smem2[tid + 2]);
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = (val1 += smem1[tid + 1]);
smem2[BUFFER] = (val2 += smem2[tid + 1]);
smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -334,205 +161,146 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = (val1 += smem1[tid + 128]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = (val1 += smem1[tid + 64]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
smem1[tid] = (val1 += smem1[tid + 32]);
smem1[tid] += smem1[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
smem1[tid] = (val1 += smem1[tid + 16]);
smem1[tid] += smem1[tid + 16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
smem1[tid] = (val1 += smem1[tid + 8]);
smem1[tid] += smem1[tid + 8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
{
smem1[tid] = (val1 += smem1[tid + 4]);
smem1[tid] += smem1[tid + 4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
{
smem1[tid] = (val1 += smem1[tid + 2]);
smem1[tid] += smem1[tid + 2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
{
smem1[BUFFER] = (val1 += smem1[tid + 1]);
smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#else
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
void reduce3(float val1, float val2, float val3,
__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
smem3[tid] = val3;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
smem3[tid] = val3 += smem3[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
volatile __local float* vmem1 = smem1;
volatile __local float* vmem2 = smem2;
volatile __local float* vmem3 = smem3;
smem1[tid] += smem1[tid + 32];
smem2[tid] += smem2[tid + 32];
smem3[tid] += smem3[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem1[tid] += smem1[tid + 16];
smem2[tid] += smem2[tid + 16];
smem3[tid] += smem3[tid + 16];
#if WAVE_SIZE <16
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) {
#endif
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
smem3[tid] += smem3[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem3[tid] = val3 += vmem3[tid + 32];
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
smem3[tid] += smem3[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem3[tid] = val3 += vmem3[tid + 16];
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
smem3[tid] += smem3[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem3[tid] = val3 += vmem3[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem3[tid] = val3 += vmem3[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem3[tid] = val3 += vmem3[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem3[tid] = val3 += vmem3[tid + 1];
smem1[tid] += smem1[tid + 1];
smem2[tid] += smem2[tid + 1];
smem3[tid] += smem3[tid + 1];
}
}
void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)
void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
volatile __local float* vmem1 = smem1;
volatile __local float* vmem2 = smem2;
smem1[tid] += smem1[tid + 32];
smem2[tid] += smem2[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem1[tid] += smem1[tid + 16];
smem2[tid] += smem2[tid + 16];
#if WAVE_SIZE <16
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) {
#endif
smem1[tid] += smem1[tid + 8];
smem2[tid] += smem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
smem1[tid] += smem1[tid + 4];
smem2[tid] += smem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
smem1[tid] += smem1[tid + 2];
smem2[tid] += smem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
smem1[tid] += smem1[tid + 1];
smem2[tid] += smem2[tid + 1];
}
}
void reduce1(float val1, __local float* smem1, int tid)
void reduce1(float val1, __local volatile float* smem1, int tid)
{
smem1[tid] = val1;
barrier(CLK_LOCAL_MEM_FENCE);
#if BUFFER > 128
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if BUFFER > 64
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
}
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (tid < 32)
{
volatile __local float* vmem1 = smem1;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
smem1[tid] += smem1[tid + 32];
#if WAVE_SIZE < 32
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16) {
#endif
smem1[tid] += smem1[tid + 16];
#if WAVE_SIZE <16
} barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8) {
#endif
smem1[tid] += smem1[tid + 8];
smem1[tid] += smem1[tid + 4];
smem1[tid] += smem1[tid + 2];
smem1[tid] += smem1[tid + 1];
}
}
#endif
#define SCALE (1.0f / (1 << 20))
#define THRESHOLD 0.01f
#define DIMENSION 21
// Image read mode
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

View File

@@ -61,6 +61,8 @@
#include <exception>
#include <stdio.h>
#undef OPENCV_NOSTL
#include "opencv2/imgproc.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/ocl.hpp"
@@ -74,6 +76,7 @@
#if defined (HAVE_OPENCL)
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#include "opencv2/ocl/private/util.hpp"
#include "safe_call.hpp"

View File

@@ -15,8 +15,8 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Dachuan Zhao, dachuan@multicorewareinc.com
// Yao Wang, yao@multicorewareinc.com
// Dachuan Zhao, dachuan@multicorewareinc.com
// Yao Wang, yao@multicorewareinc.com
// Nathan, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
@@ -54,35 +54,20 @@ namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *pyrlk;
extern const char *pyrlk_no_image;
extern const char *operator_setTo;
extern const char *operator_convertTo;
extern const char *operator_copyToM;
extern const char *arithm_mul;
extern const char *pyr_down;
}
}
struct dim3
{
unsigned int x, y, z;
};
struct float2
{
float x, y;
};
struct int2
{
int x, y;
};
namespace
{
void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
static void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
{
winSize.width *= cn;
@@ -102,12 +87,6 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe
block.z = patch.z = 1;
}
}
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
@@ -448,89 +427,6 @@ static void copyTo(const oclMat &src, oclMat &m )
src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset);
}
// static void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask)
// {
// if (mask.empty())
// {
// copyTo(src, mat);
// }
// else
// {
// mat.create(src.size(), src.type());
// copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask");
// }
// }
static void arithmetic_run(const oclMat &src1, oclMat &dst, String kernelName, const char **kernelString, void *_scalar)
{
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
{
CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
return;
}
//dst.create(src1.size(), src1.type());
//CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
// src1.rows == src2.rows && src2.rows == dst.rows);
CV_Assert(src1.cols == dst.cols &&
src1.rows == dst.rows);
CV_Assert(src1.type() == dst.type());
CV_Assert(src1.depth() != CV_8S);
Context *clCxt = src1.clCxt;
//int channels = dst.channels();
//int depth = dst.depth();
//int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1},
// {4, 0, 4, 4, 1, 1, 1}
//};
//size_t vector_length = vector_lengths[channels-1][depth];
//int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
//int cols = divUp(dst.cols * channels + offset_cols, vector_length);
size_t localThreads[3] = { 16, 16, 1 };
//size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
// divUp(dst.rows, localThreads[1]) * localThreads[1],
// 1
// };
size_t globalThreads[3] = { src1.cols,
src1.rows,
1
};
int dst_step1 = dst.cols * dst.elemSize();
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
//args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
//args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
//if(_scalar != NULL)
//{
float scalar1 = *((float *)_scalar);
args.push_back( std::make_pair( sizeof(float), (float *)&scalar1 ));
//}
openCLExecuteKernel2(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, src1.depth(), CLFLUSH);
}
static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar)
{
arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar));
}
static void pyrdown_run_cus(const oclMat &src, const oclMat &dst)
{
@@ -576,15 +472,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
size_t localThreads[3] = { 8, isImageSupported ? 8 : 32, 1 };
size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
int cn = I.oclchannels();
char calcErr;
if (level == 0)
{
calcErr = 1;
}
else
{
calcErr = 0;
}
char calcErr = level==0?1:0;
std::vector<std::pair<size_t , const void *> > args;
@@ -614,7 +502,16 @@ static void lkSparse_run(oclMat &I, oclMat &J,
if(isImageSupported)
{
openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
std::stringstream idxStr;
idxStr << kernelName.c_str() << "_C" << I.oclchannels() << "_D" << I.depth();
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str().c_str());
int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
openCLSafeCall(clReleaseKernel(kernel));
static char opt[16] = {0};
sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), opt, CLFLUSH);
releaseTexture(ITex);
releaseTexture(JTex);
}
@@ -656,9 +553,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
oclMat temp2 = nextPts.reshape(1);
//oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f));
multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f);
//::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
multiply(1.0f/(1<<maxLevel)/2.0f, temp1, temp2);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
//status.setTo(Scalar::all(1));
@@ -675,7 +570,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
//ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err);
// build the image pyramids.
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
@@ -703,7 +597,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
}
// dI/dx ~ Ix, dI/dy ~ Iy
for (int level = maxLevel; level >= 0; level--)
{
lkSparse_run(prevPyr_[level], nextPyr_[level],

View File

@@ -47,7 +47,7 @@
#define __OPENCV_OPENCL_SAFE_CALL_HPP__
#if defined __APPLE__
#include <OpenCL/OpenCL.h>
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

View File

@@ -73,7 +73,6 @@ TEST_P(Canny, Accuracy)
double low_thresh = 50.0;
double high_thresh = 100.0;
cv::resize(img, img, cv::Size(512, 384));
cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);
cv::ocl::oclMat edges;

View File

@@ -55,6 +55,12 @@ using namespace testing;
using namespace std;
using namespace cv;
extern string workdir;
namespace
{
IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
struct getRect
{
Rect operator ()(const CvAvgComp &e) const
@@ -62,23 +68,24 @@ struct getRect
return e.rect;
}
};
}
PARAM_TEST_CASE(Haar, double, int)
PARAM_TEST_CASE(Haar, double, int, CascadeName)
{
cv::ocl::OclCascadeClassifier cascade, nestedCascade;
cv::ocl::OclCascadeClassifierBuf cascadebuf;
cv::CascadeClassifier cpucascade, cpunestedCascade;
double scale;
int flags;
std::string cascadeName;
virtual void SetUp()
{
scale = GET_PARAM(0);
flags = GET_PARAM(1);
string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
cascadeName = (workdir + "../../data/haarcascades/").append(GET_PARAM(2));
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName )))
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) )
{
cout << "ERROR: Could not load classifier cascade" << endl;
return;
@@ -115,7 +122,7 @@ TEST_P(Haar, FaceDetect)
Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
oclfaces.resize(vecAvgComp.size());
std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
@@ -136,7 +143,6 @@ TEST_P(Haar, FaceDetectUseBuf)
vector<Rect> faces, oclfaces;
Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
MemStorage storage(cvCreateMemStorage(0));
cvtColor( img, gray, CV_BGR2GRAY );
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
equalizeHist( smallImg, smallImg );
@@ -144,19 +150,31 @@ TEST_P(Haar, FaceDetectUseBuf)
cv::ocl::oclMat image;
image.upload(smallImg);
cv::ocl::OclCascadeClassifierBuf cascadebuf;
if( !cascadebuf.load( cascadeName ) )
{
cout << "ERROR: Could not load classifier cascade for FaceDetectUseBuf!" << endl;
return;
}
cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
cascadebuf.release();
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
EXPECT_EQ(faces.size(), oclfaces.size());
// intentionally run ocl facedetect again and check if it still works after the first run
cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3,
flags,
Size(30, 30));
cascadebuf.release();
EXPECT_EQ(faces.size(), oclfaces.size());
}
INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
Combine(Values(1.0),
Values(CV_HAAR_SCALE_IMAGE, 0)));
Values(CV_HAAR_SCALE_IMAGE, 0), Values(cascade_frontalface_alt, cascade_frontalface_alt2)));
#endif // HAVE_OPENCL

View File

@@ -55,6 +55,83 @@ using namespace testing;
using namespace std;
extern string workdir;
//////////////////////////////////////////////////////
// GoodFeaturesToTrack
namespace
{
IMPLEMENT_PARAM_CLASS(MinDistance, double)
}
PARAM_TEST_CASE(GoodFeaturesToTrack, MinDistance)
{
double minDistance;
virtual void SetUp()
{
minDistance = GET_PARAM(0);
}
};
TEST_P(GoodFeaturesToTrack, Accuracy)
{
cv::Mat frame = readImage(workdir + "../gpu/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame.empty());
int maxCorners = 1000;
double qualityLevel = 0.01;
cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
cv::ocl::oclMat d_pts;
detector(oclMat(frame), d_pts);
ASSERT_FALSE(d_pts.empty());
std::vector<cv::Point2f> pts(d_pts.cols);
detector.downloadPoints(d_pts, pts);
std::vector<cv::Point2f> pts_gold;
cv::goodFeaturesToTrack(frame, pts_gold, maxCorners, qualityLevel, minDistance);
ASSERT_EQ(pts_gold.size(), pts.size());
size_t mistmatch = 0;
for (size_t i = 0; i < pts.size(); ++i)
{
cv::Point2i a = pts_gold[i];
cv::Point2i b = pts[i];
bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
if (!eq)
++mistmatch;
}
double bad_ratio = static_cast<double>(mistmatch) / pts.size();
ASSERT_LE(bad_ratio, 0.01);
}
TEST_P(GoodFeaturesToTrack, EmptyCorners)
{
int maxCorners = 1000;
double qualityLevel = 0.01;
cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
cv::ocl::oclMat src(100, 100, CV_8UC1, cv::Scalar::all(0));
cv::ocl::oclMat corners(1, maxCorners, CV_32FC2);
detector(src, corners);
ASSERT_TRUE(corners.empty());
}
INSTANTIATE_TEST_CASE_P(OCL_Video, GoodFeaturesToTrack,
testing::Values(MinDistance(0.0), MinDistance(3.0)));
//////////////////////////////////////////////////////////////////////////
PARAM_TEST_CASE(TVL1, bool)
{

View File

@@ -59,17 +59,17 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
switch (src.type()) {
case CV_8U:
parallel_for(cv::BlockedRange(0, src.rows),
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<uchar>(
src, dst, templateWindowSize, searchWindowSize, h));
break;
case CV_8UC2:
parallel_for(cv::BlockedRange(0, src.rows),
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<cv::Vec2b>(
src, dst, templateWindowSize, searchWindowSize, h));
break;
case CV_8UC3:
parallel_for(cv::BlockedRange(0, src.rows),
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<cv::Vec3b>(
src, dst, templateWindowSize, searchWindowSize, h));
break;
@@ -159,19 +159,19 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
switch (srcImgs[0].type()) {
case CV_8U:
parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<uchar>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, h));
break;
case CV_8UC2:
parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<cv::Vec2b>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, h));
break;
case CV_8UC3:
parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<cv::Vec3b>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, h));

View File

@@ -51,12 +51,12 @@
using namespace cv;
template <typename T>
struct FastNlMeansDenoisingInvoker {
struct FastNlMeansDenoisingInvoker : ParallelLoopBody {
public:
FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst,
int template_window_size, int search_window_size, const float h);
void operator() (const BlockedRange& range) const;
void operator() (const Range& range) const;
private:
void operator= (const FastNlMeansDenoisingInvoker&);
@@ -152,9 +152,9 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
}
template <class T>
void FastNlMeansDenoisingInvoker<T>::operator() (const BlockedRange& range) const {
int row_from = range.begin();
int row_to = range.end() - 1;
void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const {
int row_from = range.start;
int row_to = range.end - 1;
Array2d<int> dist_sums(search_window_size_, search_window_size_);

View File

@@ -51,13 +51,13 @@
using namespace cv;
template <typename T>
struct FastNlMeansMultiDenoisingInvoker {
struct FastNlMeansMultiDenoisingInvoker : ParallelLoopBody {
public:
FastNlMeansMultiDenoisingInvoker(
const std::vector<Mat>& srcImgs, int imgToDenoiseIndex, int temporalWindowSize,
Mat& dst, int template_window_size, int search_window_size, const float h);
void operator() (const BlockedRange& range) const;
void operator() (const Range& range) const;
private:
void operator= (const FastNlMeansMultiDenoisingInvoker&);
@@ -171,9 +171,9 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
}
template <class T>
void FastNlMeansMultiDenoisingInvoker<T>::operator() (const BlockedRange& range) const {
int row_from = range.begin();
int row_to = range.end() - 1;
void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const {
int row_from = range.start;
int row_to = range.end - 1;
Array3d<int> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);

View File

@@ -43,8 +43,9 @@
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include "opencv2/photo.hpp"
#include "opencv2/core/private.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/photo.hpp"
#ifdef HAVE_TEGRA_OPTIMIZATION
#include "opencv2/photo/photo_tegra.hpp"

View File

@@ -65,7 +65,7 @@ struct DistIdxPair
};
struct MatchPairsBody
struct MatchPairsBody : ParallelLoopBody
{
MatchPairsBody(const MatchPairsBody& other)
: matcher(other.matcher), features(other.features),
@@ -76,10 +76,10 @@ struct MatchPairsBody
: matcher(_matcher), features(_features),
pairwise_matches(_pairwise_matches), near_pairs(_near_pairs) {}
void operator ()(const BlockedRange &r) const
void operator ()(const Range &r) const
{
const int num_images = static_cast<int>(features.size());
for (int i = r.begin(); i < r.end(); ++i)
for (int i = r.start; i < r.end; ++i)
{
int from = near_pairs[i].first;
int to = near_pairs[i].second;
@@ -525,9 +525,9 @@ void FeaturesMatcher::operator ()(const std::vector<ImageFeatures> &features, st
MatchPairsBody body(*this, features, pairwise_matches, near_pairs);
if (is_thread_safe_)
parallel_for(BlockedRange(0, static_cast<int>(near_pairs.size())), body);
parallel_for_(Range(0, static_cast<int>(near_pairs.size())), body);
else
body(BlockedRange(0, static_cast<int>(near_pairs.size())));
body(Range(0, static_cast<int>(near_pairs.size())));
LOGLN_CHAT("");
}

View File

@@ -53,6 +53,7 @@
#include <sstream>
#include <cmath>
#include "opencv2/core.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/stitching.hpp"
#include "opencv2/stitching/detail/autocalib.hpp"
#include "opencv2/stitching/detail/blenders.hpp"

View File

@@ -702,14 +702,14 @@ void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask,
parallel_for_(Range(0, image.rows),
MOG2Invoker(image, fgmask,
(GMM*)bgmodel.data,
(float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
bgmodelUsedModes.data, nmixtures, (float)learningRate,
(float)varThreshold,
backgroundRatio, varThresholdGen,
fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
bShadowDetection, nShadowDetection),
image.total()/(double)(1 << 16));
(GMM*)bgmodel.data,
(float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
bgmodelUsedModes.data, nmixtures, (float)learningRate,
(float)varThreshold,
backgroundRatio, varThresholdGen,
fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
bShadowDetection, nShadowDetection),
image.total()/(double)(1 << 16));
}
void BackgroundSubtractorMOG2Impl::getBackgroundImage(OutputArray backgroundImage) const

View File

@@ -167,7 +167,7 @@ typedef float acctype;
typedef float itemtype;
#endif
void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
{
Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
const Mat& I = *prevImg;
@@ -181,7 +181,7 @@ void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);
for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ )
for( int ptidx = range.start; ptidx < range.end; ptidx++ )
{
Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
Point2f nextPt;
@@ -746,11 +746,11 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
#endif
parallel_for(BlockedRange(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
nextPyr[level * lvlStep2], prevPts, nextPts,
status, err,
winSize, criteria, level, maxLevel,
flags, (float)minEigThreshold));
parallel_for_(Range(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
nextPyr[level * lvlStep2], prevPts, nextPts,
status, err,
winSize, criteria, level, maxLevel,
flags, (float)minEigThreshold));
}
}

View File

@@ -7,7 +7,7 @@ namespace detail
typedef short deriv_type;
struct LKTrackerInvoker
struct LKTrackerInvoker : ParallelLoopBody
{
LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
const Point2f* _prevPts, Point2f* _nextPts,
@@ -15,7 +15,7 @@ namespace detail
Size _winSize, TermCriteria _criteria,
int _level, int _maxLevel, int _flags, float _minEigThreshold );
void operator()(const BlockedRange& range) const;
void operator()(const Range& range) const;
const Mat* prevImg;
const Mat* nextImg;