From fac3d9994c8cb8308ac2919a459abae349ed2f7e Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 31 Jul 2012 19:07:55 +0400 Subject: [PATCH 1/9] integrated another portion of SSE optimizations from Grigory Frolov --- cmake/OpenCVCompilerOptions.cmake | 2 +- .../core/include/opencv2/core/internal.hpp | 12 +- modules/core/src/lapack.cpp | 164 ++++++++--------- modules/imgproc/src/shapedescr.cpp | 172 +++++++++++------- modules/objdetect/src/haar.cpp | 29 ++- 5 files changed, 215 insertions(+), 164 deletions(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index c0d109a26..2cfcbf50c 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -139,7 +139,7 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_SSSE3) add_extra_compiler_option(-mssse3) endif() - if(HAVE_GCC43_OR_NEWER) + if(HAVE_GCC43_OR_NEWER OR APPLE) if(ENABLE_SSE41) add_extra_compiler_option(-msse4.1) endif() diff --git a/modules/core/include/opencv2/core/internal.hpp b/modules/core/include/opencv2/core/internal.hpp index 2fe56cd7d..235abcc3c 100644 --- a/modules/core/include/opencv2/core/internal.hpp +++ b/modules/core/include/opencv2/core/internal.hpp @@ -120,17 +120,23 @@ CV_INLINE IppiSize ippiSize(int width, int height) # else # define CV_SSSE3 0 # endif -# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1600) +# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500) # include # define CV_SSE4_1 1 +# else +# define CV_SSE4_1 0 # endif -# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1600) +# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500) # include # define CV_SSE4_2 1 +# else +# define CV_SSE4_2 0 # endif -# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600) +# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) # include # define CV_AVX 1 +# else +# define CV_AVX 0 # endif # else # define CV_SSE 0 diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index 74c6edd3b..1c76df641 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -954,7 +954,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) size_t esz = CV_ELEM_SIZE(type); int m = src.rows, n = src.cols; - if( method == DECOMP_SVD ) + if( method == DECOMP_SVD ) { int nm = std::min(m, n); @@ -1010,82 +1010,84 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) if( type == CV_32FC1 ) { double d = det2(Sf); - #if CV_SSE4_2 - if(USE_SSE4_2) - { - __m128 zero = _mm_setzero_ps(); - __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1) - __m128 t1 = _mm_loadh_pi(zero,(const __m64*)((const float*)(srcdata+srcstep))); //t1 = sf(1,0) sf(1,1) - __m128 s0 = _mm_blend_ps(t0,t1,12); - d = 1./d; - result = true; - __m128 det =_mm_set1_ps((float)d); - s0 = _mm_mul_ps(s0, det); - const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0}; - __m128 pattern = _mm_load_ps((const float*)inv); - s0 = _mm_xor_ps(s0, pattern);//==-1*s0 - s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3)); - _mm_storel_pi((__m64*)dstdata, s0); - _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0); - } - #else - if( d != 0. ) + if( d != 0. ) { - double t0, t1; - result = true; - d = 1./d; - t0 = Sf(0,0)*d; - t1 = Sf(1,1)*d; - Df(1,1) = (float)t0; - Df(0,0) = (float)t1; - t0 = -Sf(0,1)*d; - t1 = -Sf(1,0)*d; - Df(0,1) = (float)t0; - Df(1,0) = (float)t1; - } - #endif + result = true; + d = 1./d; + + #if CV_SSE2 + if(USE_SSE2) + { + __m128 zero = _mm_setzero_ps(); + __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1) + __m128 t1 = _mm_loadh_pi(zero, (const __m64*)(srcdata+srcstep)); //t1 = sf(1,0) sf(1,1) + __m128 s0 = _mm_or_ps(t0, t1); + __m128 det =_mm_set1_ps((float)d); + s0 = _mm_mul_ps(s0, det); + const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0}; + __m128 pattern = _mm_load_ps((const float*)inv); + s0 = _mm_xor_ps(s0, pattern);//==-1*s0 + s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3)); + _mm_storel_pi((__m64*)dstdata, s0); + _mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0); + } + else + #endif + { + double t0, t1; + t0 = Sf(0,0)*d; + t1 = Sf(1,1)*d; + Df(1,1) = (float)t0; + Df(0,0) = (float)t1; + t0 = -Sf(0,1)*d; + t1 = -Sf(1,0)*d; + Df(0,1) = (float)t0; + Df(1,0) = (float)t1; + } + + } } else { double d = det2(Sd); - #if CV_SSE2 - if(USE_SSE2) - { - __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1) - __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1) - __m128d sm = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(1,0)); //sm = sf(0,0) sf(1,1) - main diagonal - __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //sm = sf(0,1) sf(1,0) - secondary diagonal - result = true; - d = 1./d; - __m128d det = _mm_load1_pd((const double*)&d); - sm = _mm_mul_pd(sm, det); - //__m128d pattern = _mm_set1_pd(-1.); - static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80}; - __m128d pattern = _mm_load1_pd((double*)inv); - ss = _mm_mul_pd(ss, det); - ss = _mm_xor_pd(ss, pattern);//==-1*ss - //ss = _mm_mul_pd(ss,pattern); - s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1)); - s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1)); - _mm_store_pd((double*)dstdata, s0); - _mm_store_pd((double*)(dstdata+dststep), s1); - } - #else - if( d != 0. ) + if( d != 0. ) { - double t0, t1; - result = true; - d = 1./d; - t0 = Sd(0,0)*d; - t1 = Sd(1,1)*d; - Dd(1,1) = t0; - Dd(0,0) = t1; - t0 = -Sd(0,1)*d; - t1 = -Sd(1,0)*d; - Dd(0,1) = t0; - Dd(1,0) = t1; - } - #endif + result = true; + d = 1./d; + #if CV_SSE2 + if(USE_SSE2) + { + __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1) + __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1) + __m128d sm = _mm_unpacklo_pd(s0, _mm_load_sd((const double*)(srcdata+srcstep)+1)); //sm = sf(0,0) sf(1,1) - main diagonal + __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //ss = sf(0,1) sf(1,0) - secondary diagonal + __m128d det = _mm_load1_pd((const double*)&d); + sm = _mm_mul_pd(sm, det); + + uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80}; + __m128d pattern = _mm_load1_pd((double*)inv); + ss = _mm_mul_pd(ss, det); + ss = _mm_xor_pd(ss, pattern);//==-1*ss + + s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1)); + s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1)); + _mm_storeu_pd((double*)dstdata, s0); + _mm_storeu_pd((double*)(dstdata+dststep), s1); + } + else + #endif + { + double t0, t1; + t0 = Sd(0,0)*d; + t1 = Sd(1,1)*d; + Dd(1,1) = t0; + Dd(0,0) = t1; + t0 = -Sd(0,1)*d; + t1 = -Sd(1,0)*d; + Dd(0,1) = t0; + Dd(1,0) = t1; + } + } } } else if( n == 3 ) @@ -1095,18 +1097,17 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) double d = det3(Sf); if( d != 0. ) { - float t[9]; result = true; d = 1./d; - + float t[9]; t[0] = (float)(((double)Sf(1,1) * Sf(2,2) - (double)Sf(1,2) * Sf(2,1)) * d); t[1] = (float)(((double)Sf(0,2) * Sf(2,1) - (double)Sf(0,1) * Sf(2,2)) * d); t[2] = (float)(((double)Sf(0,1) * Sf(1,2) - (double)Sf(0,2) * Sf(1,1)) * d); - + t[3] = (float)(((double)Sf(1,2) * Sf(2,0) - (double)Sf(1,0) * Sf(2,2)) * d); t[4] = (float)(((double)Sf(0,0) * Sf(2,2) - (double)Sf(0,2) * Sf(2,0)) * d); t[5] = (float)(((double)Sf(0,2) * Sf(1,0) - (double)Sf(0,0) * Sf(1,2)) * d); - + t[6] = (float)(((double)Sf(1,0) * Sf(2,1) - (double)Sf(1,1) * Sf(2,0)) * d); t[7] = (float)(((double)Sf(0,1) * Sf(2,0) - (double)Sf(0,0) * Sf(2,1)) * d); t[8] = (float)(((double)Sf(0,0) * Sf(1,1) - (double)Sf(0,1) * Sf(1,0)) * d); @@ -1121,18 +1122,18 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) double d = det3(Sd); if( d != 0. ) { + result = true; + d = 1./d; double t[9]; - result = true; - d = 1./d; t[0] = (Sd(1,1) * Sd(2,2) - Sd(1,2) * Sd(2,1)) * d; t[1] = (Sd(0,2) * Sd(2,1) - Sd(0,1) * Sd(2,2)) * d; t[2] = (Sd(0,1) * Sd(1,2) - Sd(0,2) * Sd(1,1)) * d; - + t[3] = (Sd(1,2) * Sd(2,0) - Sd(1,0) * Sd(2,2)) * d; t[4] = (Sd(0,0) * Sd(2,2) - Sd(0,2) * Sd(2,0)) * d; t[5] = (Sd(0,2) * Sd(1,0) - Sd(0,0) * Sd(1,2)) * d; - + t[6] = (Sd(1,0) * Sd(2,1) - Sd(1,1) * Sd(2,0)) * d; t[7] = (Sd(0,1) * Sd(2,0) - Sd(0,0) * Sd(2,1)) * d; t[8] = (Sd(0,0) * Sd(1,1) - Sd(0,1) * Sd(1,0)) * d; @@ -1171,7 +1172,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) return result; } - int elem_size = CV_ELEM_SIZE(type); + int elem_size = CV_ELEM_SIZE(type); AutoBuffer buf(n*n*elem_size); Mat src1(n, n, type, (uchar*)buf); src.copyTo(src1); @@ -1193,6 +1194,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method ) } + /****************************************************************************************\ * Solving a linear system * \****************************************************************************************/ @@ -1603,7 +1605,7 @@ void SVD::backSubst( InputArray _w, InputArray _u, InputArray _vt, Mat w = _w.getMat(), u = _u.getMat(), vt = _vt.getMat(), rhs = _rhs.getMat(); int type = w.type(), esz = (int)w.elemSize(); int m = u.rows, n = vt.cols, nb = rhs.data ? rhs.cols : m, nm = std::min(m, n); - size_t wstep = w.rows == 1 ? esz : w.cols == 1 ? (size_t)w.step : (size_t)w.step + esz; + size_t wstep = w.rows == 1 ? (size_t)esz : w.cols == 1 ? (size_t)w.step : (size_t)w.step + esz; AutoBuffer buffer(nb*sizeof(double) + 16); CV_Assert( w.type() == u.type() && u.type() == vt.type() && u.data && vt.data && w.data ); CV_Assert( u.cols >= nm && vt.rows >= nm && diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp index 36c0c6c64..9a27b9f17 100644 --- a/modules/imgproc/src/shapedescr.cpp +++ b/modules/imgproc/src/shapedescr.cpp @@ -951,9 +951,6 @@ cvBoundingRect( CvArr* array, int update ) if( ptseq->header_size < (int)sizeof(CvContour)) { - /*if( update == 1 ) - CV_Error( CV_StsBadArg, "The header is too small to fit the rectangle, " - "so it could not be updated" );*/ update = 0; calculate = 1; } @@ -1067,86 +1064,123 @@ cvBoundingRect( CvArr* array, int update ) if( xmin >= size.width ) xmin = ymin = 0; - } - else if( ptseq->total ) - { - int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2; - cvStartReadSeq( ptseq, &reader, 0 ); + } + else if( ptseq->total ) + { + int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2; + cvStartReadSeq( ptseq, &reader, 0 ); + CvPoint pt; + CV_READ_SEQ_ELEM( pt, reader ); + #if CV_SSE4_2 + if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) + { + if( !is_float ) + { + __m128i minval, maxval; + minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y + + for( i = 1; i < ptseq->total; i++) + { + __m128i ptXY = _mm_loadl_epi64((const __m128i*)(reader.ptr)); + CV_NEXT_SEQ_ELEM(sizeof(pt), reader); + minval = _mm_min_epi32(ptXY, minval); + maxval = _mm_max_epi32(ptXY, maxval); + } + xmin = _mm_cvtsi128_si32(minval); + ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); + xmax = _mm_cvtsi128_si32(maxval); + ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); + } + else + { + __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); + minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); - if( !is_float ) - { - CvPoint pt; - /* init values */ - CV_READ_SEQ_ELEM( pt, reader ); - xmin = xmax = pt.x; - ymin = ymax = pt.y; + for( i = 1; i < ptseq->total; i++ ) + { + ptXY = _mm_loadl_pi(ptXY, (const __m64*)reader.ptr); + CV_NEXT_SEQ_ELEM(sizeof(pt), reader); - for( i = 1; i < ptseq->total; i++ ) - { - CV_READ_SEQ_ELEM( pt, reader ); - - if( xmin > pt.x ) - xmin = pt.x; - - if( xmax < pt.x ) - xmax = pt.x; - - if( ymin > pt.y ) - ymin = pt.y; - - if( ymax < pt.y ) - ymax = pt.y; + minvalf = _mm_min_ps(minvalf, ptXY); + maxvalf = _mm_max_ps(maxvalf, ptXY); + } + + float xyminf[2], xymaxf[2]; + _mm_storel_pi((__m64*)xyminf, minvalf); + _mm_storel_pi((__m64*)xymaxf, maxvalf); + xmin = cvFloor(xyminf[0]); + ymin = cvFloor(xyminf[1]); + xmax = cvFloor(xymaxf[0]); + ymax = cvFloor(xymaxf[1]); } - } + } else - { - CvPoint pt; - Cv32suf v; - /* init values */ - CV_READ_SEQ_ELEM( pt, reader ); - xmin = xmax = CV_TOGGLE_FLT(pt.x); - ymin = ymax = CV_TOGGLE_FLT(pt.y); + #endif + { + if( !is_float ) + { + xmin = xmax = pt.x; + ymin = ymax = pt.y; - for( i = 1; i < ptseq->total; i++ ) - { - CV_READ_SEQ_ELEM( pt, reader ); - pt.x = CV_TOGGLE_FLT(pt.x); - pt.y = CV_TOGGLE_FLT(pt.y); + for( i = 1; i < ptseq->total; i++ ) + { + CV_READ_SEQ_ELEM( pt, reader ); - if( xmin > pt.x ) - xmin = pt.x; + if( xmin > pt.x ) + xmin = pt.x; - if( xmax < pt.x ) - xmax = pt.x; + if( xmax < pt.x ) + xmax = pt.x; - if( ymin > pt.y ) - ymin = pt.y; + if( ymin > pt.y ) + ymin = pt.y; - if( ymax < pt.y ) - ymax = pt.y; - } + if( ymax < pt.y ) + ymax = pt.y; + } + } + else + { + Cv32suf v; + // init values + xmin = xmax = CV_TOGGLE_FLT(pt.x); + ymin = ymax = CV_TOGGLE_FLT(pt.y); - v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); - /* because right and bottom sides of - the bounding rectangle are not inclusive - (note +1 in width and height calculation below), - cvFloor is used here instead of cvCeil */ - v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); - v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); - } - } + for( i = 1; i < ptseq->total; i++ ) + { + CV_READ_SEQ_ELEM( pt, reader ); + pt.x = CV_TOGGLE_FLT(pt.x); + pt.y = CV_TOGGLE_FLT(pt.y); - rect.x = xmin; - rect.y = ymin; - rect.width = xmax - xmin + 1; - rect.height = ymax - ymin + 1; + if( xmin > pt.x ) + xmin = pt.x; - if( update ) + if( xmax < pt.x ) + xmax = pt.x; + + if( ymin > pt.y ) + ymin = pt.y; + + if( ymax < pt.y ) + ymax = pt.y; + } + + v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); + // because right and bottom sides of the bounding rectangle are not inclusive + // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil + v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); + v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); + } + } + rect.x = xmin; + rect.y = ymin; + rect.width = xmax - xmin + 1; + rect.height = ymax - ymin + 1; + } + if( update ) ((CvContour*)ptseq)->rect = rect; - return rect; } - /* End of file. */ diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp index 06e89e626..983fdcf39 100644 --- a/modules/objdetect/src/haar.cpp +++ b/modules/objdetect/src/haar.cpp @@ -43,19 +43,26 @@ #include "precomp.hpp" #include - -/*#if CV_SSE2 -# if CV_SSE4 || defined __SSE4__ -# include -# else -# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) -# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) +/* +#if CV_SSE2 +# if !CV_SSE4_1 && !CV_SSE4_2 +# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) +# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) # endif -#if defined CV_ICC -# define CV_HAAR_USE_SSE 1 #endif -#endif*/ +#if defined CV_ICC +# if defined CV_AVX +# define CV_HAAR_USE_AVX 1 +# else +# if defined CV_SSE2 || defined CV_SSE4_1 || defined CV_SSE4_2 +# define CV_HAAR_USE_SSE 1 +# else +# define CV_HAAR_NO_SIMD 1 +# endif +# endif +#endif +*/ /* these settings affect the quality of detection: change with care */ #define CV_ADJUST_FEATURES 1 #define CV_ADJUST_WEIGHTS 0 @@ -730,6 +737,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, { CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; CvHidHaarTreeNode* node = classifier->node; + #ifndef CV_HAAR_USE_SSE double t = node->threshold*variance_norm_factor; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; @@ -745,6 +753,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade, t = _mm_cmpgt_sd(t, sum); stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); #endif + } } else From 1eedcea58d95a8a7485e34a3ee470dadb338877a Mon Sep 17 00:00:00 2001 From: yao Date: Thu, 2 Aug 2012 11:34:11 +0800 Subject: [PATCH 2/9] for test Signed-off-by: yao --- modules/ocl/cl2cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ocl/cl2cpp.py b/modules/ocl/cl2cpp.py index 9c8d9cfc4..4ed410aad 100644 --- a/modules/ocl/cl2cpp.py +++ b/modules/ocl/cl2cpp.py @@ -14,7 +14,7 @@ cl_list = glob.glob(os.path.join(indir, "*.cl")) kfile = open(outname, "wt") kfile.write("""// This file is auto-generated. Do not edit! -//#include "precomp.hpp" + namespace cv { namespace ocl From 8d73bbb8b71843ae7669cc2e38c32cfebb50d2a1 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 2 Aug 2012 13:18:55 +0400 Subject: [PATCH 3/9] fixed 2228 --- modules/gpu/src/cascadeclassifier.cpp | 936 +++++++++++++------------- 1 file changed, 469 insertions(+), 467 deletions(-) diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 61f5c9431..1f277f0cf 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -1,51 +1,51 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or bpied warranties, including, but not limited to, the bpied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include -#include - -using namespace cv; -using namespace cv::gpu; +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include +#include + +using namespace cv; +using namespace cv::gpu; using namespace std; #if !defined (HAVE_CUDA) @@ -94,219 +94,221 @@ public: /*out*/unsigned int& numDetections) { calculateMemReqsAndAllocate(src.size()); - - NCVMemPtr src_beg; - src_beg.ptr = (void*)src.ptr(); - src_beg.memtype = NCVMemoryTypeDevice; - - NCVMemSegment src_seg; - src_seg.begin = src_beg; - src_seg.size = src.step * src.rows; - - NCVMatrixReuse d_src(src_seg, static_cast(devProp.textureAlignment), src.cols, src.rows, static_cast(src.step), true); - ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE); - - CV_Assert(objects.rows == 1); - - NCVMemPtr objects_beg; - objects_beg.ptr = (void*)objects.ptr(); - objects_beg.memtype = NCVMemoryTypeDevice; - - NCVMemSegment objects_seg; - objects_seg.begin = objects_beg; - objects_seg.size = objects.step * objects.rows; - NCVVectorReuse d_rects(objects_seg, objects.cols); - ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE); - - NcvSize32u roi; - roi.width = d_src.width(); - roi.height = d_src.height(); - + + NCVMemPtr src_beg; + src_beg.ptr = (void*)src.ptr(); + src_beg.memtype = NCVMemoryTypeDevice; + + NCVMemSegment src_seg; + src_seg.begin = src_beg; + src_seg.size = src.step * src.rows; + + NCVMatrixReuse d_src(src_seg, static_cast(devProp.textureAlignment), src.cols, src.rows, static_cast(src.step), true); + ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE); + + CV_Assert(objects.rows == 1); + + NCVMemPtr objects_beg; + objects_beg.ptr = (void*)objects.ptr(); + objects_beg.memtype = NCVMemoryTypeDevice; + + NCVMemSegment objects_seg; + objects_seg.begin = objects_beg; + objects_seg.size = objects.step * objects.rows; + NCVVectorReuse d_rects(objects_seg, objects.cols); + ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE); + + NcvSize32u roi; + roi.width = d_src.width(); + roi.height = d_src.height(); + NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height); - Ncv32u flags = 0; - flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0; - flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace : 0; - - ncvStat = ncvDetectObjectsMultiScale_device( - d_src, roi, d_rects, numDetections, haar, *h_haarStages, - *d_haarStages, *d_haarNodes, *d_haarFeatures, + Ncv32u flags = 0; + flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0; + flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace : 0; + + ncvStat = ncvDetectObjectsMultiScale_device( + d_src, roi, d_rects, numDetections, haar, *h_haarStages, + *d_haarStages, *d_haarNodes, *d_haarFeatures, winMinSize, - minNeighbors, - scaleStep, 1, - flags, - *gpuAllocator, *cpuAllocator, devProp, 0); - ncvAssertReturnNcvStat(ncvStat); - ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); - - return NCV_SUCCESS; - } - + minNeighbors, + scaleStep, 1, + flags, + *gpuAllocator, *cpuAllocator, devProp, 0); + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); + + return NCV_SUCCESS; + } + unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors, bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size maxObjectSize) { CV_Assert( scaleFactor > 1 && image.depth() == CV_8U); - + const int defaultObjSearchNum = 100; if (objectsBuf.empty()) { objectsBuf.create(1, defaultObjSearchNum, DataType::type); } - + cv::Size ncvMinSize = this->getClassifierCvSize(); - + if (ncvMinSize.width < (unsigned)minSize.width && ncvMinSize.height < (unsigned)minSize.height) { ncvMinSize.width = minSize.width; ncvMinSize.height = minSize.height; } - + unsigned int numDetections; ncvSafeCall(this->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections)); - + return numDetections; } cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); } - + private: static void NCVDebugOutputHandler(const std::string &msg) { CV_Error(CV_GpuApiCallError, msg.c_str()); } - - NCVStatus load(const string& classifierFile) - { - int devId = cv::gpu::getDevice(); - ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR); - - // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator - gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, static_cast(devProp.textureAlignment)); - cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, static_cast(devProp.textureAlignment)); - - ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR); - ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR); - - Ncv32u haarNumStages, haarNumNodes, haarNumFeatures; - ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures); - ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR); - - h_haarStages = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumStages); - h_haarNodes = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumNodes); - h_haarFeatures = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumFeatures); - - ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); - ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); - ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); - - ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures); - ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR); - - d_haarStages = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumStages); - d_haarNodes = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumNodes); - d_haarFeatures = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumFeatures); - - ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); - ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); - ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); - - ncvStat = h_haarStages->copySolid(*d_haarStages, 0); - ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); - ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0); - ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); - ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0); - ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); - - return NCV_SUCCESS; - } - - NCVStatus calculateMemReqsAndAllocate(const Size& frameSize) - { - if (lastAllocatedFrameSize == frameSize) - { - return NCV_SUCCESS; - } - - // Calculate memory requirements and create real allocators - NCVMemStackAllocator gpuCounter(static_cast(devProp.textureAlignment)); - NCVMemStackAllocator cpuCounter(static_cast(devProp.textureAlignment)); - - ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR); - ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR); - - NCVMatrixAlloc d_src(gpuCounter, frameSize.width, frameSize.height); - NCVMatrixAlloc h_src(cpuCounter, frameSize.width, frameSize.height); - - ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); - ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); - - NCVVectorAlloc d_rects(gpuCounter, 100); - ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); - - NcvSize32u roi; - roi.width = d_src.width(); - roi.height = d_src.height(); - Ncv32u numDetections; - ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages, - *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0); - - ncvAssertReturnNcvStat(ncvStat); - ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); - - gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast(devProp.textureAlignment)); - cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast(devProp.textureAlignment)); - - ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR); - ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR); - return NCV_SUCCESS; - } - - cudaDeviceProp devProp; - NCVStatus ncvStat; - - Ptr gpuCascadeAllocator; - Ptr cpuCascadeAllocator; - - Ptr > h_haarStages; - Ptr > h_haarNodes; - Ptr > h_haarFeatures; - - HaarClassifierCascadeDescriptor haar; - - Ptr > d_haarStages; - Ptr > d_haarNodes; - Ptr > d_haarFeatures; - - Size lastAllocatedFrameSize; - - Ptr gpuAllocator; - Ptr cpuAllocator; + + NCVStatus load(const string& classifierFile) + { + int devId = cv::gpu::getDevice(); + ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR); + + // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator + gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, static_cast(devProp.textureAlignment)); + cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, static_cast(devProp.textureAlignment)); + + ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR); + + Ncv32u haarNumStages, haarNumNodes, haarNumFeatures; + ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR); + + h_haarStages = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumStages); + h_haarNodes = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumNodes); + h_haarFeatures = new NCVVectorAlloc(*cpuCascadeAllocator, haarNumFeatures); + + ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR); + + ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR); + + d_haarStages = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumStages); + d_haarNodes = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumNodes); + d_haarFeatures = new NCVVectorAlloc(*gpuCascadeAllocator, haarNumFeatures); + + ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR); + + ncvStat = h_haarStages->copySolid(*d_haarStages, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); + ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); + ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0); + ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR); + + return NCV_SUCCESS; + } + + NCVStatus calculateMemReqsAndAllocate(const Size& frameSize) + { + if (lastAllocatedFrameSize == frameSize) + { + return NCV_SUCCESS; + } + + // Calculate memory requirements and create real allocators + NCVMemStackAllocator gpuCounter(static_cast(devProp.textureAlignment)); + NCVMemStackAllocator cpuCounter(static_cast(devProp.textureAlignment)); + + ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR); + ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR); + + NCVMatrixAlloc d_src(gpuCounter, frameSize.width, frameSize.height); + NCVMatrixAlloc h_src(cpuCounter, frameSize.width, frameSize.height); + + ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCVVectorAlloc d_rects(gpuCounter, 100); + ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NcvSize32u roi; + roi.width = d_src.width(); + roi.height = d_src.height(); + Ncv32u numDetections; + ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages, + *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0); + + ncvAssertReturnNcvStat(ncvStat); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); + + gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast(devProp.textureAlignment)); + cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast(devProp.textureAlignment)); + + ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR); + ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR); + + lastAllocatedFrameSize = frameSize; + return NCV_SUCCESS; + } + + cudaDeviceProp devProp; + NCVStatus ncvStat; + + Ptr gpuCascadeAllocator; + Ptr cpuCascadeAllocator; + + Ptr > h_haarStages; + Ptr > h_haarNodes; + Ptr > h_haarFeatures; + + HaarClassifierCascadeDescriptor haar; + + Ptr > d_haarStages; + Ptr > d_haarNodes; + Ptr > d_haarFeatures; + + Size lastAllocatedFrameSize; + + Ptr gpuAllocator; + Ptr cpuAllocator; virtual ~HaarCascade(){} -}; - +}; + cv::Size operator -(const cv::Size& a, const cv::Size& b) { return cv::Size(a.width - b.width, a.height - b.height); } - + cv::Size operator +(const cv::Size& a, const int& i) { return cv::Size(a.width + i, a.height + i); } - + cv::Size operator *(const cv::Size& a, const float& f) { return cv::Size(cvRound(a.width * f), cvRound(a.height * f)); } - + cv::Size operator /(const cv::Size& a, const float& f) -{ +{ return cv::Size(cvRound(a.width / f), cvRound(a.height / f)); } bool operator <=(const cv::Size& a, const cv::Size& b) { return a.width <= b.width && a.height <= b.width; -} - +} + struct PyrLavel { PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize) @@ -669,18 +671,18 @@ cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); } void cv::gpu::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } } bool cv::gpu::CascadeClassifier_GPU::empty() const { return impl == 0; } - -Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const -{ - return this->empty() ? Size() : impl->getClassifierCvSize(); -} - -int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize) -{ - CV_Assert( !this->empty()); + +Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const +{ + return this->empty() ? Size() : impl->getClassifierCvSize(); +} + +int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize) +{ + CV_Assert( !this->empty()); return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size()); } - + int cv::gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors) { CV_Assert( !this->empty()); @@ -695,261 +697,261 @@ bool cv::gpu::CascadeClassifier_GPU::load(const string& filename) std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower); if (fext == "nvbin") - { + { impl = new HaarCascade(); return impl->read(filename); - } - + } + FileStorage fs(filename, FileStorage::READ); - + if (!fs.isOpened()) - { + { impl = new HaarCascade(); return impl->read(filename); - } - + } + const char *GPU_CC_LBP = "LBP"; string featureTypeStr = (string)fs.getFirstTopLevelNode()["featureType"]; if (featureTypeStr == GPU_CC_LBP) impl = new LbpCascade(); else impl = new HaarCascade(); - + impl->read(filename); return !this->empty(); -} - +} + ////////////////////////////////////////////////////////////////////////////////////////////////////// - -struct RectConvert -{ - Rect operator()(const NcvRect32u& nr) const { return Rect(nr.x, nr.y, nr.width, nr.height); } - NcvRect32u operator()(const Rect& nr) const - { - NcvRect32u rect; - rect.x = nr.x; - rect.y = nr.y; - rect.width = nr.width; - rect.height = nr.height; - return rect; - } -}; - -void groupRectangles(std::vector &hypotheses, int groupThreshold, double eps, std::vector *weights) -{ - vector rects(hypotheses.size()); - std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert()); - - if (weights) - { - vector weights_int; - weights_int.assign(weights->begin(), weights->end()); - cv::groupRectangles(rects, weights_int, groupThreshold, eps); - } - else - { - cv::groupRectangles(rects, groupThreshold, eps); - } - std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert()); - hypotheses.resize(rects.size()); -} - -NCVStatus loadFromXML(const std::string &filename, - HaarClassifierCascadeDescriptor &haar, - std::vector &haarStages, - std::vector &haarClassifierNodes, - std::vector &haarFeatures) -{ - NCVStatus ncvStat; - - haar.NumStages = 0; - haar.NumClassifierRootNodes = 0; - haar.NumClassifierTotalNodes = 0; - haar.NumFeatures = 0; - haar.ClassifierSize.width = 0; - haar.ClassifierSize.height = 0; - haar.bHasStumpsOnly = true; - haar.bNeedsTiltedII = false; - Ncv32u curMaxTreeDepth; - - std::vector xmlFileCont; - - std::vector h_TmpClassifierNotRootNodes; - haarStages.resize(0); - haarClassifierNodes.resize(0); - haarFeatures.resize(0); - - Ptr oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0); - if (oldCascade.empty()) - { - return NCV_HAAR_XML_LOADING_EXCEPTION; - } - - haar.ClassifierSize.width = oldCascade->orig_window_size.width; - haar.ClassifierSize.height = oldCascade->orig_window_size.height; - - int stagesCound = oldCascade->count; - for(int s = 0; s < stagesCound; ++s) // by stages - { - HaarStage64 curStage; - curStage.setStartClassifierRootNodeOffset(static_cast(haarClassifierNodes.size())); - - curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold); - - int treesCount = oldCascade->stage_classifier[s].count; - for(int t = 0; t < treesCount; ++t) // by trees - { - Ncv32u nodeId = 0; - CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t]; - - int nodesCount = tree->count; - for(int n = 0; n < nodesCount; ++n) //by features - { - CvHaarFeature* feature = &tree->haar_feature[n]; - - HaarClassifierNode128 curNode; - curNode.setThreshold(tree->threshold[n]); - - NcvBool bIsLeftNodeLeaf = false; - NcvBool bIsRightNodeLeaf = false; - - HaarClassifierNodeDescriptor32 nodeLeft; - if ( tree->left[n] <= 0 ) - { - Ncv32f leftVal = tree->alpha[-tree->left[n]]; - ncvStat = nodeLeft.create(leftVal); - ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); - bIsLeftNodeLeaf = true; - } - else - { - Ncv32u leftNodeOffset = tree->left[n]; - nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1)); - haar.bHasStumpsOnly = false; - } - curNode.setLeftNodeDesc(nodeLeft); - - HaarClassifierNodeDescriptor32 nodeRight; - if ( tree->right[n] <= 0 ) - { - Ncv32f rightVal = tree->alpha[-tree->right[n]]; - ncvStat = nodeRight.create(rightVal); - ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); - bIsRightNodeLeaf = true; - } - else - { - Ncv32u rightNodeOffset = tree->right[n]; - nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1)); - haar.bHasStumpsOnly = false; - } - curNode.setRightNodeDesc(nodeRight); - - Ncv32u tiltedVal = feature->tilted; - haar.bNeedsTiltedII = (tiltedVal != 0); - - Ncv32u featureId = 0; - for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects - { - Ncv32u rectX = feature->rect[l].r.x; - Ncv32u rectY = feature->rect[l].r.y; - Ncv32u rectWidth = feature->rect[l].r.width; - Ncv32u rectHeight = feature->rect[l].r.height; - - Ncv32f rectWeight = feature->rect[l].weight; - - if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/) - break; - - HaarFeature64 curFeature; - ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height); - curFeature.setWeight(rectWeight); - ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); - haarFeatures.push_back(curFeature); - - featureId++; - } - - HaarFeatureDescriptor32 tmpFeatureDesc; - ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf, - featureId, static_cast(haarFeatures.size()) - featureId); - ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); - curNode.setFeatureDesc(tmpFeatureDesc); - - if (!nodeId) - { - //root node - haarClassifierNodes.push_back(curNode); - curMaxTreeDepth = 1; - } - else - { - //other node - h_TmpClassifierNotRootNodes.push_back(curNode); - curMaxTreeDepth++; - } - - nodeId++; - } - } - - curStage.setNumClassifierRootNodes(treesCount); - haarStages.push_back(curStage); - } - - //fill in cascade stats - haar.NumStages = static_cast(haarStages.size()); - haar.NumClassifierRootNodes = static_cast(haarClassifierNodes.size()); - haar.NumClassifierTotalNodes = static_cast(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size()); - haar.NumFeatures = static_cast(haarFeatures.size()); - - //merge root and leaf nodes in one classifiers array - Ncv32u offsetRoot = static_cast(haarClassifierNodes.size()); - for (Ncv32u i=0; i &hypotheses, int groupThreshold, double eps, std::vector *weights) +{ + vector rects(hypotheses.size()); + std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert()); + + if (weights) + { + vector weights_int; + weights_int.assign(weights->begin(), weights->end()); + cv::groupRectangles(rects, weights_int, groupThreshold, eps); + } + else + { + cv::groupRectangles(rects, groupThreshold, eps); + } + std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert()); + hypotheses.resize(rects.size()); +} + +NCVStatus loadFromXML(const std::string &filename, + HaarClassifierCascadeDescriptor &haar, + std::vector &haarStages, + std::vector &haarClassifierNodes, + std::vector &haarFeatures) +{ + NCVStatus ncvStat; + + haar.NumStages = 0; + haar.NumClassifierRootNodes = 0; + haar.NumClassifierTotalNodes = 0; + haar.NumFeatures = 0; + haar.ClassifierSize.width = 0; + haar.ClassifierSize.height = 0; + haar.bHasStumpsOnly = true; + haar.bNeedsTiltedII = false; + Ncv32u curMaxTreeDepth; + + std::vector xmlFileCont; + + std::vector h_TmpClassifierNotRootNodes; + haarStages.resize(0); + haarClassifierNodes.resize(0); + haarFeatures.resize(0); + + Ptr oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0); + if (oldCascade.empty()) + { + return NCV_HAAR_XML_LOADING_EXCEPTION; + } + + haar.ClassifierSize.width = oldCascade->orig_window_size.width; + haar.ClassifierSize.height = oldCascade->orig_window_size.height; + + int stagesCound = oldCascade->count; + for(int s = 0; s < stagesCound; ++s) // by stages + { + HaarStage64 curStage; + curStage.setStartClassifierRootNodeOffset(static_cast(haarClassifierNodes.size())); + + curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold); + + int treesCount = oldCascade->stage_classifier[s].count; + for(int t = 0; t < treesCount; ++t) // by trees + { + Ncv32u nodeId = 0; + CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t]; + + int nodesCount = tree->count; + for(int n = 0; n < nodesCount; ++n) //by features + { + CvHaarFeature* feature = &tree->haar_feature[n]; + + HaarClassifierNode128 curNode; + curNode.setThreshold(tree->threshold[n]); + + NcvBool bIsLeftNodeLeaf = false; + NcvBool bIsRightNodeLeaf = false; + + HaarClassifierNodeDescriptor32 nodeLeft; + if ( tree->left[n] <= 0 ) + { + Ncv32f leftVal = tree->alpha[-tree->left[n]]; + ncvStat = nodeLeft.create(leftVal); + ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); + bIsLeftNodeLeaf = true; + } + else + { + Ncv32u leftNodeOffset = tree->left[n]; + nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1)); + haar.bHasStumpsOnly = false; + } + curNode.setLeftNodeDesc(nodeLeft); + + HaarClassifierNodeDescriptor32 nodeRight; + if ( tree->right[n] <= 0 ) + { + Ncv32f rightVal = tree->alpha[-tree->right[n]]; + ncvStat = nodeRight.create(rightVal); + ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); + bIsRightNodeLeaf = true; + } + else + { + Ncv32u rightNodeOffset = tree->right[n]; + nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1)); + haar.bHasStumpsOnly = false; + } + curNode.setRightNodeDesc(nodeRight); + + Ncv32u tiltedVal = feature->tilted; + haar.bNeedsTiltedII = (tiltedVal != 0); + + Ncv32u featureId = 0; + for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects + { + Ncv32u rectX = feature->rect[l].r.x; + Ncv32u rectY = feature->rect[l].r.y; + Ncv32u rectWidth = feature->rect[l].r.width; + Ncv32u rectHeight = feature->rect[l].r.height; + + Ncv32f rectWeight = feature->rect[l].weight; + + if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/) + break; + + HaarFeature64 curFeature; + ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height); + curFeature.setWeight(rectWeight); + ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); + haarFeatures.push_back(curFeature); + + featureId++; + } + + HaarFeatureDescriptor32 tmpFeatureDesc; + ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf, + featureId, static_cast(haarFeatures.size()) - featureId); + ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); + curNode.setFeatureDesc(tmpFeatureDesc); + + if (!nodeId) + { + //root node + haarClassifierNodes.push_back(curNode); + curMaxTreeDepth = 1; + } + else + { + //other node + h_TmpClassifierNotRootNodes.push_back(curNode); + curMaxTreeDepth++; + } + + nodeId++; + } + } + + curStage.setNumClassifierRootNodes(treesCount); + haarStages.push_back(curStage); + } + + //fill in cascade stats + haar.NumStages = static_cast(haarStages.size()); + haar.NumClassifierRootNodes = static_cast(haarClassifierNodes.size()); + haar.NumClassifierTotalNodes = static_cast(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size()); + haar.NumFeatures = static_cast(haarFeatures.size()); + + //merge root and leaf nodes in one classifiers array + Ncv32u offsetRoot = static_cast(haarClassifierNodes.size()); + for (Ncv32u i=0; i Date: Thu, 2 Aug 2012 16:25:30 +0400 Subject: [PATCH 4/9] parallel version of bilateral filter was implemented using parallel_for_ --- .../include/opencv2/core/parallel_tool.hpp | 108 +++++++ modules/core/src/parallel_tool.cpp | 112 +++++++ modules/core/src/precomp.hpp | 1 + modules/imgproc/perf/perf_bilateral.cpp | 38 +++ modules/imgproc/src/precomp.hpp | 1 + modules/imgproc/src/smooth.cpp | 254 +++++++++------ .../imgproc/test/test_bilateral_filter.cpp | 290 ++++++++++++++++++ 7 files changed, 706 insertions(+), 98 deletions(-) create mode 100644 modules/core/include/opencv2/core/parallel_tool.hpp create mode 100644 modules/core/src/parallel_tool.cpp create mode 100644 modules/imgproc/perf/perf_bilateral.cpp create mode 100644 modules/imgproc/test/test_bilateral_filter.cpp diff --git a/modules/core/include/opencv2/core/parallel_tool.hpp b/modules/core/include/opencv2/core/parallel_tool.hpp new file mode 100644 index 000000000..08258d5c2 --- /dev/null +++ b/modules/core/include/opencv2/core/parallel_tool.hpp @@ -0,0 +1,108 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_PARALLEL_TOOL_HPP__ +#define __OPENCV_PARALLEL_TOOL_HPP__ + +#ifdef HAVE_CVCONFIG_H +# include +#endif // HAVE_CVCONFIG_H + +/* + HAVE_TBB - using TBB + HAVE_GCD - using GCD + HAVE_OPENMP - using OpenMP + HAVE_CONCURRENCY - using visual studio 2010 concurrency +*/ + +#ifdef HAVE_TBB +# include "tbb/tbb_stddef.h" +# if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202 +# include "tbb/tbb.h" +# include "tbb/task.h" +# undef min +# undef max +# else +# undef HAVE_TBB +# endif // end TBB version +#endif // HAVE_TBB + +#ifdef __cplusplus + +namespace cv +{ + // a base body class + class CV_EXPORTS ParallelLoopBody + { + public: + virtual void operator() (const Range& range) const = 0; + virtual ~ParallelLoopBody(); + }; + + CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body); + + template inline + CV_EXPORTS void parallel_do_(Iterator first, Iterator last, const Body& body) + { +#ifdef HAVE_TBB + tbb::parallel_do(first, last, body); +#else + for ( ; first != last; ++first) + body(*first); +#endif // HAVE_TBB + } + + template inline + CV_EXPORTS void parallel_reduce_(const Range& range, Body& body) + { +#ifdef HAVE_TBB + tbb::parallel_reduce(tbb::blocked_range(range.start, range.end), body); +#else + body(range); +#endif // end HAVE_TBB + } + +} // namespace cv + +#endif // __cplusplus + +#endif // __OPENCV_PARALLEL_TOOL_HPP__ diff --git a/modules/core/src/parallel_tool.cpp b/modules/core/src/parallel_tool.cpp new file mode 100644 index 000000000..423d4787d --- /dev/null +++ b/modules/core/src/parallel_tool.cpp @@ -0,0 +1,112 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#ifdef HAVE_CONCURRENCY +# include +#elif defined HAVE_OPENMP +# include +#elif defined HAVE_GCD +# include +#endif // HAVE_CONCURRENCY + +namespace cv +{ + ParallelLoopBody::~ParallelLoopBody() { } + +#ifdef HAVE_TBB + class TbbProxyLoopBody + { + public: + TbbProxyLoopBody(const ParallelLoopBody& _body) : + body(&_body) + { } + + void operator ()(const tbb::blocked_range& range) const + { + body->operator()(Range(range.begin(), range.end())); + } + + private: + const ParallelLoopBody* body; + }; +#endif // end HAVE_TBB + +#ifdef HAVE_GCD + static + void block_function(void* context, size_t index) + { + ParallelLoopBody* ptr_body = static_cast(context); + ptr_body->operator()(Range(index, index + 1)); + } +#endif // HAVE_GCD + + void parallel_for_(const Range& range, const ParallelLoopBody& body) + { +#ifdef HAVE_TBB + + tbb::parallel_for(tbb::blocked_range(range.start, range.end), TbbProxyLoopBody(body)); + +#elif defined HAVE_CONCURRENCY + + Concurrency::parallel_for(range.start, range.end, body); + +#elif defined HAVE_OPENMP + +#pragma omp parallel for schedule(dynamic) + for (int i = range.start; i < range.end; ++i) + body(Range(i, i + 1)); + +#elif defined (HAVE_GCD) + + dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); + dispatch_apply_f(range.end - range.start, concurrent_queue, &const_cast(body), block_function); + +#else + + body(range); + +#endif // end HAVE_TBB + } + +} // namespace cv diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 81b9d6e80..60429075a 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -50,6 +50,7 @@ #include "opencv2/core/core.hpp" #include "opencv2/core/core_c.h" #include "opencv2/core/internal.hpp" +#include "opencv2/core/parallel_tool.hpp" #include #include diff --git a/modules/imgproc/perf/perf_bilateral.cpp b/modules/imgproc/perf/perf_bilateral.cpp new file mode 100644 index 000000000..85cfc7d0c --- /dev/null +++ b/modules/imgproc/perf/perf_bilateral.cpp @@ -0,0 +1,38 @@ +#include "perf_precomp.hpp" + +using namespace std; +using namespace cv; +using namespace perf; +using namespace testing; +using std::tr1::make_tuple; +using std::tr1::get; + +CV_ENUM(Mat_Type, CV_8UC1, CV_8UC3, CV_32FC1, CV_32FC3) + +typedef TestBaseWithParam< tr1::tuple > TestBilateralFilter; + +PERF_TEST_P( TestBilateralFilter, BilateralFilter, + Combine( + Values( szVGA, sz1080p ), // image size + Values( 3, 5 ), // d + ValuesIn( Mat_Type::all() ) // image type + ) +) +{ + Size sz; + int d, type; + const double sigmaColor = 1., sigmaSpace = 1.; + + sz = get<0>(GetParam()); + d = get<1>(GetParam()); + type = get<2>(GetParam()); + + Mat src(sz, type); + Mat dst(sz, type); + + declare.in(src, WARMUP_RNG).out(dst).time(20); + + TEST_CYCLE() bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, BORDER_DEFAULT); + + SANITY_CHECK(dst); +} diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp index fef5f755b..998008ae2 100644 --- a/modules/imgproc/src/precomp.hpp +++ b/modules/imgproc/src/precomp.hpp @@ -50,6 +50,7 @@ #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/imgproc/imgproc_c.h" #include "opencv2/core/internal.hpp" +#include "opencv2/core/parallel_tool.hpp" #include #include #include diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index faec530b8..1bc11c7fc 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1288,48 +1288,119 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize ) namespace cv { +class BilateralFilter_8u_Invoker : + public ParallelLoopBody +{ +public: + BilateralFilter_8u_Invoker(const Mat &_src, Mat& _dst, Mat _temp, int _radius, int _maxk, + int* _space_ofs, float *_space_weight, float *_color_weight) : + ParallelLoopBody(), src(_src), dst(_dst), temp(_temp), radius(_radius), + maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight) + { + } + + virtual void operator() (const Range& range) const + { + int i, j, cn = src.channels(), k; + Size size = src.size(); + + for( i = range.start; i < range.end; i++ ) + { + const uchar* sptr = temp.data + (i+radius)*temp.step + radius*cn; + uchar* dptr = dst.data + i*dst.step; + + if( cn == 1 ) + { + for( j = 0; j < size.width; j++ ) + { + float sum = 0, wsum = 0; + int val0 = sptr[j]; + for( k = 0; k < maxk; k++ ) + { + int val = sptr[j + space_ofs[k]]; + float w = space_weight[k]*color_weight[std::abs(val - val0)]; + sum += val*w; + wsum += w; + } + // overflow is not possible here => there is no need to use CV_CAST_8U + dptr[j] = (uchar)cvRound(sum/wsum); + } + } + else + { + assert( cn == 3 ); + for( j = 0; j < size.width*3; j += 3 ) + { + float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; + int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; + for( k = 0; k < maxk; k++ ) + { + const uchar* sptr_k = sptr + j + space_ofs[k]; + int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; + float w = space_weight[k]*color_weight[std::abs(b - b0) + + std::abs(g - g0) + std::abs(r - r0)]; + sum_b += b*w; sum_g += g*w; sum_r += r*w; + wsum += w; + } + wsum = 1.f/wsum; + b0 = cvRound(sum_b*wsum); + g0 = cvRound(sum_g*wsum); + r0 = cvRound(sum_r*wsum); + dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0; + } + } + } + } + +private: + const Mat& src; + Mat &dst, temp; + int radius, maxk, * space_ofs; + float *space_weight, *color_weight; +}; + static void bilateralFilter_8u( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) + double sigma_color, double sigma_space, + int borderType ) { int cn = src.channels(); - int i, j, k, maxk, radius; + int i, j, maxk, radius; Size size = src.size(); - + CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && - src.type() == dst.type() && src.size() == dst.size() && - src.data != dst.data ); - + src.type() == dst.type() && src.size() == dst.size() && + src.data != dst.data ); + if( sigma_color <= 0 ) sigma_color = 1; if( sigma_space <= 0 ) sigma_space = 1; - + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - + if( d <= 0 ) radius = cvRound(sigma_space*1.5); else radius = d/2; radius = MAX(radius, 1); d = radius*2 + 1; - + Mat temp; copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - + vector _color_weight(cn*256); vector _space_weight(d*d); vector _space_ofs(d*d); float* color_weight = &_color_weight[0]; float* space_weight = &_space_weight[0]; int* space_ofs = &_space_ofs[0]; - + // initialize color-related bilateral filter coefficients for( i = 0; i < 256*cn; i++ ) color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); - + // initialize space-related bilateral filter coefficients for( i = -radius, maxk = 0; i <= radius; i++ ) for( j = -radius; j <= radius; j++ ) @@ -1340,55 +1411,89 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); space_ofs[maxk++] = (int)(i*temp.step + j*cn); } + + BilateralFilter_8u_Invoker body(src, dst, temp, radius, maxk, space_ofs, space_weight, color_weight); + parallel_for_(Range(0, size.height), body); +} - for( i = 0; i < size.height; i++ ) + +class BilateralFilter_32f_Invoker : + public ParallelLoopBody +{ +public: + + BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs, + Mat _temp, Mat *_dest, Size _size, + float _scale_index, float *_space_weight, float *_expLUT) : + ParallelLoopBody(), cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), + temp(_temp), dest(_dest), size(_size), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT) { - const uchar* sptr = temp.data + (i+radius)*temp.step + radius*cn; - uchar* dptr = dst.data + i*dst.step; + } - if( cn == 1 ) + virtual void operator() (const Range& range) const + { + Mat& dst = *dest; + int i, j, k; + + for( i = range.start; i < range.end; i++ ) { - for( j = 0; j < size.width; j++ ) + const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn; + float* dptr = (float*)(dst.data + i*dst.step); + + if( cn == 1 ) { - float sum = 0, wsum = 0; - int val0 = sptr[j]; - for( k = 0; k < maxk; k++ ) + for( j = 0; j < size.width; j++ ) { - int val = sptr[j + space_ofs[k]]; - float w = space_weight[k]*color_weight[std::abs(val - val0)]; - sum += val*w; - wsum += w; + float sum = 0, wsum = 0; + float val0 = sptr[j]; + for( k = 0; k < maxk; k++ ) + { + float val = sptr[j + space_ofs[k]]; + float alpha = (float)(std::abs(val - val0)*scale_index); + int idx = cvFloor(alpha); + alpha -= idx; + float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); + sum += val*w; + wsum += w; + } + dptr[j] = (float)(sum/wsum); } - // overflow is not possible here => there is no need to use CV_CAST_8U - dptr[j] = (uchar)cvRound(sum/wsum); } - } - else - { - assert( cn == 3 ); - for( j = 0; j < size.width*3; j += 3 ) + else { - float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; - int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; - for( k = 0; k < maxk; k++ ) + assert( cn == 3 ); + for( j = 0; j < size.width*3; j += 3 ) { - const uchar* sptr_k = sptr + j + space_ofs[k]; - int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; - float w = space_weight[k]*color_weight[std::abs(b - b0) + - std::abs(g - g0) + std::abs(r - r0)]; - sum_b += b*w; sum_g += g*w; sum_r += r*w; - wsum += w; + float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; + float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; + for( k = 0; k < maxk; k++ ) + { + const float* sptr_k = sptr + j + space_ofs[k]; + float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; + float alpha = (float)((std::abs(b - b0) + + std::abs(g - g0) + std::abs(r - r0))*scale_index); + int idx = cvFloor(alpha); + alpha -= idx; + float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); + sum_b += b*w; sum_g += g*w; sum_r += r*w; + wsum += w; + } + wsum = 1.f/wsum; + b0 = sum_b*wsum; + g0 = sum_g*wsum; + r0 = sum_r*wsum; + dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0; } - wsum = 1.f/wsum; - b0 = cvRound(sum_b*wsum); - g0 = cvRound(sum_g*wsum); - r0 = cvRound(sum_r*wsum); - dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0; } } } -} +private: + int cn, radius, maxk, *space_ofs; + Mat temp, *dest; + Size size; + float scale_index, *space_weight, *expLUT; +}; static void bilateralFilter_32f( const Mat& src, Mat& dst, int d, @@ -1396,7 +1501,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, int borderType ) { int cn = src.channels(); - int i, j, k, maxk, radius; + int i, j, maxk, radius; double minValSrc=-1, maxValSrc=1; const int kExpNumBinsPerChannel = 1 << 12; int kExpNumBins = 0; @@ -1474,57 +1579,10 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); } - for( i = 0; i < size.height; i++ ) - { - const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn; - float* dptr = (float*)(dst.data + i*dst.step); + // parallel_for usage - if( cn == 1 ) - { - for( j = 0; j < size.width; j++ ) - { - float sum = 0, wsum = 0; - float val0 = sptr[j]; - for( k = 0; k < maxk; k++ ) - { - float val = sptr[j + space_ofs[k]]; - float alpha = (float)(std::abs(val - val0)*scale_index); - int idx = cvFloor(alpha); - alpha -= idx; - float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); - sum += val*w; - wsum += w; - } - dptr[j] = (float)(sum/wsum); - } - } - else - { - assert( cn == 3 ); - for( j = 0; j < size.width*3; j += 3 ) - { - float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; - float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; - for( k = 0; k < maxk; k++ ) - { - const float* sptr_k = sptr + j + space_ofs[k]; - float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; - float alpha = (float)((std::abs(b - b0) + - std::abs(g - g0) + std::abs(r - r0))*scale_index); - int idx = cvFloor(alpha); - alpha -= idx; - float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); - sum_b += b*w; sum_g += g*w; sum_r += r*w; - wsum += w; - } - wsum = 1.f/wsum; - b0 = sum_b*wsum; - g0 = sum_g*wsum; - r0 = sum_r*wsum; - dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0; - } - } - } + BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, &dst, size, scale_index, space_weight, expLUT); + parallel_for_(Range(0, size.height), body); } } diff --git a/modules/imgproc/test/test_bilateral_filter.cpp b/modules/imgproc/test/test_bilateral_filter.cpp new file mode 100644 index 000000000..034f9c363 --- /dev/null +++ b/modules/imgproc/test/test_bilateral_filter.cpp @@ -0,0 +1,290 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "test_precomp.hpp" + +using namespace cv; + +namespace cvtest +{ + class CV_BilateralFilterTest : + public cvtest::BaseTest + { + public: + enum + { + MAX_WIDTH = 1920, MIN_WIDTH = 1, + MAX_HEIGHT = 1080, MIN_HEIGHT = 1 + }; + + CV_BilateralFilterTest(); + ~CV_BilateralFilterTest(); + + protected: + virtual void run_func(); + virtual int prepare_test_case(int test_case_index); + virtual int validate_test_results(int test_case_index); + + private: + void reference_bilateral_filter(const Mat& src, Mat& dst, int d, double sigma_color, + double sigma_space, int borderType = BORDER_DEFAULT); + + int getRandInt(RNG& rng, int min_value, int max_value) const; + + double _sigma_color; + double _sigma_space; + + Mat _src; + Mat _parallel_dst; + int _d; + }; + + CV_BilateralFilterTest::CV_BilateralFilterTest() : + cvtest::BaseTest(), _src(), _parallel_dst(), _d() + { + test_case_count = 1000; + } + + CV_BilateralFilterTest::~CV_BilateralFilterTest() + { + } + + int CV_BilateralFilterTest::getRandInt(RNG& rng, int min_value, int max_value) const + { + double rand_value = rng.uniform(log(min_value), log(max_value + 1)); + return cvRound(exp(rand_value)); + } + + void CV_BilateralFilterTest::reference_bilateral_filter(const Mat &src, Mat &dst, int d, + double sigma_color, double sigma_space, int borderType) + { + int cn = src.channels(); + int i, j, k, maxk, radius; + double minValSrc = -1, maxValSrc = 1; + const int kExpNumBinsPerChannel = 1 << 12; + int kExpNumBins = 0; + float lastExpVal = 1.f; + float len, scale_index; + Size size = src.size(); + + dst.create(size, src.type()); + + CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && + src.type() == dst.type() && src.size() == dst.size() && + src.data != dst.data ); + + if( sigma_color <= 0 ) + sigma_color = 1; + if( sigma_space <= 0 ) + sigma_space = 1; + + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + + if( d <= 0 ) + radius = cvRound(sigma_space*1.5); + else + radius = d/2; + radius = MAX(radius, 1); + d = radius*2 + 1; + // compute the min/max range for the input image (even if multichannel) + + minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc ); + if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) + { + src.copyTo(dst); + return; + } + + // temporary copy of the image with borders for easy processing + Mat temp; + copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); + patchNaNs(temp); + + // allocate lookup tables + vector _space_weight(d*d); + vector _space_ofs(d*d); + float* space_weight = &_space_weight[0]; + int* space_ofs = &_space_ofs[0]; + + // assign a length which is slightly more than needed + len = (float)(maxValSrc - minValSrc) * cn; + kExpNumBins = kExpNumBinsPerChannel * cn; + vector _expLUT(kExpNumBins+2); + float* expLUT = &_expLUT[0]; + + scale_index = kExpNumBins/len; + + // initialize the exp LUT + for( i = 0; i < kExpNumBins+2; i++ ) + { + if( lastExpVal > 0.f ) + { + double val = i / scale_index; + expLUT[i] = (float)std::exp(val * val * gauss_color_coeff); + lastExpVal = expLUT[i]; + } + else + expLUT[i] = 0.f; + } + + // initialize space-related bilateral filter coefficients + for( i = -radius, maxk = 0; i <= radius; i++ ) + for( j = -radius; j <= radius; j++ ) + { + double r = std::sqrt((double)i*i + (double)j*j); + if( r > radius ) + continue; + space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); + space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); + } + + for( i = 0; i < size.height; i++ ) + { + const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn; + float* dptr = (float*)(dst.data + i*dst.step); + + if( cn == 1 ) + { + for( j = 0; j < size.width; j++ ) + { + float sum = 0, wsum = 0; + float val0 = sptr[j]; + for( k = 0; k < maxk; k++ ) + { + float val = sptr[j + space_ofs[k]]; + float alpha = (float)(std::abs(val - val0)*scale_index); + int idx = cvFloor(alpha); + alpha -= idx; + float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); + sum += val*w; + wsum += w; + } + dptr[j] = (float)(sum/wsum); + } + } + else + { + assert( cn == 3 ); + for( j = 0; j < size.width*3; j += 3 ) + { + float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; + float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; + for( k = 0; k < maxk; k++ ) + { + const float* sptr_k = sptr + j + space_ofs[k]; + float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; + float alpha = (float)((std::abs(b - b0) + + std::abs(g - g0) + std::abs(r - r0))*scale_index); + int idx = cvFloor(alpha); + alpha -= idx; + float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); + sum_b += b*w; sum_g += g*w; sum_r += r*w; + wsum += w; + } + wsum = 1.f/wsum; + b0 = sum_b*wsum; + g0 = sum_g*wsum; + r0 = sum_r*wsum; + dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0; + } + } + } + } + + int CV_BilateralFilterTest::prepare_test_case(int /* test_case_index */) + { + const static int types[] = { CV_32FC1, CV_32FC3, CV_8UC1, CV_8UC3 }; + RNG& rng = ts->get_rng(); + Size size(getRandInt(rng, MIN_WIDTH, MAX_WIDTH), getRandInt(rng, MIN_HEIGHT, MAX_HEIGHT)); + int type = types[rng(sizeof(types) / sizeof(types[0]))]; + + _d = rng.uniform(0., 1.) > 0.5 ? 5 : 3; + + _src.create(size, type); + + rng.fill(_src, RNG::UNIFORM, 0, 256); + + _sigma_color = _sigma_space = 1.; + + return 1; + } + + int CV_BilateralFilterTest::validate_test_results(int test_case_index) + { + static const double eps = 1; + + Mat reference_dst, reference_src; + if (_src.depth() == CV_32F) + reference_bilateral_filter(_src, reference_dst, _d, _sigma_color, _sigma_space); + else + { + int type = _src.type(); + _src.convertTo(reference_src, CV_32F); + reference_bilateral_filter(reference_src, reference_dst, _d, _sigma_color, _sigma_space); + reference_dst.convertTo(reference_dst, type); + } + + double e = norm(reference_dst, _parallel_dst); + if (e > eps) + { + ts->printf(cvtest::TS::CONSOLE, "actual error: %g, expected: %g", e, eps); + ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY); + } + else + ts->set_failed_test_info(cvtest::TS::OK); + + return BaseTest::validate_test_results(test_case_index); + } + + void CV_BilateralFilterTest::run_func() + { + bilateralFilter(_src, _parallel_dst, _d, _sigma_color, _sigma_space); + } + + TEST(Imgproc_BilateralFilter, accuracy) + { + CV_BilateralFilterTest test; + test.safe_run(); + } + +} // end of namespace cvtest From 8eeacc8cc87c052bf10cfba915d2818eaefb48dc Mon Sep 17 00:00:00 2001 From: niko Date: Fri, 3 Aug 2012 14:08:36 +0800 Subject: [PATCH 5/9] performance and bug fix for addWeighted cartToPolar div exp log resize setTo add channel 3 support add fast way Between CPU and GPU for the data which is aligned --- .../include/opencv2/ocl/matrix_operations.hpp | 2 +- modules/ocl/include/opencv2/ocl/ocl.hpp | 1 + modules/ocl/src/arithm.cpp | 102 ++-- modules/ocl/src/imgproc.cpp | 34 +- modules/ocl/src/initialization.cpp | 26 +- modules/ocl/src/kernels/arithm_addWeighted.cl | 36 +- modules/ocl/src/kernels/arithm_cartToPolar.cl | 4 + modules/ocl/src/kernels/arithm_div.cl | 131 +++-- modules/ocl/src/kernels/arithm_exp.cl | 4 + modules/ocl/src/kernels/arithm_log.cl | 4 +- modules/ocl/src/kernels/convertC3C4.cl | 173 +++--- modules/ocl/src/kernels/imgproc_resize.cl | 209 ++++--- modules/ocl/src/kernels/operator_setTo.cl | 60 +- modules/ocl/src/kernels/operator_setToM.cl | 114 +--- modules/ocl/src/matrix_operations.cpp | 515 ++++++++++++++++-- modules/ocl/src/precomp.hpp | 6 +- modules/ocl/test/test_imgproc.cpp | 2 +- modules/ocl/test/test_matrix_operation.cpp | 100 +++- 18 files changed, 994 insertions(+), 529 deletions(-) diff --git a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp index e90da2bc9..3d75e14b3 100644 --- a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp +++ b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp @@ -49,7 +49,7 @@ namespace cv namespace ocl { ////////////////////////////////////OpenCL kernel strings////////////////////////// - extern const char *convertC3C4; + //extern const char *convertC3C4; //////////////////////////////////////////////////////////////////////// //////////////////////////////// oclMat //////////////////////////////// diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 752b554bd..0efc72283 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -49,6 +49,7 @@ #include "opencv2/core/core.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/objdetect/objdetect.hpp" +#include "opencv2/features2d/features2d.hpp" namespace cv { diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index dba7778b1..d70946715 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -455,13 +455,12 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub } void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) { - if(src1.clCxt -> impl -> double_support ==0) - { - CV_Error(-217,"Selected device don't support double\r\n"); - return; - } - arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); + if(src1.clCxt -> impl -> double_support !=0) + arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); + else + arithmetic_run(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); + } template void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar) @@ -579,7 +578,14 @@ void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, co args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - args.push_back( make_pair( sizeof(cl_double), (void *)&scalar )); + + if(src.clCxt -> impl -> double_support !=0) + args.push_back( make_pair( sizeof(cl_double), (void *)&scalar )); + else + { + float f_scalar = (float)scalar; + args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar)); + } openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); } @@ -670,9 +676,9 @@ void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string ker int cols = divUp(dst.cols + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(dst.rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(dst.rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); @@ -1253,7 +1259,11 @@ void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, c CV_Assert( src.type() == CV_32F || src.type() == CV_64F); Context *clCxt = src.clCxt; - + if(clCxt -> impl -> double_support ==0 && src.type() == CV_64F) + { + CV_Error(-217,"Selected device don't support double\r\n"); + return; + } //int channels = dst.channels(); int depth = dst.depth(); @@ -2193,56 +2203,46 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, size_t localThreads[3] = { 256, 1, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(dst.rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(dst.rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; - if(sizeof(double) == 8) + args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); + + if(src1.clCxt -> impl -> double_support != 0) { - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_double), (void *)&alpha )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); args.push_back( make_pair( sizeof(cl_double), (void *)&beta )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); args.push_back( make_pair( sizeof(cl_double), (void *)&gama )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); } else { - - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_float), (void *)&alpha )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); args.push_back( make_pair( sizeof(cl_float), (void *)&beta )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset)); args.push_back( make_pair( sizeof(cl_float), (void *)&gama )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); - } + } + + args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth); } void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst) { CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() && - (src1.depth() == CV_32F )); + (src1.depth() == CV_32F )); dst.create(src1.size(), src1.type()); @@ -2265,9 +2265,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst) size_t localThreads[3] = { 256, 1, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(dst.rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(dst.rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -2313,9 +2313,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, oclMat &dst) size_t localThreads[3] = { 256, 1, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(dst.rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(dst.rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -2348,9 +2348,9 @@ void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernel size_t localThreads[3] = { 64, 4, 1 }; size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(rows, localThreads[1]) * localThreads[1], - 1 - }; + divUp(rows, localThreads[1]) * localThreads[1], + 1 + }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index fd07df5cd..7617c08c5 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -410,7 +410,11 @@ namespace cv float ify = 1. / fy; double ifx_d = 1. / fx; double ify_d = 1. / fy; - + int srcStep_in_pixel = src.step1() / src.channels(); + int srcoffset_in_pixel = src.offset / src.elemSize(); + int dstStep_in_pixel = dst.step1() / dst.channels(); + int dstoffset_in_pixel = dst.offset / dst.elemSize(); + //printf("%d %d\n",src.step1() , dst.elemSize()); string kernelName; if(interpolation == INTER_LINEAR) kernelName = "resizeLN"; @@ -438,25 +442,33 @@ namespace cv { args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d)); - args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d)); + if(src.clCxt -> impl -> double_support != 0) + { + args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d)); + args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d)); + } + else + { + args.push_back( make_pair(sizeof(cl_float), (void *)&ifx)); + args.push_back( make_pair(sizeof(cl_float), (void *)&ify)); + } } else { args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src.step)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel)); + args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index feff1db83..61e7177ae 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -378,20 +378,36 @@ namespace cv void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, const void *src, size_t spitch, - size_t width, size_t height, enum openCLMemcpyKind kind) + size_t width, size_t height, enum openCLMemcpyKind kind, int channels) { size_t buffer_origin[3] = {0, 0, 0}; size_t host_origin[3] = {0, 0, 0}; size_t region[3] = {width, height, 1}; if(kind == clMemcpyHostToDevice) { - openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, - buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0)); + if(dpitch == width || channels==3) + { + openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, + 0, width*height, src, 0, NULL, NULL)); + } + else + { + openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, + buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0)); + } } else if(kind == clMemcpyDeviceToHost) { - openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, - buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0)); + if(spitch == width || channels==3) + { + openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, + 0, width*height, dst, 0, NULL, NULL)); + } + else + { + openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, + buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0)); + } } } diff --git a/modules/ocl/src/kernels/arithm_addWeighted.cl b/modules/ocl/src/kernels/arithm_addWeighted.cl index a34fd8d85..434010068 100644 --- a/modules/ocl/src/kernels/arithm_addWeighted.cl +++ b/modules/ocl/src/kernels/arithm_addWeighted.cl @@ -51,9 +51,9 @@ typedef float F; ////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////addWeighted////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void addWeighted_D0 (__global uchar *src1, F alpha,int src1_step,int src1_offset, - __global uchar *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset, + __global uchar *src2, int src2_step,int src2_offset, + F alpha,F beta,F gama, __global uchar *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -99,9 +99,9 @@ __kernel void addWeighted_D0 (__global uchar *src1, F alpha,int src1_step,int sr -__kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int src1_offset, - __global ushort *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset, + __global ushort *src2, int src2_step,int src2_offset, + F alpha,F beta,F gama, __global ushort *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -145,9 +145,9 @@ __kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int s } -__kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int src1_offset, - __global short *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset, + __global short *src2, int src2_step,int src2_offset, + F alpha,F beta,F gama, __global short *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -190,9 +190,9 @@ __kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int sr } -__kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1_offset, - __global int *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, + __global int *src2, int src2_step,int src2_offset, + F alpha,F beta, F gama, __global int *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -238,9 +238,9 @@ __kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1 } -__kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int src1_offset, - __global float *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset, + __global float *src2, int src2_step,int src2_offset, + F alpha,F beta, F gama, __global float *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -286,9 +286,9 @@ __kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int sr } #if defined (DOUBLE_SUPPORT) -__kernel void addWeighted_D6 (__global double *src1, F alpha,int src1_step,int src1_offset, - __global double *src2, F beta, int src2_step,int src2_offset, - F gama, +__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset, + __global double *src2, int src2_step,int src2_offset, + F alpha,F beta, F gama, __global double *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { diff --git a/modules/ocl/src/kernels/arithm_cartToPolar.cl b/modules/ocl/src/kernels/arithm_cartToPolar.cl index d4aa83a6a..a2f65e0b7 100644 --- a/modules/ocl/src/kernels/arithm_cartToPolar.cl +++ b/modules/ocl/src/kernels/arithm_cartToPolar.cl @@ -49,6 +49,10 @@ #define CV_PI 3.1415926535897932384626433832795 +#ifndef DBL_EPSILON +#define DBL_EPSILON 0x1.0p-52 +#endif + __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset, __global float *src2, int src2_step, int src2_offset, __global float *dst1, int dst1_step, int dst1_offset, //magnitude diff --git a/modules/ocl/src/kernels/arithm_div.cl b/modules/ocl/src/kernels/arithm_div.cl index 43858f023..ae4f46ab1 100644 --- a/modules/ocl/src/kernels/arithm_div.cl +++ b/modules/ocl/src/kernels/arithm_div.cl @@ -45,36 +45,45 @@ #if defined (DOUBLE_SUPPORT) #pragma OPENCL EXTENSION cl_khr_fp64:enable +typedef double F ; +typedef double4 F4; +#define convert_F4 convert_double4 +#define convert_F convert_double +#else +typedef float F; +typedef float4 F4; +#define convert_F4 convert_float4 +#define convert_F convert_float #endif -uchar round2_uchar(double v){ +uchar round2_uchar(F v){ - uchar v1 = convert_uchar_sat(v); - uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5)); + uchar v1 = convert_uchar_sat(round(v)); + //uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5)); - return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; + return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; } -ushort round2_ushort(double v){ +ushort round2_ushort(F v){ - ushort v1 = convert_ushort_sat(v); - ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5)); + ushort v1 = convert_ushort_sat(round(v)); + //ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5)); - return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; + return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; } -short round2_short(double v){ +short round2_short(F v){ - short v1 = convert_short_sat(v); - short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5)); + short v1 = convert_short_sat(round(v)); + //short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5)); - return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; + return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; } -int round2_int(double v){ +int round2_int(F v){ - int v1 = convert_int_sat(v); - int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); + int v1 = convert_int_sat(round(v)); + //int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); - return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; + return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; } /////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////divide/////////////////////////////////////////////////// @@ -83,7 +92,7 @@ int round2_int(double v){ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -104,13 +113,13 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); - double4 tmp = convert_double4(src1_data) * scalar; + F4 tmp = convert_F4(src1_data) * scalar; uchar4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (double)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (double)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (double)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (double)src2_data.w); + tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x); + tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y); + tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z); + tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; @@ -124,7 +133,7 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset, __global ushort *src2, int src2_step, int src2_offset, __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -145,13 +154,13 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - double4 tmp = convert_double4(src1_data) * scalar; + F4 tmp = convert_F4(src1_data) * scalar; ushort4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (double)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (double)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (double)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (double)src2_data.w); + tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (F)src2_data.x); + tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (F)src2_data.y); + tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (F)src2_data.z); + tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (F)src2_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; @@ -164,7 +173,7 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset, __global short *src2, int src2_step, int src2_offset, __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -185,13 +194,13 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - double4 tmp = convert_double4(src1_data) * scalar; + F4 tmp = convert_F4(src1_data) * scalar; short4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (double)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (double)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (double)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (double)src2_data.w); + tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (F)src2_data.x); + tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (F)src2_data.y); + tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (F)src2_data.z); + tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (F)src2_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -206,7 +215,7 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, __global int *src2, int src2_step, int src2_offset, __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -220,8 +229,8 @@ __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, int data1 = *((__global int *)((__global char *)src1 + src1_index)); int data2 = *((__global int *)((__global char *)src2 + src2_index)); - double tmp = convert_double(data1) * scalar; - int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_double)(data2)); + F tmp = convert_F(data1) * scalar; + int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2)); *((__global int *)((__global char *)dst + dst_index)) =tmp_data; } @@ -230,7 +239,7 @@ __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset, __global float *src2, int src2_step, int src2_offset, __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -244,13 +253,14 @@ __kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offse float data1 = *((__global float *)((__global char *)src1 + src1_index)); float data2 = *((__global float *)((__global char *)src2 + src2_index)); - double tmp = convert_double(data1) * scalar; - float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_double)(data2)); + F tmp = convert_F(data1) * scalar; + float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2)); *((__global float *)((__global char *)dst + dst_index)) = tmp_data; } } +#if defined (DOUBLE_SUPPORT) __kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset, __global double *src2, int src2_step, int src2_offset, __global double *dst, int dst_step, int dst_offset, @@ -274,10 +284,11 @@ __kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offs *((__global double *)((__global char *)dst + dst_index)) = tmp_data; } } +#endif /************************************div with scalar************************************/ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset, __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -297,10 +308,10 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (double)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (double)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (double)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (double)src_data.w); + tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (F)src_data.x); + tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (F)src_data.y); + tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (F)src_data.z); + tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (F)src_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; @@ -313,7 +324,7 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset, __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -333,10 +344,10 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (double)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (double)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (double)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (double)src_data.w); + tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (F)src_data.x); + tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (F)src_data.y); + tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (F)src_data.z); + tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (F)src_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; @@ -348,7 +359,7 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse } __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset, __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -368,10 +379,10 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 tmp_data; - tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (double)src_data.x); - tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (double)src_data.y); - tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (double)src_data.z); - tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (double)src_data.w); + tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (F)src_data.x); + tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (F)src_data.y); + tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (F)src_data.z); + tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (F)src_data.w); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -385,7 +396,7 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset, __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -397,7 +408,7 @@ __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset, int data = *((__global int *)((__global char *)src + src_index)); - int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_double)(data)); + int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_F)(data)); *((__global int *)((__global char *)dst + dst_index)) =tmp_data; } @@ -405,7 +416,7 @@ __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset, __kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset, __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, double scalar) + int rows, int cols, int dst_step1, F scalar) { int x = get_global_id(0); int y = get_global_id(1); @@ -417,12 +428,13 @@ __kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset float data = *((__global float *)((__global char *)src + src_index)); - float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_double)(data)); + float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_F)(data)); *((__global float *)((__global char *)dst + dst_index)) = tmp_data; } } +#if defined (DOUBLE_SUPPORT) __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset, __global double *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1, double scalar) @@ -442,5 +454,6 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse *((__global double *)((__global char *)dst + dst_index)) = tmp_data; } } +#endif diff --git a/modules/ocl/src/kernels/arithm_exp.cl b/modules/ocl/src/kernels/arithm_exp.cl index 18f7f0111..1b283a093 100644 --- a/modules/ocl/src/kernels/arithm_exp.cl +++ b/modules/ocl/src/kernels/arithm_exp.cl @@ -70,6 +70,8 @@ __kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int sr } } + +#if defined (DOUBLE_SUPPORT) __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst) { int x = get_global_id(0); @@ -87,3 +89,5 @@ __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int sr // dst[dstIdx] = exp(src[srcIdx]); } } + +#endif diff --git a/modules/ocl/src/kernels/arithm_log.cl b/modules/ocl/src/kernels/arithm_log.cl index ba93cc3f5..081084800 100644 --- a/modules/ocl/src/kernels/arithm_log.cl +++ b/modules/ocl/src/kernels/arithm_log.cl @@ -73,7 +73,7 @@ __kernel void arithm_log_D5(int rows, int cols, int srcStep, int dstStep, int sr } } - +#if defined (DOUBLE_SUPPORT) __kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst) { int x = get_global_id(0); @@ -91,4 +91,4 @@ __kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int sr } } - +#endif diff --git a/modules/ocl/src/kernels/convertC3C4.cl b/modules/ocl/src/kernels/convertC3C4.cl index 54f0fd9ee..1b21fe68c 100644 --- a/modules/ocl/src/kernels/convertC3C4.cl +++ b/modules/ocl/src/kernels/convertC3C4.cl @@ -6,7 +6,7 @@ // Third party copyrights are property of their respective owners. // // @Authors -// Zero Lin, zero.lin@amd.com +// Niko Li, newlife20080214@gmail.com // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -32,106 +32,107 @@ // the use of this software, even if advised of the possibility of such damage. // // - -__kernel void convertC3C4_D0(__global const char4 * restrict src, __global char4 *dst, int cols, int rows, - int srcStep, int dstStep) +//#pragma OPENCL EXTENSION cl_amd_printf : enable +__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows, + int dstStep_in_piexl,int pixel_end) { int id = get_global_id(0); - int y = id / cols; - int x = id % cols; + //int pixel_end = mul24(cols -1 , rows -1); + int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2)); + pixelid = clamp(pixelid,0,pixel_end); + GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3; + pixel0 = src[pixelid.x]; + pixel1 = src[pixelid.y]; + pixel2 = src[pixelid.z]; - int d = y * srcStep + x * 3; - char8 data = (char8)(src[d>>2], src[(d>>2) + 1]); - char temp[8] = {data.s0, data.s1, data.s2, data.s3, data.s4, data.s5, data.s6, data.s7}; - - int start = d & 3; - char4 ndata = (char4)(temp[start], temp[start + 1], temp[start + 2], 0); - if(y < rows) - dst[y * dstStep + x] = ndata; -} -__kernel void convertC3C4_D1(__global const short* restrict src, __global short4 *dst, int cols, int rows, - int srcStep, int dstStep) -{ - int id = get_global_id(0); - int y = id / cols; - int x = id % cols; + outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0); + outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0); + outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0); + outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0); - int d = (y * srcStep + x * 6)>>1; - short4 data = *(__global short4 *)(src + ((d>>1)<<1)); - short temp[4] = {data.s0, data.s1, data.s2, data.s3}; - - int start = d & 1; - short4 ndata = (short4)(temp[start], temp[start + 1], temp[start + 2], 0); - if(y < rows) - dst[y * dstStep + x] = ndata; -} - -__kernel void convertC3C4_D2(__global const int * restrict src, __global int4 *dst, int cols, int rows, - int srcStep, int dstStep) -{ - int id = get_global_id(0); - int y = id / cols; - int x = id % cols; - - int d = (y * srcStep + x * 12)>>2; - int4 data = *(__global int4 *)(src + d); - data.z = 0; - - if(y < rows) - dst[y * dstStep + x] = data; -} - -__kernel void convertC4C3_D2(__global const int4 * restrict src, __global int *dst, int cols, int rows, - int srcStep, int dstStep) -{ - int id = get_global_id(0); - int y = id / cols; - int x = id % cols; - - int4 data = src[y * srcStep + x]; - - if(y < rows) + int4 outy = (id<<2)/cols; + int4 outx = (id<<2)%cols; + outx.y++; + outx.z+=2; + outx.w+=3; + outy = select(outy,outy+1,outx>=cols); + outx = select(outx,outx-cols,outx>=cols); + //outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows)); + //outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows)); + //outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows)); + //outx = select(outx,(int4)outx.x,outy>=rows); + //outy = select(outy,(int4)outy.x,outy>=rows); + int4 addr = mad24(outy,dstStep_in_piexl,outx); + if(outx.w=(int4)cols); + x4 = select(x4,x4-(int4)cols,x4>=(int4)cols); + int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4); + GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2; + pixel0 = src[addr.x]; + pixel1 = src[addr.y]; + pixel2 = src[addr.z]; + pixel3 = src[addr.w]; - short4 data = src[y * srcStep + x]; - - if(y < rows) + pixel0.w = pixel1.x; + outpixel1.x = pixel1.y; + outpixel1.y = pixel1.z; + outpixel1.z = pixel2.x; + outpixel1.w = pixel2.y; + outpixel2.x = pixel2.z; + outpixel2.y = pixel3.x; + outpixel2.z = pixel3.y; + outpixel2.w = pixel3.z; + int4 outaddr = mul24(id>>2 , 3); + outaddr.y++; + outaddr.z+=2; + //printf("%d ",outaddr.z); + if(outaddr.z <= pixel_end) { - int d = y * dstStep + x * 3; - dst[d] = data.x; - dst[d + 1] = data.y; - dst[d + 2] = data.z; + dst[outaddr.x] = pixel0; + dst[outaddr.y] = outpixel1; + dst[outaddr.z] = outpixel2; } -} - -__kernel void convertC4C3_D0(__global const char4 * restrict src, __global char *dst, int cols, int rows, - int srcStep, int dstStep) -{ - int id = get_global_id(0); - int y = id / cols; - int x = id % cols; - - char4 data = src[y * srcStep + x]; - - if(y < rows) + else if(outaddr.y <= pixel_end) { - int d = y * dstStep + x * 3; - dst[d] = data.x; - dst[d + 1] = data.y; - dst[d + 2] = data.z; + dst[outaddr.x] = pixel0; + dst[outaddr.y] = outpixel1; } + else if(outaddr.x <= pixel_end) + { + dst[outaddr.x] = pixel0; + } } diff --git a/modules/ocl/src/kernels/imgproc_resize.cl b/modules/ocl/src/kernels/imgproc_resize.cl index 2841886e2..995ce967d 100644 --- a/modules/ocl/src/kernels/imgproc_resize.cl +++ b/modules/ocl/src/kernels/imgproc_resize.cl @@ -16,7 +16,7 @@ // // @Authors // Zhang Ying, zhangying913@gmail.com -// +// Niko Li, newlife20080214@gmail.com // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // @@ -50,21 +50,11 @@ #if defined DOUBLE_SUPPORT #pragma OPENCL EXTENSION cl_khr_fp64:enable -typedef double F ; +#define F double #else -typedef float F; +#define F float #endif -inline uint4 getPoint_8uc4(__global uchar4 * data, int offset, int x, int y, int step) -{ - return convert_uint4(data[(offset>>2)+ y * (step>>2) + x]); -} - -inline float getPoint_32fc1(__global float * data, int offset, int x, int y, int step) -{ - return data[(offset>>2)+ y * (step>>2) + x]; -} - #define INTER_RESIZE_COEF_BITS 11 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) @@ -72,8 +62,8 @@ inline float getPoint_32fc1(__global float * data, int offset, int x, int y, int #define CAST_SCALE (1.0f/(1<= (l) ? (x):((x)+1)) -__kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned char const * restrict src, - int dst_offset, int src_offset,int dst_step, int src_step, +__kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restrict src, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int gx = get_global_id(0); @@ -81,7 +71,7 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha float4 sx, u, xf; int4 x, DX; - gx = (gx<<2) - (dst_offset&3); + gx = (gx<<2) - (dstoffset_in_pixel&3); DX = (int4)(gx, gx+1, gx+2, gx+3); sx = (convert_float4(DX) + 0.5f) * ifx - 0.5f; xf = floor(sx); @@ -119,10 +109,10 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha int4 val1, val2, val; int4 sdata1, sdata2, sdata3, sdata4; - int4 pos1 = src_offset + y * src_step + x; - int4 pos2 = src_offset + y * src_step + x_; - int4 pos3 = src_offset + y_ * src_step + x; - int4 pos4 = src_offset + y_ * src_step + x_; + int4 pos1 = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); + int4 pos2 = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); + int4 pos3 = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); + int4 pos4 = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); sdata1.s0 = src[pos1.s0]; sdata1.s1 = src[pos1.s1]; @@ -144,20 +134,44 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha sdata4.s2 = src[pos4.s2]; sdata4.s3 = src[pos4.s3]; - val1 = U1 * sdata1 + U * sdata2; - val2 = U1 * sdata3 + U * sdata4; - val = V1 * val1 + V * val2; + val1 = mul24(U1 , sdata1) + mul24(U , sdata2); + val2 = mul24(U1 , sdata3) + mul24(U , sdata4); + val = mul24(V1 , val1) + mul24(V , val2); - __global uchar4* d = (__global uchar4*)(dst + dst_offset + dy * dst_step + gx); - uchar4 dVal = *d; - int4 con = ( DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows); + //__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx); + //uchar4 dVal = *d; + //int4 con = ( DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows); val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS); - *d = convert_uchar4(con != 0) ? convert_uchar4_sat(val) : dVal; - + //*d = convert_uchar4(con != 0) ? convert_uchar4_sat(val) : dVal; + + pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel); + pos4.y++; + pos4.z+=2; + uchar4 uval = convert_uchar4_sat(val); + int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows); + if(con) + { + *(__global uchar4*)(dst + pos4.x)=uval; + } + else + { + if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos4.x]=uval.x; + } + if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos4.y]=uval.y; + } + if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos4.z]=uval.z; + } + } } __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int dx = get_global_id(0); @@ -182,18 +196,25 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src, int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); - - uint4 val = U1* V1 * getPoint_8uc4(src,src_offset,x,y,src_step) + - U1* V * getPoint_8uc4(src,src_offset,x,y_,src_step) + - U * V1 * getPoint_8uc4(src,src_offset,x_,y,src_step) + - U * V * getPoint_8uc4(src,src_offset,x_,y_,src_step); - + int4 srcpos; + srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); + srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); + int4 data0 = convert_int4(src[srcpos.x]); + int4 data1 = convert_int4(src[srcpos.y]); + int4 data2 = convert_int4(src[srcpos.z]); + int4 data3 = convert_int4(src[srcpos.w]); + int4 val = mul24(mul24(U1, V1) , data0) + mul24(mul24(U, V1) , data1) + +mul24(mul24(U1, V) , data2)+mul24(mul24(U, V) , data3); + int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel); + uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS); if(dx>=0 && dx=0 && dy>2) + dy * (dst_step>>2) + dx] = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS); + dst[dstpos] = uval; } __kernel void resizeLN_C1_D5(__global float * dst, __global float * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int dx = get_global_id(0); @@ -210,19 +231,29 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src, int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); - - float val1 = (1.0f-u) * getPoint_32fc1(src,src_offset,x,y,src_step) + - u * getPoint_32fc1(src,src_offset,x_,y,src_step) ; - float val2 = (1.0f-u) * getPoint_32fc1(src,src_offset,x,y_,src_step) + - u * getPoint_32fc1(src,src_offset,x_,y_,src_step); - float val = (1.0f-v) * val1 + v * val2; - + float u1 = 1.f-u; + float v1 = 1.f-v; + int4 srcpos; + srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); + srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); + float data0 = src[srcpos.x]; + float data1 = src[srcpos.y]; + float data2 = src[srcpos.z]; + float data3 = src[srcpos.w]; + float val1 = u1 * data0 + + u * data1 ; + float val2 = u1 * data2 + + u * data3; + float val = v1 * val1 + v * val2; + int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel); if(dx>=0 && dx=0 && dy>2) + dy * (dst_step>>2) + dx] = val; + dst[dstpos] = val; } __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) { int dx = get_global_id(0); @@ -239,31 +270,35 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src, int y_ = INC(y,src_rows); int x_ = INC(x,src_cols); - + float u1 = 1.f-u; + float v1 = 1.f-v; + int4 srcpos; + srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel); + srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel); + srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel); float4 s_data1, s_data2, s_data3, s_data4; - src_offset = (src_offset >> 4); - src_step = (src_step >> 4); - s_data1 = src[src_offset + y*src_step + x]; - s_data2 = src[src_offset + y*src_step + x_]; - s_data3 = src[src_offset + y_*src_step + x]; - s_data4 = src[src_offset + y_*src_step + x_]; - s_data1 = (1.0f-u) * s_data1 + u * s_data2; - s_data2 = (1.0f-u) * s_data3 + u * s_data4; - s_data3 = (1.0f-v) * s_data1 + v * s_data2; + s_data1 = src[srcpos.x]; + s_data2 = src[srcpos.y]; + s_data3 = src[srcpos.z]; + s_data4 = src[srcpos.w]; + float4 val = u1 * v1 * s_data1 + u * v1 * s_data2 + +u1 * v *s_data3 + u * v *s_data4; + int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel); if(dx>=0 && dx=0 && dy>4) + dy * (dst_step>>4) + dx] = s_data3; + dst[dstpos] = val; } __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) { int gx = get_global_id(0); int dy = get_global_id(1); - gx = (gx<<2) - (dst_offset&3); - int4 GX = (int4)(gx, gx+1, gx+2, gx+3); + gx = (gx<<2) - (dstoffset_in_pixel&3); + //int4 GX = (int4)(gx, gx+1, gx+2, gx+3); int4 sx; int sy; @@ -279,22 +314,42 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src, sy = min((int)floor(s5), src_rows-1); uchar4 val; - int4 pos = src_offset + sy * src_step + sx; + int4 pos = mad24(sy, srcstep_in_pixel, sx+srcoffset_in_pixel); val.s0 = src[pos.s0]; val.s1 = src[pos.s1]; val.s2 = src[pos.s2]; val.s3 = src[pos.s3]; - __global uchar4* d = (__global uchar4*)(dst + dst_offset + dy * dst_step + gx); - uchar4 dVal = *d; - int4 con = (GX >= 0 && GX < dst_cols && dy >= 0 && dy < dst_rows); - val = convert_uchar4(con != 0) ? val : dVal; - - *d = val; + //__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx); + //uchar4 dVal = *d; + pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel); + pos.y++; + pos.z+=2; + + int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows); + if(con) + { + *(__global uchar4*)(dst + pos.x)=val; + } + else + { + if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos.x]=val.x; + } + if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos.y]=val.y; + } + if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows) + { + dst[pos.z]=val.z; + } + } } __kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src, - int dst_offset, int src_offset,int dst_step, int src_step, + int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel, int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) { int dx = get_global_id(0); @@ -304,8 +359,8 @@ __kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src, F s2 = dy*ify; int sx = fmin((float)floor(s1), (float)src_cols-1); int sy = fmin((float)floor(s2), (float)src_rows-1); - int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx; - int spos = (src_offset>>2) + sy * (src_step>>2) + sx; + int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel); + int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel); if(dx>=0 && dx=0 && dy>2) + dy * (dst_step>>2) + dx; - int spos = (src_offset>>2) + sy * (src_step>>2) + sx; - + + int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel); + int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel); if(dx>=0 && dx=0 && dy>4) + dy * (dst_step>>4) + dx; - int spos = (src_offset>>4) + sy * (src_step>>4) + sx; + int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel); + int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel); if(dx>=0 && dx=0 && dy=addr_start)&(idx+3 < addr_end) & (y < rows)) { *(__global uchar4*)(dstMat+idx) = out; @@ -65,7 +61,7 @@ __kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat, } } -__kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat, +__kernel void set_to_without_mask(GENTYPE scalar,__global GENTYPE * dstMat, int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) { int x=get_global_id(0); @@ -73,52 +69,6 @@ __kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat, if ( (x < cols) & (y < rows)) { int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); - dstMat[idx] = convert_uchar4_sat(scalar); + dstMat[idx] = scalar; } } -__kernel void set_to_without_mask_C1_D4(float4 scalar,__global int * dstMat, - int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) -{ - int x=get_global_id(0); - int y=get_global_id(1); - if ( (x < cols) & (y < rows)) - { - int idx = mad24(y, dstStep_in_pixel, x+offset_in_pixel); - dstMat[idx] = convert_int_sat(scalar.x); - } -} -__kernel void set_to_without_mask_C4_D4(float4 scalar,__global int4 * dstMat, - int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) -{ - int x=get_global_id(0); - int y=get_global_id(1); - if ( (x < cols) & (y < rows)) - { - int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); - dstMat[idx] = convert_int4_sat(scalar); - } -} - -__kernel void set_to_without_mask_C1_D5(float4 scalar,__global float * dstMat, - int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) -{ - int x=get_global_id(0); - int y=get_global_id(1); - if ( (x < cols) & (y < rows)) - { - int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); - dstMat[idx] = scalar.x; - } -} -__kernel void set_to_without_mask_C4_D5(float4 scalar,__global float4 * dstMat, - int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) -{ - int x=get_global_id(0); - int y=get_global_id(1); - if ( (x < cols) & (y < rows)) - { - int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); - dstMat[idx] = scalar; - } -} - diff --git a/modules/ocl/src/kernels/operator_setToM.cl b/modules/ocl/src/kernels/operator_setToM.cl index e306657e4..56a579b3f 100644 --- a/modules/ocl/src/kernels/operator_setToM.cl +++ b/modules/ocl/src/kernels/operator_setToM.cl @@ -35,12 +35,6 @@ // -/*#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (__NVIDIA__) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif -*/ /* __kernel void set_to_with_mask_C1_D0( float4 scalar, @@ -67,7 +61,7 @@ __kernel void set_to_with_mask_C1_D0( */ //#pragma OPENCL EXTENSION cl_amd_printf : enable __kernel void set_to_with_mask_C1_D0( - float4 scalar, + uchar scalar, __global uchar* dstMat, int cols, int rows, @@ -85,7 +79,7 @@ __kernel void set_to_with_mask_C1_D0( int mask_addr_start = mad24(y,maskStep,maskoffset); int mask_addr_end = mad24(y,maskStep,cols+maskoffset); int maskidx = mad24(y,maskStep,x+ maskoffset & (int)0xfffffffc); - uchar out = convert_uchar_sat(scalar.x); + int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) +3; if ( (x < cols) & (y < rows) ) @@ -107,16 +101,16 @@ __kernel void set_to_with_mask_C1_D0( temp_mask2.z = (maskidx+6 >=mask_addr_start)&(maskidx+6 < mask_addr_end) ? temp_mask2.z : 0; temp_mask2.w = (maskidx+7 >=mask_addr_start)&(maskidx+7 < mask_addr_end) ? temp_mask2.w : 0; uchar trans_mask[10] = {temp_mask1.y,temp_mask1.z,temp_mask1.w,temp_mask.x,temp_mask.y,temp_mask.z,temp_mask.w,temp_mask2.x,temp_mask2.y,temp_mask2.z}; - temp_dst.x = (dstidx>=dst_addr_start)&(dstidx=dst_addr_start)&(dstidx+1=dst_addr_start)&(dstidx+2=dst_addr_start)&(dstidx+3=dst_addr_start)&(dstidx=dst_addr_start)&(dstidx+1=dst_addr_start)&(dstidx+2=dst_addr_start)&(dstidx+3 > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholecols)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholerows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); + args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel)); + args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); - size_t globalThreads[3] = {(dst.wholecols *dst.wholerows + 255) / 256 * 256, 1, 1}; + size_t globalThreads[3] = {((dst.wholecols *dst.wholerows+3)/4 + 255) / 256 * 256, 1, 1}; size_t localThreads[3] = {256, 1, 1}; - openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, dst.elemSize1() >> 1); + openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1,compile_option); } //////////////////////////////////////////////////////////////////////// // convert_C4C3 void convert_C4C3(const oclMat &src, cl_mem &dst, int dstStep) { - int srcStep = src.step1() / src.channels(); + int srcStep_in_pixel = src.step1() / src.channels(); + int pixel_end = src.wholecols*src.wholerows -1; Context *clCxt = src.clCxt; string kernelName = "convertC4C3"; + char compile_option[32]; + switch(src.depth()) + { + case 0: + sprintf(compile_option, "-D GENTYPE4=uchar4"); + break; + case 1: + sprintf(compile_option, "-D GENTYPE4=char4"); + break; + case 2: + sprintf(compile_option, "-D GENTYPE4=ushort4"); + break; + case 3: + sprintf(compile_option, "-D GENTYPE4=short4"); + break; + case 4: + sprintf(compile_option, "-D GENTYPE4=int4"); + break; + case 5: + sprintf(compile_option, "-D GENTYPE4=float4"); + break; + case 6: + sprintf(compile_option, "-D GENTYPE4=double4"); + break; + default: + CV_Error(-217,"unknown depth"); + } vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); + args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel)); + args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); - size_t globalThreads[3] = {(src.wholecols *src.wholerows + 255) / 256 * 256, 1, 1}; + size_t globalThreads[3] = {((src.wholecols *src.wholerows+3)/4 + 255) / 256 * 256, 1, 1}; size_t localThreads[3] = {256, 1, 1}; - openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, src.elemSize1() >> 1); + openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1,compile_option); } void cv::ocl::oclMat::upload(const Mat &m) @@ -173,23 +229,47 @@ void cv::ocl::oclMat::upload(const Mat &m) Point ofs; m.locateROI(wholeSize, ofs); int type = m.type(); - //if(m.channels() == 3) - //type = CV_MAKETYPE(m.depth(), 4); + if(m.channels() == 3) + { + type = CV_MAKETYPE(m.depth(), 4); + } create(wholeSize, type); - //if(m.channels() == 3) - //{ - //int pitch = GPU_MATRIX_MALLOC_STEP(wholeSize.width * 3 * m.elemSize1()); - //int err; - //cl_mem temp = clCreateBuffer(clCxt->clContext,CL_MEM_READ_WRITE, - //pitch*wholeSize.height,0,&err); - //CV_DbgAssert(err==0); + if(m.channels() == 3) + { + int pitch = wholeSize.width * 3 * m.elemSize1(); + int tail_padding = m.elemSize1()*3072; + int err; + cl_mem temp = clCreateBuffer(clCxt->impl->clContext,CL_MEM_READ_WRITE, + (pitch*wholeSize.height+tail_padding-1)/tail_padding*tail_padding,0,&err); + openCLVerifyCall(err); - //openCLMemcpy2D(clCxt,temp,pitch,m.datastart,m.step,wholeSize.width*m.elemSize(),wholeSize.height,clMemcpyHostToDevice); - //convert_C3C4(temp, *this, pitch); - //} - //else - openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice); + openCLMemcpy2D(clCxt,temp,pitch,m.datastart,m.step,wholeSize.width*m.elemSize(),wholeSize.height,clMemcpyHostToDevice,3); + convert_C3C4(temp, *this, pitch); + //int* cputemp=new int[wholeSize.height*wholeSize.width * 3]; + //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; + //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE, + // 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL)); + //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE, + // 0, this->step*this->wholerows, cpudata, 0, NULL, NULL)); + //for(int i=0;istep/sizeof(int); + // for(int j=0;jempty()); int t = type(); - //if(download_channels == 3) - //t = CV_MAKETYPE(depth(), 3); + if(download_channels == 3) + { + t = CV_MAKETYPE(depth(), 3); + } m.create(wholerows, wholecols, t); - //if(download_channels == 3) - //{ - //int pitch = GPU_MATRIX_MALLOC_STEP(wholecols * 3 * m.elemSize1()); - //int err; - //cl_mem temp = clCreateBuffer(clCxt->clContext,CL_MEM_READ_WRITE, - //pitch*wholerows,0,&err); - //CV_DbgAssert(err==0); + if(download_channels == 3) + { + int pitch = wholecols * 3 * m.elemSize1(); + int tail_padding = m.elemSize1()*3072; + int err; + cl_mem temp = clCreateBuffer(clCxt->impl->clContext,CL_MEM_READ_WRITE, + (pitch*wholerows+tail_padding-1)/tail_padding*tail_padding,0,&err); + openCLVerifyCall(err); - //convert_C4C3(*this, temp, pitch/m.elemSize1()); - //openCLMemcpy2D(clCxt,m.data,m.step,temp,pitch,wholecols*m.elemSize(),wholerows,clMemcpyDeviceToHost); - //} - //else - openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost); + convert_C4C3(*this, temp, pitch/m.elemSize1()); + openCLMemcpy2D(clCxt,m.data,m.step,temp,pitch,wholecols*m.elemSize(),wholerows,clMemcpyDeviceToHost,3); + //int* cputemp=new int[wholecols*wholerows * 3]; + //int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; + //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE, + // 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL)); + //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE, + // 0, this->step*this->wholerows, cpudata, 0, NULL, NULL)); + //for(int i=0;istep/sizeof(int); + // for(int j=0;j > args; - cl_float4 val; - val.s[0] = scalar.val[0]; - val.s[1] = scalar.val[1]; - val.s[2] = scalar.val[2]; - val.s[3] = scalar.val[3]; + size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3]; globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; @@ -388,25 +488,168 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern { globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; } - args.push_back( make_pair( sizeof(cl_float4) , (void *)&val )); + char compile_option[32]; + union sc + { + cl_uchar4 uval; + cl_char4 cval; + cl_ushort4 usval; + cl_short4 shval; + cl_int4 ival; + cl_float4 fval; + cl_double4 dval; + }val; + switch(dst.depth()) + { + case 0: + val.uval.s[0] = saturate_cast(scalar.val[0]); + val.uval.s[1] = saturate_cast(scalar.val[1]); + val.uval.s[2] = saturate_cast(scalar.val[2]); + val.uval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=uchar"); + args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=uchar4"); + args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 1: + val.cval.s[0] = saturate_cast(scalar.val[0]); + val.cval.s[1] = saturate_cast(scalar.val[1]); + val.cval.s[2] = saturate_cast(scalar.val[2]); + val.cval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=char"); + args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=char4"); + args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 2: + val.usval.s[0] = saturate_cast(scalar.val[0]); + val.usval.s[1] = saturate_cast(scalar.val[1]); + val.usval.s[2] = saturate_cast(scalar.val[2]); + val.usval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=ushort"); + args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=ushort4"); + args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 3: + val.shval.s[0] = saturate_cast(scalar.val[0]); + val.shval.s[1] = saturate_cast(scalar.val[1]); + val.shval.s[2] = saturate_cast(scalar.val[2]); + val.shval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=short"); + args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=short4"); + args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 4: + val.ival.s[0] = saturate_cast(scalar.val[0]); + val.ival.s[1] = saturate_cast(scalar.val[1]); + val.ival.s[2] = saturate_cast(scalar.val[2]); + val.ival.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=int"); + args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=int4"); + args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 5: + val.fval.s[0] = scalar.val[0]; + val.fval.s[1] = scalar.val[1]; + val.fval.s[2] = scalar.val[2]; + val.fval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=float"); + args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=float4"); + args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 6: + val.dval.s[0] = scalar.val[0]; + val.dval.s[1] = scalar.val[1]; + val.dval.s[2] = scalar.val[2]; + val.dval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=double"); + args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=double4"); + args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + default: + CV_Error(-217,"unknown depth"); + } args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, dst.channels(), dst.depth()); + localThreads, args, -1, -1,compile_option); } void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, string kernelName) { CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols); vector > args; - cl_float4 val; - val.s[0] = scalar.val[0]; - val.s[1] = scalar.val[1]; - val.s[2] = scalar.val[2]; - val.s[3] = scalar.val[3]; size_t localThreads[3] = {16, 16, 1}; size_t globalThreads[3]; globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; @@ -417,7 +660,155 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat & globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; } int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize(); - args.push_back( make_pair( sizeof(cl_float4) , (void *)&val )); + char compile_option[32]; + union sc + { + cl_uchar4 uval; + cl_char4 cval; + cl_ushort4 usval; + cl_short4 shval; + cl_int4 ival; + cl_float4 fval; + cl_double4 dval; + }val; + switch(dst.depth()) + { + case 0: + val.uval.s[0] = saturate_cast(scalar.val[0]); + val.uval.s[1] = saturate_cast(scalar.val[1]); + val.uval.s[2] = saturate_cast(scalar.val[2]); + val.uval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=uchar"); + args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=uchar4"); + args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 1: + val.cval.s[0] = saturate_cast(scalar.val[0]); + val.cval.s[1] = saturate_cast(scalar.val[1]); + val.cval.s[2] = saturate_cast(scalar.val[2]); + val.cval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=char"); + args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=char4"); + args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 2: + val.usval.s[0] = saturate_cast(scalar.val[0]); + val.usval.s[1] = saturate_cast(scalar.val[1]); + val.usval.s[2] = saturate_cast(scalar.val[2]); + val.usval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=ushort"); + args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=ushort4"); + args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 3: + val.shval.s[0] = saturate_cast(scalar.val[0]); + val.shval.s[1] = saturate_cast(scalar.val[1]); + val.shval.s[2] = saturate_cast(scalar.val[2]); + val.shval.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=short"); + args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=short4"); + args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 4: + val.ival.s[0] = saturate_cast(scalar.val[0]); + val.ival.s[1] = saturate_cast(scalar.val[1]); + val.ival.s[2] = saturate_cast(scalar.val[2]); + val.ival.s[3] = saturate_cast(scalar.val[3]); + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=int"); + args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=int4"); + args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 5: + val.fval.s[0] = scalar.val[0]; + val.fval.s[1] = scalar.val[1]; + val.fval.s[2] = scalar.val[2]; + val.fval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=float"); + args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=float4"); + args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + case 6: + val.dval.s[0] = scalar.val[0]; + val.dval.s[1] = scalar.val[1]; + val.dval.s[2] = scalar.val[2]; + val.dval.s[3] = scalar.val[3]; + switch(dst.channels()) + { + case 1: + sprintf(compile_option, "-D GENTYPE=double"); + args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] )); + break; + case 4: + sprintf(compile_option, "-D GENTYPE=double4"); + args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval )); + break; + default: + CV_Error(-217,"unsupported channels"); + } + break; + default: + CV_Error(-217,"unknown depth"); + } args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); @@ -427,7 +818,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat & args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset )); openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads, - localThreads, args, dst.channels(), dst.depth()); + localThreads, args, -1, -1,compile_option); } oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask) @@ -446,11 +837,25 @@ oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask) // (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0)); if (mask.empty()) { - set_to_withoutmask_run(*this, scalar, "set_to_without_mask"); + if(type()==CV_8UC1) + { + set_to_withoutmask_run(*this, scalar, "set_to_without_mask_C1_D0"); + } + else + { + set_to_withoutmask_run(*this, scalar, "set_to_without_mask"); + } } else { - set_to_withmask_run(*this, scalar, mask, "set_to_with_mask"); + if(type()==CV_8UC1) + { + set_to_withmask_run(*this, scalar, mask,"set_to_with_mask_C1_D0"); + } + else + { + set_to_withmask_run(*this, scalar, mask, "set_to_with_mask"); + } } return *this; diff --git a/modules/ocl/src/precomp.hpp b/modules/ocl/src/precomp.hpp index 587d70dce..0bde1e775 100644 --- a/modules/ocl/src/precomp.hpp +++ b/modules/ocl/src/precomp.hpp @@ -97,7 +97,7 @@ namespace cv size_t widthInBytes, size_t height); void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, const void *src, size_t spitch, - size_t width, size_t height, enum openCLMemcpyKind kind); + size_t width, size_t height, enum openCLMemcpyKind kind, int channels=-1); void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, const void *src, size_t spitch, size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind); @@ -126,8 +126,8 @@ namespace cv cl_mem openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr); - void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr, - enum openCLMemcpyKind kind, cl_bool blocking_write); + //void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr, + // enum openCLMemcpyKind kind, cl_bool blocking_write); int savetofile(const Context *clcxt, cl_program &program, const char *fileName); struct Context::Impl { diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp index ff2f44171..90ff0b441 100644 --- a/modules/ocl/test/test_imgproc.cpp +++ b/modules/ocl/test/test_imgproc.cpp @@ -958,7 +958,7 @@ TEST_P(Remap, Mat) if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1)) { cout << "LINEAR don't support the map1Type and map2Type" << endl; - return; + return; } int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; diff --git a/modules/ocl/test/test_matrix_operation.cpp b/modules/ocl/test/test_matrix_operation.cpp index 997fbe7ad..d538748fb 100644 --- a/modules/ocl/test/test_matrix_operation.cpp +++ b/modules/ocl/test/test_matrix_operation.cpp @@ -396,6 +396,101 @@ TEST_P(SetTo, With_mask) } } +//convertC3C4 +PARAM_TEST_CASE(convertC3C4, MatType, cv::Size) +{ + int type; + cv::Size ksize; + + //src mat + cv::Mat mat1; + cv::Mat dst; + + // set up roi + int roicols; + int roirows; + int src1x; + int src1y; + int dstx; + int dsty; + + //src mat with roi + cv::Mat mat1_roi; + cv::Mat dst_roi; + std::vector oclinfo; + //ocl dst mat for testing + cv::ocl::oclMat gdst_whole; + + //ocl mat with roi + cv::ocl::oclMat gmat1; + cv::ocl::oclMat gdst; + + virtual void SetUp() + { + type = GET_PARAM(0); + ksize = GET_PARAM(1); + + + + //dst = randomMat(rng, size, type, 5, 16, false); + int devnums = getDevice(oclinfo); + CV_Assert(devnums > 0); + //if you want to use undefault device, set it here + //setDevice(oclinfo[1]); + } + + void random_roi() + { +#ifdef RANDOMROI + //randomize ROI + cv::RNG &rng = TS::ptr()->get_rng(); + roicols = rng.uniform(2, mat1.cols); + roirows = rng.uniform(2, mat1.rows); + src1x = rng.uniform(0, mat1.cols - roicols); + src1y = rng.uniform(0, mat1.rows - roirows); + dstx = rng.uniform(0, dst.cols - roicols); + dsty = rng.uniform(0, dst.rows - roirows); +#else + roicols = mat1.cols; + roirows = mat1.rows; + src1x = 0; + src1y = 0; + dstx = 0; + dsty = 0; +#endif + + mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows)); + dst_roi = dst(Rect(dstx, dsty, roicols, roirows)); + + gdst_whole = dst; + gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows)); + + + gmat1 = mat1_roi; + } + +}; + +TEST_P(convertC3C4, Accuracy) +{ + cv::RNG &rng = TS::ptr()->get_rng(); + for(int j = 0; j < LOOP_TIMES; j++) + { + //random_roi(); + int width = rng.uniform(2, MWIDTH); + int height = rng.uniform(2, MHEIGHT); + cv::Size size(width, height); + + mat1 = randomMat(rng, size, type, 0, 40, false); + gmat1 = mat1; + cv::Mat cpu_dst; + gmat1.download(cpu_dst); + char sss[1024]; + sprintf(sss, "cols=%d,rows=%d", mat1.cols, mat1.rows); + EXPECT_MAT_NEAR(mat1, cpu_dst, 0.0, sss); + } + +} INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine( Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), @@ -408,5 +503,8 @@ INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine( INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine( Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), Values(false))); // Values(false) is the reserved parameter - + +INSTANTIATE_TEST_CASE_P(MatrixOperation, convertC3C4, Combine( + Values(CV_8UC3, CV_32SC3, CV_32FC3), + Values(cv::Size()))); #endif From 310b1ad7b9aabdd8443cd53f5bd321a22c9d5902 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 3 Aug 2012 16:41:00 +0400 Subject: [PATCH 6/9] moved parallel_for_ and ParallelLoopBody to core.hpp --- modules/core/include/opencv2/core/core.hpp | 12 ++ .../include/opencv2/core/parallel_tool.hpp | 108 ------------------ .../src/{parallel_tool.cpp => parallel.cpp} | 17 +++ modules/core/src/precomp.hpp | 1 - modules/imgproc/src/precomp.hpp | 1 - 5 files changed, 29 insertions(+), 110 deletions(-) delete mode 100644 modules/core/include/opencv2/core/parallel_tool.hpp rename modules/core/src/{parallel_tool.cpp => parallel.cpp} (90%) diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp index 42ab6a17d..1e8210e9b 100644 --- a/modules/core/include/opencv2/core/core.hpp +++ b/modules/core/include/opencv2/core/core.hpp @@ -4608,6 +4608,18 @@ float CommandLineParser::analyzeValue(const std::string& str, bool space_ template<> CV_EXPORTS double CommandLineParser::analyzeValue(const std::string& str, bool space_delete); +/////////////////////////////// Parallel Primitives ////////////////////////////////// + +// a base body class +class CV_EXPORTS ParallelLoopBody +{ +public: + virtual void operator() (const Range& range) const = 0; + virtual ~ParallelLoopBody(); +}; + +CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body); + } #endif // __cplusplus diff --git a/modules/core/include/opencv2/core/parallel_tool.hpp b/modules/core/include/opencv2/core/parallel_tool.hpp deleted file mode 100644 index 08258d5c2..000000000 --- a/modules/core/include/opencv2/core/parallel_tool.hpp +++ /dev/null @@ -1,108 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_PARALLEL_TOOL_HPP__ -#define __OPENCV_PARALLEL_TOOL_HPP__ - -#ifdef HAVE_CVCONFIG_H -# include -#endif // HAVE_CVCONFIG_H - -/* - HAVE_TBB - using TBB - HAVE_GCD - using GCD - HAVE_OPENMP - using OpenMP - HAVE_CONCURRENCY - using visual studio 2010 concurrency -*/ - -#ifdef HAVE_TBB -# include "tbb/tbb_stddef.h" -# if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202 -# include "tbb/tbb.h" -# include "tbb/task.h" -# undef min -# undef max -# else -# undef HAVE_TBB -# endif // end TBB version -#endif // HAVE_TBB - -#ifdef __cplusplus - -namespace cv -{ - // a base body class - class CV_EXPORTS ParallelLoopBody - { - public: - virtual void operator() (const Range& range) const = 0; - virtual ~ParallelLoopBody(); - }; - - CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body); - - template inline - CV_EXPORTS void parallel_do_(Iterator first, Iterator last, const Body& body) - { -#ifdef HAVE_TBB - tbb::parallel_do(first, last, body); -#else - for ( ; first != last; ++first) - body(*first); -#endif // HAVE_TBB - } - - template inline - CV_EXPORTS void parallel_reduce_(const Range& range, Body& body) - { -#ifdef HAVE_TBB - tbb::parallel_reduce(tbb::blocked_range(range.start, range.end), body); -#else - body(range); -#endif // end HAVE_TBB - } - -} // namespace cv - -#endif // __cplusplus - -#endif // __OPENCV_PARALLEL_TOOL_HPP__ diff --git a/modules/core/src/parallel_tool.cpp b/modules/core/src/parallel.cpp similarity index 90% rename from modules/core/src/parallel_tool.cpp rename to modules/core/src/parallel.cpp index 423d4787d..4274caf34 100644 --- a/modules/core/src/parallel_tool.cpp +++ b/modules/core/src/parallel.cpp @@ -48,8 +48,25 @@ # include #elif defined HAVE_GCD # include +#elif defined HAVE_TBB +# include "tbb/tbb_stddef.h" +# if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202 +# include "tbb/tbb.h" +# include "tbb/task.h" +# undef min +# undef max +# else +# undef HAVE_TBB +# endif // end TBB version #endif // HAVE_CONCURRENCY +/* + HAVE_TBB - using TBB + HAVE_GCD - using GCD + HAVE_OPENMP - using OpenMP + HAVE_CONCURRENCY - using visual studio 2010 concurrency +*/ + namespace cv { ParallelLoopBody::~ParallelLoopBody() { } diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 60429075a..81b9d6e80 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -50,7 +50,6 @@ #include "opencv2/core/core.hpp" #include "opencv2/core/core_c.h" #include "opencv2/core/internal.hpp" -#include "opencv2/core/parallel_tool.hpp" #include #include diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp index 998008ae2..fef5f755b 100644 --- a/modules/imgproc/src/precomp.hpp +++ b/modules/imgproc/src/precomp.hpp @@ -50,7 +50,6 @@ #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/imgproc/imgproc_c.h" #include "opencv2/core/internal.hpp" -#include "opencv2/core/parallel_tool.hpp" #include #include #include From 5b4297cccffafe7a1ffeb4a5f76268823a205c3a Mon Sep 17 00:00:00 2001 From: "andrey.kamaev" Date: Fri, 3 Aug 2012 17:00:18 +0400 Subject: [PATCH 7/9] Android CMake toolchain is updated for NDK r8b --- android/android.toolchain.cmake | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/android/android.toolchain.cmake b/android/android.toolchain.cmake index ecbe3192b..324074c8f 100644 --- a/android/android.toolchain.cmake +++ b/android/android.toolchain.cmake @@ -180,6 +180,8 @@ # - modified May 2012 # [+] updated for NDK r8 # [+] added mips architecture support +# - modified August 2012 +# [+] updated for NDK r8b # ------------------------------------------------------------------------------ cmake_minimum_required( VERSION 2.6.3 ) @@ -199,7 +201,7 @@ set( CMAKE_SYSTEM_NAME Linux ) #this one not so much set( CMAKE_SYSTEM_VERSION 1 ) -set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" ) +set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" ) if(NOT DEFINED ANDROID_NDK_SEARCH_PATHS) if( CMAKE_HOST_WIN32 ) file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS ) @@ -473,11 +475,11 @@ if( BUILD_WITH_ANDROID_NDK ) foreach( __toolchain ${__availableToolchains} ) __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__toolchain}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" ) if( __machine ) - string( REGEX MATCH "[0-9]+.[0-9]+.[0-9]+$" __version "${__toolchain}" ) + string( REGEX MATCH "[0-9]+[.][0-9]+[.]*[0-9]*$" __version "${__toolchain}" ) string( REGEX MATCH "^[^-]+" __arch "${__toolchain}" ) - list( APPEND __availableToolchainMachines ${__machine} ) - list( APPEND __availableToolchainArchs ${__arch} ) - list( APPEND __availableToolchainCompilerVersions ${__version} ) + list( APPEND __availableToolchainMachines "${__machine}" ) + list( APPEND __availableToolchainArchs "${__arch}" ) + list( APPEND __availableToolchainCompilerVersions "${__version}" ) else() list( REMOVE_ITEM __availableToolchains "${__toolchain}" ) endif() @@ -669,8 +671,13 @@ if( BUILD_WITH_ANDROID_NDK ) set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/stlport/stlport" ) set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/stlport/libs/${ANDROID_NDK_ABI_NAME}" ) else() - set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/include" ) - set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}" ) + if( EXISTS "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" ) + set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/include" ) + set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}" ) + else() + set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/include" ) + set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}" ) + endif() endif() endif() From 9c0f556d8844ad98cd0e2ce9c11615d0ba460dce Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 3 Aug 2012 17:12:45 +0400 Subject: [PATCH 8/9] fixed a few compile warnings and errors with VS2010. --- modules/imgproc/src/smooth.cpp | 37 +++++++++---------- .../imgproc/test/test_bilateral_filter.cpp | 4 +- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 1bc11c7fc..2165673c8 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1292,22 +1292,22 @@ class BilateralFilter_8u_Invoker : public ParallelLoopBody { public: - BilateralFilter_8u_Invoker(const Mat &_src, Mat& _dst, Mat _temp, int _radius, int _maxk, + BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk, int* _space_ofs, float *_space_weight, float *_color_weight) : - ParallelLoopBody(), src(_src), dst(_dst), temp(_temp), radius(_radius), + ParallelLoopBody(), dest(&_dest), temp(&_temp), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight) { } virtual void operator() (const Range& range) const { - int i, j, cn = src.channels(), k; - Size size = src.size(); + int i, j, cn = dest->channels(), k; + Size size = dest->size(); for( i = range.start; i < range.end; i++ ) { - const uchar* sptr = temp.data + (i+radius)*temp.step + radius*cn; - uchar* dptr = dst.data + i*dst.step; + const uchar* sptr = temp->ptr(i+radius) + radius*cn; + uchar* dptr = dest->ptr(i); if( cn == 1 ) { @@ -1353,9 +1353,9 @@ public: } private: - const Mat& src; - Mat &dst, temp; - int radius, maxk, * space_ofs; + const Mat *temp; + Mat *dest; + int radius, maxk, *space_ofs; float *space_weight, *color_weight; }; @@ -1412,7 +1412,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, space_ofs[maxk++] = (int)(i*temp.step + j*cn); } - BilateralFilter_8u_Invoker body(src, dst, temp, radius, maxk, space_ofs, space_weight, color_weight); + BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); parallel_for_(Range(0, size.height), body); } @@ -1423,22 +1423,21 @@ class BilateralFilter_32f_Invoker : public: BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs, - Mat _temp, Mat *_dest, Size _size, - float _scale_index, float *_space_weight, float *_expLUT) : + const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) : ParallelLoopBody(), cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), - temp(_temp), dest(_dest), size(_size), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT) + temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT) { } virtual void operator() (const Range& range) const { - Mat& dst = *dest; int i, j, k; + Size size = dest->size(); for( i = range.start; i < range.end; i++ ) { - const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn; - float* dptr = (float*)(dst.data + i*dst.step); + const float* sptr = temp->ptr(i+radius) + radius*cn; + float* dptr = dest->ptr(i); if( cn == 1 ) { @@ -1490,8 +1489,8 @@ public: private: int cn, radius, maxk, *space_ofs; - Mat temp, *dest; - Size size; + const Mat* temp; + Mat *dest; float scale_index, *space_weight, *expLUT; }; @@ -1581,7 +1580,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, // parallel_for usage - BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, &dst, size, scale_index, space_weight, expLUT); + BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); parallel_for_(Range(0, size.height), body); } diff --git a/modules/imgproc/test/test_bilateral_filter.cpp b/modules/imgproc/test/test_bilateral_filter.cpp index 034f9c363..7379b14c9 100644 --- a/modules/imgproc/test/test_bilateral_filter.cpp +++ b/modules/imgproc/test/test_bilateral_filter.cpp @@ -90,8 +90,8 @@ namespace cvtest int CV_BilateralFilterTest::getRandInt(RNG& rng, int min_value, int max_value) const { - double rand_value = rng.uniform(log(min_value), log(max_value + 1)); - return cvRound(exp(rand_value)); + double rand_value = rng.uniform(log((double)min_value), log((double)max_value + 1)); + return cvRound(exp((double)rand_value)); } void CV_BilateralFilterTest::reference_bilateral_filter(const Mat &src, Mat &dst, int d, From 064d022a4bacfe6c21243739625fef3c1f8c9192 Mon Sep 17 00:00:00 2001 From: Evgeny Talanin Date: Fri, 3 Aug 2012 17:50:55 +0400 Subject: [PATCH 9/9] Set correct path to repo in python tests (changed after migration to git) --- modules/python/test/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/test/test.py b/modules/python/test/test.py index daace0f9e..5a80832e7 100644 --- a/modules/python/test/test.py +++ b/modules/python/test/test.py @@ -69,7 +69,7 @@ class OpenCVTests(unittest.TestCase): def get_sample(self, filename, iscolor = cv.CV_LOAD_IMAGE_COLOR): if not filename in self.image_cache: - filedata = urllib.urlopen("http://code.opencv.org/svn/opencv/trunk/opencv/" + filename).read() + filedata = urllib.urlopen("http://code.opencv.org/projects/opencv/repository/revisions/master/raw/" + filename).read() imagefiledata = cv.CreateMatHeader(1, len(filedata), cv.CV_8UC1) cv.SetData(imagefiledata, filedata, len(filedata)) self.image_cache[filename] = cv.DecodeImageM(imagefiledata, iscolor)