From d084d19779fec1668ab2aefe34d228d854782601 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 23 Dec 2013 15:28:50 +0400 Subject: [PATCH] added OpenCL optimization for LBP-based face detector --- modules/objdetect/src/cascadedetect.cpp | 117 ++++++++++++------ modules/objdetect/src/cascadedetect.hpp | 3 +- modules/objdetect/src/opencl/cascadedetect.cl | 35 +++--- 3 files changed, 102 insertions(+), 53 deletions(-) diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp index 93225f1e2..07f9bde95 100644 --- a/modules/objdetect/src/cascadedetect.cpp +++ b/modules/objdetect/src/cascadedetect.cpp @@ -743,6 +743,14 @@ bool LBPEvaluator::setWindow( Point pt ) pwin = &sum.at(pt); return true; } + + +void LBPEvaluator::getUMats(std::vector& bufs) +{ + bufs.clear(); + bufs.push_back(usum); + bufs.push_back(ufbuf); +} //---------------------------------------------- HOGEvaluator --------------------------------------- bool HOGEvaluator::Feature :: read( const FileNode& node ) @@ -1162,50 +1170,84 @@ bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processin bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size processingRectSize, int yStep, double factor, Size sumSize0 ) { - const int VECTOR_SIZE = 1; - Ptr haar = featureEvaluator.dynamicCast(); - if( haar.empty() ) - return false; - - haar->setImage(_image, data.origWinSize, sumSize0); - - if( cascadeKernel.empty() ) - { - cascadeKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc, - format("-D VECTOR_SIZE=%d", VECTOR_SIZE)); - if( cascadeKernel.empty() ) - return false; - } - + int featureType = getFeatureType(); + std::vector bufs; + size_t globalsize[] = { processingRectSize.width/yStep, processingRectSize.height/yStep }; + bool ok = false; + if( ustages.empty() ) { copyVectorToUMat(data.stages, ustages); copyVectorToUMat(data.stumps, ustumps); + if( !data.subsets.empty() ) + copyVectorToUMat(data.subsets, usubsets); } - std::vector bufs; - haar->getUMats(bufs); - CV_Assert(bufs.size() == 3); + if( featureType == FeatureEvaluator::HAAR ) + { + Ptr haar = featureEvaluator.dynamicCast(); + if( haar.empty() ) + return false; - Rect normrect = haar->getNormRect(); + haar->setImage(_image, data.origWinSize, sumSize0); + if( haarKernel.empty() ) + { + haarKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc, ""); + if( haarKernel.empty() ) + return false; + } + + haar->getUMats(bufs); + Rect normrect = haar->getNormRect(); - //processingRectSize = Size(yStep, yStep); - size_t globalsize[] = { (processingRectSize.width/yStep + VECTOR_SIZE-1)/VECTOR_SIZE, processingRectSize.height/yStep }; + haarKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum + ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum + ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures - cascadeKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum - ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum - ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures + // cascade classifier + (int)data.stages.size(), + ocl::KernelArg::PtrReadOnly(ustages), + ocl::KernelArg::PtrReadOnly(ustumps), - // cascade classifier - (int)data.stages.size(), - ocl::KernelArg::PtrReadOnly(ustages), - ocl::KernelArg::PtrReadOnly(ustumps), - - ocl::KernelArg::PtrWriteOnly(ufacepos), // positions - processingRectSize, - yStep, (float)factor, - normrect, data.origWinSize, MAX_FACES); - bool ok = cascadeKernel.run(2, globalsize, 0, true); + ocl::KernelArg::PtrWriteOnly(ufacepos), // positions + processingRectSize, + yStep, (float)factor, + normrect, data.origWinSize, MAX_FACES); + ok = haarKernel.run(2, globalsize, 0, true); + } + else if( featureType == FeatureEvaluator::LBP ) + { + Ptr lbp = featureEvaluator.dynamicCast(); + if( lbp.empty() ) + return false; + + lbp->setImage(_image, data.origWinSize, sumSize0); + if( lbpKernel.empty() ) + { + lbpKernel.create("runLBPClassifierStump", ocl::objdetect::cascadedetect_oclsrc, ""); + if( lbpKernel.empty() ) + return false; + } + + lbp->getUMats(bufs); + + int subsetSize = (data.ncategories + 31)/32; + lbpKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum + ocl::KernelArg::PtrReadOnly(bufs[1]), // optfeatures + + // cascade classifier + (int)data.stages.size(), + ocl::KernelArg::PtrReadOnly(ustages), + ocl::KernelArg::PtrReadOnly(ustumps), + ocl::KernelArg::PtrReadOnly(usubsets), + subsetSize, + + ocl::KernelArg::PtrWriteOnly(ufacepos), // positions + processingRectSize, + yStep, (float)factor, + data.origWinSize, MAX_FACES); + ok = lbpKernel.run(2, globalsize, 0, true); + } //CV_Assert(ok); return ok; } @@ -1254,6 +1296,7 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std:: double scaleFactor, Size minObjectSize, Size maxObjectSize, bool outputRejectLevels ) { + int featureType = getFeatureType(); Size imgsz = _image.size(); int imgtype = _image.type(); @@ -1267,7 +1310,8 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std:: maxObjectSize = imgsz; bool use_ocl = ocl::useOpenCL() && - getFeatureType() == FeatureEvaluator::HAAR && + (featureType == FeatureEvaluator::HAAR || + featureType == FeatureEvaluator::LBP) && !isOldFormatCascade() && data.isStumpBased() && maskGenerator.empty() && @@ -1593,7 +1637,8 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root) bool CascadeClassifierImpl::read_(const FileNode& root) { tryOpenCL = true; - cascadeKernel = ocl::Kernel(); + haarKernel = ocl::Kernel(); + lbpKernel = ocl::Kernel(); ustages.release(); ustumps.release(); if( !data.read(root) ) diff --git a/modules/objdetect/src/cascadedetect.hpp b/modules/objdetect/src/cascadedetect.hpp index a0b2b55c9..3731344d4 100644 --- a/modules/objdetect/src/cascadedetect.hpp +++ b/modules/objdetect/src/cascadedetect.hpp @@ -149,7 +149,7 @@ protected: Ptr maskGenerator; UMat ugrayImage, uimageBuffer; UMat ufacepos, ustages, ustumps, usubsets; - ocl::Kernel cascadeKernel; + ocl::Kernel haarKernel, lbpKernel; bool tryOpenCL; Mutex mtx; @@ -392,6 +392,7 @@ public: virtual bool setImage(InputArray image, Size _origWinSize, Size); virtual bool setWindow(Point pt); + virtual void getUMats(std::vector& bufs); int operator()(int featureIdx) const { return optfeaturesPtr[featureIdx].calc(pwin); } diff --git a/modules/objdetect/src/opencl/cascadedetect.cl b/modules/objdetect/src/opencl/cascadedetect.cl index 7428e89a2..3e0187e5b 100644 --- a/modules/objdetect/src/opencl/cascadedetect.cl +++ b/modules/objdetect/src/opencl/cascadedetect.cl @@ -105,7 +105,7 @@ __kernel void runHaarClassifierStump( } } -#if 0 + __kernel void runLBPClassifierStump( __global const int* sum, int sumstep, int sumoffset, @@ -119,45 +119,48 @@ __kernel void runLBPClassifierStump( volatile __global int* facepos, int2 imgsize, int xyscale, float factor, - int4 normrect, int2 windowsize, int maxFaces) + int2 windowsize, int maxFaces) { int ix = get_global_id(0)*xyscale; int iy = get_global_id(1)*xyscale; sumstep /= sizeof(int); - sqsumstep /= sizeof(int); if( ix < imgsize.x && iy < imgsize.y ) { int stageIdx; __global const Stump* stump = stumps; + __global const int* p = sum + mad24(iy, sumstep, ix); for( stageIdx = 0; stageIdx < nstages; stageIdx++ ) { int i, ntrees = stages[stageIdx].ntrees; float s = 0.f; - for( i = 0; i < ntrees; i++, stump++ ) + for( i = 0; i < ntrees; i++, stump++, bitsets += bitsetSize ) { float4 st = stump->st; __global const OptLBPFeature* f = optfeatures + as_int(st.x); int16 ofs = f->ofs; + #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \ + ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3]) + int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ); - int4 ofs = f->ofs[0]; - sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; - ofs = f->ofs[1]; - sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y; - if( weight.z > 0 ) - { - ofs = f->ofs[2]; - sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z; - } + int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0 + idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1 + idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2 - s += (sval < st.y*nf) ? st.z : st.w; + mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5 + mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8 + mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7 + mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6 + mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7 + + s += (bitsets[idx] & (1 << mask)) ? st.z : st.w; } if( s < stages[stageIdx].threshold ) - break; + break; } if( stageIdx == nstages ) @@ -174,4 +177,4 @@ __kernel void runLBPClassifierStump( } } } -#endif +