converted old haar cascades to the new format; added the conversion function; added OpenCL optimization into CascadeClassifier; optimized the data structures and CPU code for the stump case.
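(Illustrative sketch, not part of the commit: one way the new conversion entry point and the stump-optimized detector might be driven from application code. The cascade and image file names below are placeholders.)

    #include <vector>
    #include "opencv2/objdetect.hpp"
    #include "opencv2/highgui.hpp"

    int main()
    {
        // One-time, offline: convert an old-format haar cascade to the new format.
        // "haarcascade_old.xml" / "haarcascade_new.xml" are placeholder file names.
        if( !cv::CascadeClassifier::convert("haarcascade_old.xml", "haarcascade_new.xml") )
            return 1;

        cv::CascadeClassifier cascade;
        if( !cascade.load("haarcascade_new.xml") )
            return 1;

        cv::Mat gray = cv::imread("image.png", 0); // placeholder input, loaded as 8-bit grayscale
        std::vector<cv::Rect> objects;
        // With a stump-based Haar cascade and OpenCL enabled, detectMultiScale may run
        // through the new runHaarClassifierStump kernel; otherwise it falls back to the
        // optimized CPU path.
        cascade.detectMultiScale(gray, objects, 1.1, 3);
        return 0;
    }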
modules/objdetect/include/opencv2/objdetect.hpp

@@ -111,12 +111,15 @@ public:
};

CV_EXPORTS void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps = 0.2);
CV_EXPORTS_W void groupRectangles(CV_IN_OUT std::vector<Rect>& rectList, CV_OUT std::vector<int>& weights, int groupThreshold, double eps = 0.2);
CV_EXPORTS void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps, std::vector<int>* weights, std::vector<double>* levelWeights );
CV_EXPORTS_W void groupRectangles(CV_IN_OUT std::vector<Rect>& rectList, CV_OUT std::vector<int>& weights,
                                  int groupThreshold, double eps = 0.2);
CV_EXPORTS void groupRectangles(std::vector<Rect>& rectList, int groupThreshold,
                                double eps, std::vector<int>* weights, std::vector<double>* levelWeights );
CV_EXPORTS void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& rejectLevels,
                                std::vector<double>& levelWeights, int groupThreshold, double eps = 0.2);
CV_EXPORTS void groupRectangles_meanshift(std::vector<Rect>& rectList, std::vector<double>& foundWeights, std::vector<double>& foundScales,
                                          double detectThreshold = 0.0, Size winDetSize = Size(64, 128));
CV_EXPORTS void groupRectangles_meanshift(std::vector<Rect>& rectList, std::vector<double>& foundWeights,
                                          std::vector<double>& foundScales,
                                          double detectThreshold = 0.0, Size winDetSize = Size(64, 128));

class CV_EXPORTS FeatureEvaluator
{
@@ -132,7 +135,7 @@ public:
    virtual Ptr<FeatureEvaluator> clone() const;
    virtual int getFeatureType() const;

    virtual bool setImage(const Mat& img, Size origWinSize);
    virtual bool setImage(InputArray img, Size origWinSize, Size sumSize);
    virtual bool setWindow(Point p);

    virtual double calcOrd(int featureIdx) const;
@@ -232,6 +235,8 @@ public:
    CV_WRAP int getFeatureType() const;
    void* getOldCascade();

    CV_WRAP static bool convert(const String& oldcascade, const String& newcascade);

    void setMaskGenerator(const Ptr<BaseCascadeClassifier::MaskGenerator>& maskGenerator);
    Ptr<BaseCascadeClassifier::MaskGenerator> getMaskGenerator();

modules/objdetect/src/cascadedetect.cpp

@@ -7,10 +7,10 @@
// copy or use the software.
//
//
//                          Intel License Agreement
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Copyright (C) 2008-2013, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@@ -23,13 +23,13 @@
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//   * The name of Itseez Inc. may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
@@ -44,6 +44,7 @@

#include "cascadedetect.hpp"
#include "opencv2/objdetect/objdetect_c.h"
#include "opencl_kernels.hpp"

#if defined (LOG_CASCADE_STATISTIC)
struct Logger
@@ -112,6 +113,13 @@ struct Logger

namespace cv
{

template<typename _Tp> void copyVectorToUMat(const std::vector<_Tp>& v, UMat& um)
{
    if(v.empty())
    {
        um.release();
        return; // nothing to upload; avoid taking &v[0] on an empty vector
    }
    Mat(1, (int)(v.size()*sizeof(v[0])), CV_8U, (void*)&v[0]).copyTo(um);
}

void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps, std::vector<int>* weights, std::vector<double>* levelWeights)
{
@@ -434,7 +442,7 @@ FeatureEvaluator::~FeatureEvaluator() {}
bool FeatureEvaluator::read(const FileNode&) {return true;}
Ptr<FeatureEvaluator> FeatureEvaluator::clone() const { return Ptr<FeatureEvaluator>(); }
int FeatureEvaluator::getFeatureType() const {return -1;}
bool FeatureEvaluator::setImage(const Mat&, Size) {return true;}
bool FeatureEvaluator::setImage(InputArray, Size, Size) {return true;}
bool FeatureEvaluator::setWindow(Point) { return true; }
double FeatureEvaluator::calcOrd(int) const { return 0.; }
int FeatureEvaluator::calcCat(int) const { return 0; }
@@ -466,7 +474,8 @@ bool HaarEvaluator::Feature :: read( const FileNode& node )

HaarEvaluator::HaarEvaluator()
{
    features = makePtr<std::vector<Feature> >();
    optfeaturesPtr = 0;
    pwin = 0;
}
HaarEvaluator::~HaarEvaluator()
{
@@ -474,16 +483,24 @@ HaarEvaluator::~HaarEvaluator()

bool HaarEvaluator::read(const FileNode& node)
{
    features->resize(node.size());
    featuresPtr = &(*features)[0];
    FileNodeIterator it = node.begin(), it_end = node.end();
    size_t i, n = node.size();
    CV_Assert(n > 0);
    if(features.empty())
        features = makePtr<std::vector<Feature> >();
    if(optfeatures.empty())
        optfeatures = makePtr<std::vector<OptFeature> >();
    features->resize(n);
    FileNodeIterator it = node.begin();
    hasTiltedFeatures = false;
    std::vector<Feature>& ff = *features;
    sumSize0 = Size();
    ufbuf.release();

    for(int i = 0; it != it_end; ++it, i++)
    for(i = 0; i < n; i++, ++it)
    {
        if(!featuresPtr[i].read(*it))
        if(!ff[i].read(*it))
            return false;
        if( featuresPtr[i].tilted )
        if( ff[i].tilted )
            hasTiltedFeatures = true;
    }
    return true;
@@ -494,59 +511,102 @@ Ptr<FeatureEvaluator> HaarEvaluator::clone() const
    Ptr<HaarEvaluator> ret = makePtr<HaarEvaluator>();
    ret->origWinSize = origWinSize;
    ret->features = features;
    ret->featuresPtr = &(*ret->features)[0];
    ret->optfeatures = optfeatures;
    ret->optfeaturesPtr = optfeatures->empty() ? 0 : &(*(ret->optfeatures))[0];
    ret->hasTiltedFeatures = hasTiltedFeatures;
    ret->sum0 = sum0, ret->sqsum0 = sqsum0, ret->tilted0 = tilted0;
    ret->sum = sum, ret->sqsum = sqsum, ret->tilted = tilted;
    ret->sum0 = sum0; ret->sqsum0 = sqsum0;
    ret->sum = sum; ret->sqsum = sqsum;
    ret->usum0 = usum0; ret->usqsum0 = usqsum0; ret->ufbuf = ufbuf;
    ret->normrect = normrect;
    memcpy( ret->p, p, 4*sizeof(p[0]) );
    memcpy( ret->pq, pq, 4*sizeof(pq[0]) );
    ret->offset = offset;
    memcpy( ret->nofs, nofs, 4*sizeof(nofs[0]) );
    ret->pwin = pwin;
    ret->varianceNormFactor = varianceNormFactor;
    return ret;
}

bool HaarEvaluator::setImage( const Mat &image, Size _origWinSize )
bool HaarEvaluator::setImage( InputArray _image, Size _origWinSize, Size _sumSize )
{
    int rn = image.rows+1, cn = image.cols+1;
    Size imgsz = _image.size();
    int cols = imgsz.width, rows = imgsz.height;

    if (imgsz.width < origWinSize.width || imgsz.height < origWinSize.height)
        return false;

    origWinSize = _origWinSize;
    normrect = Rect(1, 1, origWinSize.width-2, origWinSize.height-2);

    if (image.cols < origWinSize.width || image.rows < origWinSize.height)
        return false;

    if( sum0.rows < rn || sum0.cols < cn )

    int rn = _sumSize.height, cn = _sumSize.width, rn_scale = hasTiltedFeatures ? 2 : 1;
    int sumStep, tofs = 0;
    CV_Assert(rn >= rows+1 && cn >= cols+1);

    if( _image.isUMat() )
    {
        sum0.create(rn, cn, CV_32S);
        sqsum0.create(rn, cn, CV_64F);
        if (hasTiltedFeatures)
            tilted0.create( rn, cn, CV_32S);
    }
    sum = Mat(rn, cn, CV_32S, sum0.data);
    sqsum = Mat(rn, cn, CV_64F, sqsum0.data);

    if( hasTiltedFeatures )
    {
        tilted = Mat(rn, cn, CV_32S, tilted0.data);
        integral(image, sum, sqsum, tilted);
        usum0.create(rn*rn_scale, cn, CV_32S);
        usqsum0.create(rn, cn, CV_32S);
        usum = UMat(usum0, Rect(0, 0, cols+1, rows+1));
        usqsum = UMat(usqsum0, Rect(0, 0, cols, rows));

        if( hasTiltedFeatures )
        {
            UMat utilted(usum0, Rect(0, _sumSize.height, cols+1, rows+1));
            integral(_image, usum, noArray(), utilted, CV_32S);
            tofs = (int)((utilted.offset - usum.offset)/sizeof(int));
        }
        else
        {
            integral(_image, usum, noArray(), noArray(), CV_32S);
        }

        sqrBoxFilter(_image, usqsum, CV_32S,
                     Size(normrect.width, normrect.height),
                     Point(0, 0), false);
        /*sqrBoxFilter(_image.getMat(), sqsum, CV_32S,
                     Size(normrect.width, normrect.height),
                     Point(0, 0), false);
        sqsum.copyTo(usqsum);*/
        sumStep = (int)(usum.step/usum.elemSize());
    }
    else
        integral(image, sum, sqsum);
    const int* sdata = (const int*)sum.data;
    const double* sqdata = (const double*)sqsum.data;
    size_t sumStep = sum.step/sizeof(sdata[0]);
    size_t sqsumStep = sqsum.step/sizeof(sqdata[0]);
    {
        sum0.create(rn*rn_scale, cn, CV_32S);
        sqsum0.create(rn, cn, CV_32S);
        sum = sum0(Rect(0, 0, cols+1, rows+1));
        sqsum = sqsum0(Rect(0, 0, cols, rows));

        if( hasTiltedFeatures )
        {
            Mat tilted = sum0(Rect(0, _sumSize.height, cols+1, rows+1));
            integral(_image, sum, noArray(), tilted, CV_32S);
            tofs = (int)((tilted.data - sum.data)/sizeof(int));
        }
        else
            integral(_image, sum, noArray(), noArray(), CV_32S);
        sqrBoxFilter(_image, sqsum, CV_32S,
                     Size(normrect.width, normrect.height),
                     Point(0, 0), false);
        sumStep = (int)(sum.step/sum.elemSize());
    }

    CV_SUM_PTRS( p[0], p[1], p[2], p[3], sdata, normrect, sumStep );
    CV_SUM_PTRS( pq[0], pq[1], pq[2], pq[3], sqdata, normrect, sqsumStep );
    CV_SUM_OFS( nofs[0], nofs[1], nofs[2], nofs[3], 0, normrect, sumStep );

    size_t fi, nfeatures = features->size();

    for( fi = 0; fi < nfeatures; fi++ )
        featuresPtr[fi].updatePtrs( !featuresPtr[fi].tilted ? sum : tilted );
    const std::vector<Feature>& ff = *features;

    if( sumSize0 != _sumSize )
    {
        optfeatures->resize(nfeatures);
        optfeaturesPtr = &(*optfeatures)[0];
        for( fi = 0; fi < nfeatures; fi++ )
            optfeaturesPtr[fi].setOffsets( ff[fi], sumStep, tofs );
    }
    if( _image.isUMat() && (sumSize0 != _sumSize || ufbuf.empty()) )
        copyVectorToUMat(*optfeatures, ufbuf);
    sumSize0 = _sumSize;

    return true;
}

bool HaarEvaluator::setWindow( Point pt )
{
    if( pt.x < 0 || pt.y < 0 ||
@@ -554,10 +614,9 @@ bool HaarEvaluator::setWindow( Point pt )
        pt.y + origWinSize.height >= sum.rows )
        return false;

    size_t pOffset = pt.y * (sum.step/sizeof(int)) + pt.x;
    size_t pqOffset = pt.y * (sqsum.step/sizeof(double)) + pt.x;
    int valsum = CALC_SUM(p, pOffset);
    double valsqsum = CALC_SUM(pq, pqOffset);
    const int* p = &sum.at<int>(pt);
    int valsum = CALC_SUM_OFS(nofs, p);
    double valsqsum = sqsum.at<int>(pt.y + normrect.y, pt.x + normrect.x);

    double nf = (double)normrect.area() * valsqsum - (double)valsum * valsum;
    if( nf > 0. )
@@ -565,10 +624,23 @@ bool HaarEvaluator::setWindow( Point pt )
    else
        nf = 1.;
    varianceNormFactor = 1./nf;
    offset = (int)pOffset;
    pwin = p;

    return true;
}

Rect HaarEvaluator::getNormRect() const
{
    return normrect;
}

void HaarEvaluator::getUMats(std::vector<UMat>& bufs)
{
    bufs.clear();
    bufs.push_back(usum);
    bufs.push_back(usqsum);
    bufs.push_back(ufbuf);
}

//----------------------------------------------  LBPEvaluator -------------------------------------
bool LBPEvaluator::Feature :: read(const FileNode& node )
@@ -612,8 +684,9 @@ Ptr<FeatureEvaluator> LBPEvaluator::clone() const
    return ret;
}

bool LBPEvaluator::setImage( const Mat& image, Size _origWinSize )
bool LBPEvaluator::setImage( InputArray _image, Size _origWinSize, Size )
{
    Mat image = _image.getMat();
    int rn = image.rows+1, cn = image.cols+1;
    origWinSize = _origWinSize;

@@ -693,8 +766,9 @@ Ptr<FeatureEvaluator> HOGEvaluator::clone() const
    return ret;
}

bool HOGEvaluator::setImage( const Mat& image, Size winSize )
bool HOGEvaluator::setImage( InputArray _image, Size winSize, Size )
{
    Mat image = _image.getMat();
    int rows = image.rows + 1;
    int cols = image.cols + 1;
    origWinSize = winSize;
@@ -880,7 +954,7 @@ int CascadeClassifierImpl::runAt( Ptr<FeatureEvaluator>& evaluator, Point pt, do

    if( !evaluator->setWindow(pt) )
        return -1;
    if( data.isStumpBased )
    if( data.isStumpBased() )
    {
        if( data.featureType == FeatureEvaluator::HAAR )
            return predictOrderedStump<HaarEvaluator>( *this, evaluator, weight );
@@ -904,11 +978,6 @@ int CascadeClassifierImpl::runAt( Ptr<FeatureEvaluator>& evaluator, Point pt, do
    }
}

bool CascadeClassifierImpl::setImage( Ptr<FeatureEvaluator>& evaluator, const Mat& image )
{
    return empty() ? false : evaluator->setImage(image, data.origWinSize);
}

void CascadeClassifierImpl::setMaskGenerator(const Ptr<MaskGenerator>& _maskGenerator)
{
    maskGenerator=_maskGenerator;
@@ -1010,11 +1079,12 @@ struct getRect { Rect operator ()(const CvAvgComp& e) const { return e.rect; } }
struct getNeighbors { int operator ()(const CvAvgComp& e) const { return e.neighbors; } };


bool CascadeClassifierImpl::detectSingleScale( const Mat& image, int stripCount, Size processingRectSize,
                                               int stripSize, int yStep, double factor, std::vector<Rect>& candidates,
                                               std::vector<int>& levels, std::vector<double>& weights, bool outputRejectLevels )
bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processingRectSize,
                                               int yStep, double factor, std::vector<Rect>& candidates,
                                               std::vector<int>& levels, std::vector<double>& weights,
                                               Size sumSize0, bool outputRejectLevels )
{
    if( !featureEvaluator->setImage( image, data.origWinSize ) )
    if( !featureEvaluator->setImage(_image, data.origWinSize, sumSize0) )
        return false;

#if defined (LOG_CASCADE_STATISTIC)
@@ -1023,13 +1093,21 @@ bool CascadeClassifierImpl::detectSingleScale( const Mat& image, int stripCount,

    Mat currentMask;
    if (maskGenerator) {
        Mat image = _image.getMat();
        currentMask=maskGenerator->generateMask(image);
    }

    std::vector<Rect> candidatesVector;
    std::vector<int> rejectLevels;
    std::vector<double> levelWeights;
    Mutex mtx;

    int stripCount, stripSize;

    const int PTS_PER_THREAD = 1000;
    stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
    stripCount = std::min(std::max(stripCount, 1), 100);
    stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;

    if( outputRejectLevels )
    {
        parallel_for_(Range(0, stripCount), CascadeClassifierInvoker( *this, processingRectSize, stripSize, yStep, factor,
@@ -1051,12 +1129,63 @@ bool CascadeClassifierImpl::detectSingleScale( const Mat& image, int stripCount,
    return true;
}

bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size processingRectSize,
                                                   int yStep, double factor, Size sumSize0 )
{
    const int VECTOR_SIZE = 1;
    Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
    if( haar.empty() )
        return false;

    haar->setImage(_image, data.origWinSize, sumSize0);

    if( cascadeKernel.empty() )
    {
        cascadeKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc,
                             format("-D VECTOR_SIZE=%d", VECTOR_SIZE));
        if( cascadeKernel.empty() )
            return false;
    }

    if( ustages.empty() )
    {
        copyVectorToUMat(data.stages, ustages);
        copyVectorToUMat(data.stumps, ustumps);
    }

    std::vector<UMat> bufs;
    haar->getUMats(bufs);
    CV_Assert(bufs.size() == 3);

    Rect normrect = haar->getNormRect();

    //processingRectSize = Size(yStep, yStep);
    size_t globalsize[] = { (processingRectSize.width/yStep + VECTOR_SIZE-1)/VECTOR_SIZE, processingRectSize.height/yStep };

    cascadeKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
                       ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum
                       ocl::KernelArg::PtrReadOnly(bufs[2]),    // optfeatures

                       // cascade classifier
                       (int)data.stages.size(),
                       ocl::KernelArg::PtrReadOnly(ustages),
                       ocl::KernelArg::PtrReadOnly(ustumps),

                       ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
                       processingRectSize,
                       yStep, (float)factor,
                       normrect, data.origWinSize, MAX_FACES);
    bool ok = cascadeKernel.run(2, globalsize, 0, true);
    //CV_Assert(ok);
    return ok;
}

bool CascadeClassifierImpl::isOldFormatCascade() const
{
    return !oldCascade.empty();
}


int CascadeClassifierImpl::getFeatureType() const
{
    return featureEvaluator->getFeatureType();
@@ -1067,12 +1196,6 @@ Size CascadeClassifierImpl::getOriginalWindowSize() const
    return data.origWinSize;
}

bool CascadeClassifierImpl::setImage(InputArray _image)
{
    Mat image = _image.getMat();
    return featureEvaluator->setImage(image, data.origWinSize);
}

void* CascadeClassifierImpl::getOldCascade()
{
    return oldCascade;
@@ -1096,36 +1219,75 @@ static void detectMultiScaleOldFormat( const Mat& image, Ptr<CvHaarClassifierCas
    std::transform(vecAvgComp.begin(), vecAvgComp.end(), objects.begin(), getRect());
}

void CascadeClassifierImpl::detectMultiScaleNoGrouping( const Mat& image, std::vector<Rect>& candidates,

void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::vector<Rect>& candidates,
                                                        std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
                                                        double scaleFactor, Size minObjectSize, Size maxObjectSize,
                                                        bool outputRejectLevels )
{
    Size imgsz = _image.size();
    int imgtype = _image.type();

    Mat grayImage, imageBuffer;

    candidates.clear();

    if (maskGenerator)
        maskGenerator->initializeMask(image);

    rejectLevels.clear();
    levelWeights.clear();

    if( maxObjectSize.height == 0 || maxObjectSize.width == 0 )
        maxObjectSize = image.size();

    Mat grayImage = image;
    if( grayImage.channels() > 1 )
        maxObjectSize = imgsz;

    bool use_ocl = ocl::useOpenCL() &&
        getFeatureType() == FeatureEvaluator::HAAR &&
        !isOldFormatCascade() &&
        data.isStumpBased() &&
        maskGenerator.empty() &&
        !outputRejectLevels &&
        tryOpenCL;

    if( !use_ocl )
    {
        Mat temp;
        cvtColor(grayImage, temp, COLOR_BGR2GRAY);
        grayImage = temp;
    }
        Mat image = _image.getMat();
        if (maskGenerator)
            maskGenerator->initializeMask(image);

        Mat imageBuffer(image.rows + 1, image.cols + 1, CV_8U);
        grayImage = image;
        if( CV_MAT_CN(imgtype) > 1 )
        {
            Mat temp;
            cvtColor(grayImage, temp, COLOR_BGR2GRAY);
            grayImage = temp;
        }

        imageBuffer.create(imgsz.height + 1, imgsz.width + 1, CV_8U);
    }
    else
    {
        UMat uimage = _image.getUMat();
        if( CV_MAT_CN(imgtype) > 1 )
            cvtColor(uimage, ugrayImage, COLOR_BGR2GRAY);
        else
            uimage.copyTo(ugrayImage);
        uimageBuffer.create(imgsz.height + 1, imgsz.width + 1, CV_8U);
    }

    Size sumSize0((imgsz.width + SUM_ALIGN) & -SUM_ALIGN, imgsz.height+1);

    if( use_ocl )
    {
        ufacepos.create(1, MAX_FACES*4 + 1, CV_32S);
        UMat ufacecount(ufacepos, Rect(0,0,1,1));
        ufacecount.setTo(Scalar::all(0));
    }

    for( double factor = 1; ; factor *= scaleFactor )
    {
        Size originalWindowSize = getOriginalWindowSize();

        Size windowSize( cvRound(originalWindowSize.width*factor), cvRound(originalWindowSize.height*factor) );
        Size scaledImageSize( cvRound( grayImage.cols/factor ), cvRound( grayImage.rows/factor ) );
        Size processingRectSize( scaledImageSize.width - originalWindowSize.width, scaledImageSize.height - originalWindowSize.height );
        Size scaledImageSize( cvRound( imgsz.width/factor ), cvRound( imgsz.height/factor ) );
        Size processingRectSize( scaledImageSize.width - originalWindowSize.width,
                                 scaledImageSize.height - originalWindowSize.height );

        if( processingRectSize.width <= 0 || processingRectSize.height <= 0 )
            break;
@@ -1133,10 +1295,7 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( const Mat& image, std::v
            break;
        if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
            continue;

        Mat scaledImage( scaledImageSize, CV_8U, imageBuffer.data );
        resize( grayImage, scaledImage, scaledImageSize, 0, 0, INTER_LINEAR );


        int yStep;
        if( getFeatureType() == cv::FeatureEvaluator::HOG )
        {
@@ -1147,16 +1306,46 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( const Mat& image, std::v
            yStep = factor > 2. ? 1 : 2;
        }

        int stripCount, stripSize;

        const int PTS_PER_THREAD = 1000;
        stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
        stripCount = std::min(std::max(stripCount, 1), 100);
        stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;

        if( !detectSingleScale( scaledImage, stripCount, processingRectSize, stripSize, yStep, factor, candidates,
                                rejectLevels, levelWeights, outputRejectLevels ) )
            break;
        if( use_ocl )
        {
            UMat uscaledImage(uimageBuffer, Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
            resize( ugrayImage, uscaledImage, scaledImageSize, 0, 0, INTER_LINEAR );

            if( ocl_detectSingleScale( uscaledImage, processingRectSize, yStep, factor, sumSize0 ) )
                continue;

            /////// if the OpenCL branch has been executed but failed, fall back to CPU: /////

            tryOpenCL = false; // for this cascade do not try OpenCL anymore

            // since we may already have some partial results from OpenCL code (unlikely, but still),
            // we just recursively call the function again, but with tryOpenCL==false it will
            // go with CPU route, so there is no infinite recursion
            detectMultiScaleNoGrouping( _image, candidates, rejectLevels, levelWeights,
                                        scaleFactor, minObjectSize, maxObjectSize,
                                        outputRejectLevels);
            return;
        }
        else
        {
            Mat scaledImage( scaledImageSize, CV_8U, imageBuffer.data );
            resize( grayImage, scaledImage, scaledImageSize, 0, 0, INTER_LINEAR );

            if( !detectSingleScale( scaledImage, processingRectSize, yStep, factor, candidates,
                                    rejectLevels, levelWeights, sumSize0, outputRejectLevels ) )
                break;
        }
    }

    if( use_ocl && tryOpenCL )
    {
        Mat facepos = ufacepos.getMat(ACCESS_READ);
        const int* fptr = facepos.ptr<int>();
        int i, nfaces = fptr[0];
        for( i = 0; i < nfaces; i++ )
        {
            candidates.push_back(Rect(fptr[i*4+1], fptr[i*4+2], fptr[i*4+3], fptr[i*4+4]));
        }
    }
}

@@ -1167,21 +1356,21 @@ void CascadeClassifierImpl::detectMultiScale( InputArray _image, std::vector<Rec
                                              int flags, Size minObjectSize, Size maxObjectSize,
                                              bool outputRejectLevels )
{
    Mat image = _image.getMat();
    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
    CV_Assert( scaleFactor > 1 && _image.depth() == CV_8U );

    if( empty() )
        return;

    if( isOldFormatCascade() )
    {
        Mat image = _image.getMat();
        std::vector<CvAvgComp> fakeVecAvgComp;
        detectMultiScaleOldFormat( image, oldCascade, objects, rejectLevels, levelWeights, fakeVecAvgComp, scaleFactor,
                                   minNeighbors, flags, minObjectSize, maxObjectSize, outputRejectLevels );
    }
    else
    {
        detectMultiScaleNoGrouping( image, objects, rejectLevels, levelWeights, scaleFactor, minObjectSize, maxObjectSize,
        detectMultiScaleNoGrouping( _image, objects, rejectLevels, levelWeights, scaleFactor, minObjectSize, maxObjectSize,
                                    outputRejectLevels );
        const double GROUP_EPS = 0.2;
        if( outputRejectLevels )
@@ -1235,6 +1424,12 @@ void CascadeClassifierImpl::detectMultiScale( InputArray _image, std::vector<Rec
    }
}


CascadeClassifierImpl::Data::Data()
{
    stageType = featureType = ncategories = maxNodesPerTree = 0;
}

bool CascadeClassifierImpl::Data::read(const FileNode &root)
{
    static const float THRESHOLD_EPS = 1e-5f;
@@ -1261,8 +1456,6 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
    origWinSize.height = (int)root[CC_HEIGHT];
    CV_Assert( origWinSize.height > 0 && origWinSize.width > 0 );

    isStumpBased = (int)(root[CC_STAGE_PARAMS][CC_MAX_DEPTH]) == 1 ? true : false;

    // load feature params
    FileNode fn = root[CC_FEATURE_PARAMS];
    if( fn.empty() )
@@ -1280,8 +1473,10 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
    stages.reserve(fn.size());
    classifiers.clear();
    nodes.clear();
    stumps.clear();

    FileNodeIterator it = fn.begin(), it_end = fn.end();
    maxNodesPerTree = 0;

    for( int si = 0; it != it_end; si++, ++it )
    {
@@ -1307,6 +1502,8 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)

        DTree tree;
        tree.nodeCount = (int)internalNodes.size()/nodeStep;
        maxNodesPerTree = std::max(maxNodesPerTree, tree.nodeCount);

        classifiers.push_back(tree);

        nodes.reserve(nodes.size() + tree.nodeCount);
@@ -1341,12 +1538,35 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
            leaves.push_back((float)*internalNodesIter);
        }
    }

    if( isStumpBased() )
    {
        int nodeOfs = 0, leafOfs = 0;
        size_t nstages = stages.size();
        for( size_t stageIdx = 0; stageIdx < nstages; stageIdx++ )
        {
            const Stage& stage = stages[stageIdx];

            int ntrees = stage.ntrees;
            for( int i = 0; i < ntrees; i++, nodeOfs++, leafOfs+= 2 )
            {
                const DTreeNode& node = nodes[nodeOfs];
                stumps.push_back(Stump(node.featureIdx, node.threshold,
                                       leaves[leafOfs], leaves[leafOfs+1]));
            }
        }
    }

    return true;
}


bool CascadeClassifierImpl::read_(const FileNode& root)
{
    tryOpenCL = true;
    cascadeKernel = ocl::Kernel();
    ustages.release();
    ustumps.release();
    if( !data.read(root) )
        return false;

modules/objdetect/src/cascadedetect.hpp

@@ -42,24 +42,29 @@ public:
    bool isOldFormatCascade() const;
    Size getOriginalWindowSize() const;
    int getFeatureType() const;
    bool setImage( InputArray );
    void* getOldCascade();

    void setMaskGenerator(const Ptr<MaskGenerator>& maskGenerator);
    Ptr<MaskGenerator> getMaskGenerator();

protected:
    bool detectSingleScale( const Mat& image, int stripCount, Size processingRectSize,
                            int stripSize, int yStep, double factor, std::vector<Rect>& candidates,
                            std::vector<int>& rejectLevels, std::vector<double>& levelWeights, bool outputRejectLevels = false );
    enum { SUM_ALIGN = 64 };

    bool detectSingleScale( InputArray image, Size processingRectSize,
                            int yStep, double factor, std::vector<Rect>& candidates,
                            std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
                            Size sumSize0, bool outputRejectLevels = false );
    bool ocl_detectSingleScale( InputArray image, Size processingRectSize,
                                int yStep, double factor, Size sumSize0 );


    void detectMultiScaleNoGrouping( const Mat& image, std::vector<Rect>& candidates,
    void detectMultiScaleNoGrouping( InputArray image, std::vector<Rect>& candidates,
                                     std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
                                     double scaleFactor, Size minObjectSize, Size maxObjectSize,
                                     bool outputRejectLevels = false );

    enum { BOOST = 0
    };
    enum { MAX_FACES = 10000 };
    enum { BOOST = 0 };
    enum { DO_CANNY_PRUNING = CASCADE_DO_CANNY_PRUNING,
           SCALE_IMAGE = CASCADE_SCALE_IMAGE,
           FIND_BIGGEST_OBJECT = CASCADE_FIND_BIGGEST_OBJECT,
@@ -80,7 +85,6 @@ protected:
    template<class FEval>
    friend int predictCategoricalStump( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);

    bool setImage( Ptr<FeatureEvaluator>& feval, const Mat& image);
    int runAt( Ptr<FeatureEvaluator>& feval, Point pt, double& weight );

    class Data
@@ -105,14 +109,29 @@ protected:
        int ntrees;
        float threshold;
    };

    struct Stump
    {
        Stump() {};
        Stump(int _featureIdx, float _threshold, float _left, float _right)
            : featureIdx(_featureIdx), threshold(_threshold), left(_left), right(_right) {}

        int featureIdx;
        float threshold;
        float left;
        float right;
    };

    Data();

    bool read(const FileNode &node);

    bool isStumpBased;
    bool isStumpBased() const { return maxNodesPerTree == 1; }

    int stageType;
    int featureType;
    int ncategories;
    int maxNodesPerTree;
    Size origWinSize;

    std::vector<Stage> stages;
@@ -120,6 +139,7 @@ protected:
    std::vector<DTreeNode> nodes;
    std::vector<float> leaves;
    std::vector<int> subsets;
    std::vector<Stump> stumps;
};

Data data;
@@ -127,6 +147,12 @@ protected:
Ptr<CvHaarClassifierCascade> oldCascade;

Ptr<MaskGenerator> maskGenerator;
UMat ugrayImage, uimageBuffer;
UMat ufacepos, ustages, ustumps, usubsets;
ocl::Kernel cascadeKernel;
bool tryOpenCL;

Mutex mtx;
};

#define CC_CASCADE_PARAMS "cascadeParams"
@@ -186,6 +212,36 @@ protected:

#define CALC_SUM(rect,offset) CALC_SUM_((rect)[0], (rect)[1], (rect)[2], (rect)[3], offset)

#define CV_SUM_OFS( p0, p1, p2, p3, sum, rect, step )                 \
    /* (x, y) */                                                      \
    (p0) = sum + (rect).x + (step) * (rect).y,                        \
    /* (x + w, y) */                                                  \
    (p1) = sum + (rect).x + (rect).width + (step) * (rect).y,         \
    /* (x, y + h) */                                                  \
    (p2) = sum + (rect).x + (step) * ((rect).y + (rect).height),      \
    /* (x + w, y + h) */                                              \
    (p3) = sum + (rect).x + (rect).width + (step) * ((rect).y + (rect).height)

#define CV_TILTED_OFS( p0, p1, p2, p3, tilted, rect, step )                         \
    /* (x, y) */                                                                    \
    (p0) = tilted + (rect).x + (step) * (rect).y,                                   \
    /* (x - h, y + h) */                                                            \
    (p1) = tilted + (rect).x - (rect).height + (step) * ((rect).y + (rect).height), \
    /* (x + w, y + w) */                                                            \
    (p2) = tilted + (rect).x + (rect).width + (step) * ((rect).y + (rect).width),   \
    /* (x + w - h, y + w + h) */                                                    \
    (p3) = tilted + (rect).x + (rect).width - (rect).height                         \
           + (step) * ((rect).y + (rect).width + (rect).height)

#define CALC_SUM_(p0, p1, p2, p3, offset) \
    ((p0)[offset] - (p1)[offset] - (p2)[offset] + (p3)[offset])

#define CALC_SUM(rect,offset) CALC_SUM_((rect)[0], (rect)[1], (rect)[2], (rect)[3], offset)

#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])

#define CALC_SUM_OFS(rect, ptr) CALC_SUM_OFS_((rect)[0], (rect)[1], (rect)[2], (rect)[3], ptr)

//----------------------------------------------  HaarEvaluator ---------------------------------------
class HaarEvaluator : public FeatureEvaluator
@@ -195,8 +251,6 @@ public:
    {
        Feature();

        float calc( int offset ) const;
        void updatePtrs( const Mat& sum );
        bool read( const FileNode& node );

        bool tilted;
@@ -208,8 +262,19 @@ public:
            Rect r;
            float weight;
        } rect[RECT_NUM];
    };

    const int* p[RECT_NUM][4];
    struct OptFeature
    {
        OptFeature();

        enum { RECT_NUM = Feature::RECT_NUM };
        float calc( const int* pwin ) const;

        void setOffsets( const Feature& _f, int step, int tofs );

        int ofs[RECT_NUM][4];
        float weight[4];
    };

    HaarEvaluator();
@@ -219,28 +284,30 @@ public:
    virtual Ptr<FeatureEvaluator> clone() const;
    virtual int getFeatureType() const { return FeatureEvaluator::HAAR; }

    virtual bool setImage(const Mat&, Size origWinSize);
    virtual bool setImage(InputArray, Size origWinSize, Size sumSize);
    virtual bool setWindow(Point pt);
    virtual Rect getNormRect() const;
    virtual void getUMats(std::vector<UMat>& bufs);

    double operator()(int featureIdx) const
    { return featuresPtr[featureIdx].calc(offset) * varianceNormFactor; }
    { return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; }
    virtual double calcOrd(int featureIdx) const
    { return (*this)(featureIdx); }

protected:
    Size origWinSize;
    Size origWinSize, sumSize0;
    Ptr<std::vector<Feature> > features;
    Feature* featuresPtr; // optimization
    Ptr<std::vector<OptFeature> > optfeatures;
    OptFeature* optfeaturesPtr; // optimization
    bool hasTiltedFeatures;

    Mat sum0, sqsum0, tilted0;
    Mat sum, sqsum, tilted;
    Mat sum0, sum, sqsum0, sqsum;
    UMat usum0, usum, usqsum0, usqsum, ufbuf;

    Rect normrect;
    const int *p[4];
    const double *pq[4];
    int nofs[4];

    int offset;
    const int* pwin;
    double varianceNormFactor;
};

@@ -249,38 +316,46 @@ inline HaarEvaluator::Feature :: Feature()
    tilted = false;
    rect[0].r = rect[1].r = rect[2].r = Rect();
    rect[0].weight = rect[1].weight = rect[2].weight = 0;
    p[0][0] = p[0][1] = p[0][2] = p[0][3] =
        p[1][0] = p[1][1] = p[1][2] = p[1][3] =
        p[2][0] = p[2][1] = p[2][2] = p[2][3] = 0;
}

inline float HaarEvaluator::Feature :: calc( int _offset ) const
inline HaarEvaluator::OptFeature :: OptFeature()
{
    float ret = rect[0].weight * CALC_SUM(p[0], _offset) + rect[1].weight * CALC_SUM(p[1], _offset);
    weight[0] = weight[1] = weight[2] = 0.f;

    if( rect[2].weight != 0.0f )
        ret += rect[2].weight * CALC_SUM(p[2], _offset);
    ofs[0][0] = ofs[0][1] = ofs[0][2] = ofs[0][3] =
        ofs[1][0] = ofs[1][1] = ofs[1][2] = ofs[1][3] =
        ofs[2][0] = ofs[2][1] = ofs[2][2] = ofs[2][3] = 0;
}

inline float HaarEvaluator::OptFeature :: calc( const int* ptr ) const
{
    float ret = weight[0] * CALC_SUM_OFS(ofs[0], ptr) +
                weight[1] * CALC_SUM_OFS(ofs[1], ptr);

    if( weight[2] != 0.0f )
        ret += weight[2] * CALC_SUM_OFS(ofs[2], ptr);

    return ret;
}

inline void HaarEvaluator::Feature :: updatePtrs( const Mat& _sum )
inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step, int tofs )
{
    const int* ptr = (const int*)_sum.data;
    size_t step = _sum.step/sizeof(ptr[0]);
    if (tilted)
    weight[0] = _f.rect[0].weight;
    weight[1] = _f.rect[1].weight;
    weight[2] = _f.rect[2].weight;

    Rect r2 = weight[2] > 0 ? _f.rect[2].r : Rect(0,0,0,0);
    if (_f.tilted)
    {
        CV_TILTED_PTRS( p[0][0], p[0][1], p[0][2], p[0][3], ptr, rect[0].r, step );
        CV_TILTED_PTRS( p[1][0], p[1][1], p[1][2], p[1][3], ptr, rect[1].r, step );
        if (rect[2].weight)
            CV_TILTED_PTRS( p[2][0], p[2][1], p[2][2], p[2][3], ptr, rect[2].r, step );
        CV_TILTED_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], tofs, _f.rect[0].r, step );
        CV_TILTED_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], tofs, _f.rect[1].r, step );
        CV_TILTED_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], tofs, r2, step );
    }
    else
    {
        CV_SUM_PTRS( p[0][0], p[0][1], p[0][2], p[0][3], ptr, rect[0].r, step );
        CV_SUM_PTRS( p[1][0], p[1][1], p[1][2], p[1][3], ptr, rect[1].r, step );
        if (rect[2].weight)
            CV_SUM_PTRS( p[2][0], p[2][1], p[2][2], p[2][3], ptr, rect[2].r, step );
        CV_SUM_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], 0, _f.rect[0].r, step );
        CV_SUM_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], 0, _f.rect[1].r, step );
        CV_SUM_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], 0, r2, step );
    }
}

@@ -311,7 +386,7 @@ public:
    virtual Ptr<FeatureEvaluator> clone() const;
    virtual int getFeatureType() const { return FeatureEvaluator::LBP; }

    virtual bool setImage(const Mat& image, Size _origWinSize);
    virtual bool setImage(InputArray image, Size _origWinSize, Size);
    virtual bool setWindow(Point pt);

    int operator()(int featureIdx) const
@@ -388,7 +463,7 @@ public:
    virtual bool read( const FileNode& node );
    virtual Ptr<FeatureEvaluator> clone() const;
    virtual int getFeatureType() const { return FeatureEvaluator::HOG; }
    virtual bool setImage( const Mat& image, Size winSize );
    virtual bool setImage( InputArray image, Size winSize, Size );
    virtual bool setWindow( Point pt );
    double operator()(int featureIdx) const
    {
@@ -533,30 +608,36 @@ template<class FEval>
inline int predictOrderedStump( CascadeClassifierImpl& cascade,
                                Ptr<FeatureEvaluator> &_featureEvaluator, double& sum )
{
    int nodeOfs = 0, leafOfs = 0;
    CV_Assert(!cascade.data.stumps.empty());
    FEval& featureEvaluator = (FEval&)*_featureEvaluator;
    float* cascadeLeaves = &cascade.data.leaves[0];
    CascadeClassifierImpl::Data::DTreeNode* cascadeNodes = &cascade.data.nodes[0];
    CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];
    const CascadeClassifierImpl::Data::Stump* cascadeStumps = &cascade.data.stumps[0];
    const CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];

    int nstages = (int)cascade.data.stages.size();
    double tmp = 0;

    for( int stageIdx = 0; stageIdx < nstages; stageIdx++ )
    {
        CascadeClassifierImpl::Data::Stage& stage = cascadeStages[stageIdx];
        sum = 0.0;
        const CascadeClassifierImpl::Data::Stage& stage = cascadeStages[stageIdx];
        tmp = 0;

        int ntrees = stage.ntrees;
        for( int i = 0; i < ntrees; i++, nodeOfs++, leafOfs+= 2 )
        for( int i = 0; i < ntrees; i++ )
        {
            CascadeClassifierImpl::Data::DTreeNode& node = cascadeNodes[nodeOfs];
            double value = featureEvaluator(node.featureIdx);
            sum += cascadeLeaves[ value < node.threshold ? leafOfs : leafOfs + 1 ];
            const CascadeClassifierImpl::Data::Stump& stump = cascadeStumps[i];
            double value = featureEvaluator(stump.featureIdx);
            tmp += value < stump.threshold ? stump.left : stump.right;
        }

        if( sum < stage.threshold )
        if( tmp < stage.threshold )
        {
            sum = (double)tmp;
            return -stageIdx;
        }
        cascadeStumps += ntrees;
    }

    sum = (double)tmp;
    return 1;
}

@@ -564,56 +645,44 @@ template<class FEval>
inline int predictCategoricalStump( CascadeClassifierImpl& cascade,
                                    Ptr<FeatureEvaluator> &_featureEvaluator, double& sum )
{
    CV_Assert(!cascade.data.stumps.empty());
    int nstages = (int)cascade.data.stages.size();
    int nodeOfs = 0, leafOfs = 0;
    FEval& featureEvaluator = (FEval&)*_featureEvaluator;
    size_t subsetSize = (cascade.data.ncategories + 31)/32;
    int* cascadeSubsets = &cascade.data.subsets[0];
    float* cascadeLeaves = &cascade.data.leaves[0];
    CascadeClassifierImpl::Data::DTreeNode* cascadeNodes = &cascade.data.nodes[0];
    CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];
    const int* cascadeSubsets = &cascade.data.subsets[0];
    const CascadeClassifierImpl::Data::Stump* cascadeStumps = &cascade.data.stumps[0];
    const CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];

#ifdef HAVE_TEGRA_OPTIMIZATION
    float tmp = 0; // float accumulator -- float operations are quicker
#else
    double tmp = 0;
#endif
    for( int si = 0; si < nstages; si++ )
    {
        CascadeClassifierImpl::Data::Stage& stage = cascadeStages[si];
        const CascadeClassifierImpl::Data::Stage& stage = cascadeStages[si];
        int wi, ntrees = stage.ntrees;
#ifdef HAVE_TEGRA_OPTIMIZATION
        tmp = 0;
#else
        sum = 0;
#endif

        for( wi = 0; wi < ntrees; wi++ )
        {
            CascadeClassifierImpl::Data::DTreeNode& node = cascadeNodes[nodeOfs];
            int c = featureEvaluator(node.featureIdx);
            const int* subset = &cascadeSubsets[nodeOfs*subsetSize];
#ifdef HAVE_TEGRA_OPTIMIZATION
            tmp += cascadeLeaves[ subset[c>>5] & (1 << (c & 31)) ? leafOfs : leafOfs+1];
#else
            sum += cascadeLeaves[ subset[c>>5] & (1 << (c & 31)) ? leafOfs : leafOfs+1];
#endif
            nodeOfs++;
            leafOfs += 2;
            const CascadeClassifierImpl::Data::Stump& stump = cascadeStumps[wi];
            int c = featureEvaluator(stump.featureIdx);
            const int* subset = &cascadeSubsets[wi*subsetSize];
            tmp += (subset[c>>5] & (1 << (c & 31))) ? stump.left : stump.right;
        }
#ifdef HAVE_TEGRA_OPTIMIZATION
        if( tmp < stage.threshold ) {

        if( tmp < stage.threshold )
        {
            sum = (double)tmp;
            return -si;
        }
#else
        if( sum < stage.threshold )
            return -si;
#endif

        cascadeStumps += ntrees;
        cascadeSubsets += ntrees*subsetSize;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    sum = (double)tmp;
#endif

    return 1;
}
}

modules/objdetect/src/cascadedetect_convert.cpp (new file, 276 lines)
@@ -0,0 +1,276 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, Itseez Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* Haar features calculation */

#include "precomp.hpp"
#include <stdio.h>

namespace cv
{

/* field names */

#define ICV_HAAR_SIZE_NAME            "size"
#define ICV_HAAR_STAGES_NAME          "stages"
#define ICV_HAAR_TREES_NAME           "trees"
#define ICV_HAAR_FEATURE_NAME         "feature"
#define ICV_HAAR_RECTS_NAME           "rects"
#define ICV_HAAR_TILTED_NAME          "tilted"
#define ICV_HAAR_THRESHOLD_NAME       "threshold"
#define ICV_HAAR_LEFT_NODE_NAME       "left_node"
#define ICV_HAAR_LEFT_VAL_NAME        "left_val"
#define ICV_HAAR_RIGHT_NODE_NAME      "right_node"
#define ICV_HAAR_RIGHT_VAL_NAME       "right_val"
#define ICV_HAAR_STAGE_THRESHOLD_NAME "stage_threshold"
#define ICV_HAAR_PARENT_NAME          "parent"
#define ICV_HAAR_NEXT_NAME            "next"

namespace haar_cvt
{

struct HaarFeature
{
    enum { RECT_NUM = 3 };

    HaarFeature()
    {
        tilted = false;
        for( int i = 0; i < RECT_NUM; i++ )
        {
            rect[i].r = Rect(0,0,0,0);
            rect[i].weight = 0.f;
        }
    }
    bool tilted;
    struct
    {
        Rect r;
        float weight;
    } rect[RECT_NUM];
};

struct HaarClassifierNode
{
    HaarClassifierNode()
    {
        f = left = right = 0;
        threshold = 0.f;
    }
    int f, left, right;
    float threshold;
};

struct HaarClassifier
{
    std::vector<HaarClassifierNode> nodes;
    std::vector<float> leaves;
};

struct HaarStageClassifier
{
    double threshold;
    std::vector<HaarClassifier> weaks;
};

static bool convert(const String& oldcascade, const String& newcascade)
{
    FileStorage oldfs(oldcascade, FileStorage::READ);
    if( !oldfs.isOpened() )
        return false;
    FileNode oldroot = oldfs.getFirstTopLevelNode();

    FileNode sznode = oldroot[ICV_HAAR_SIZE_NAME];
    if( sznode.empty() )
        return false;
    int maxdepth = 0;
    Size cascadesize;
    cascadesize.width = (int)sznode[0];
    cascadesize.height = (int)sznode[1];
    std::vector<HaarFeature> features;

    size_t i, j, k, n;

    FileNode stages_seq = oldroot[ICV_HAAR_STAGES_NAME];
    size_t nstages = stages_seq.size();
    std::vector<HaarStageClassifier> stages(nstages);

    for( i = 0; i < nstages; i++ )
    {
        FileNode stagenode = stages_seq[i];
        HaarStageClassifier& stage = stages[i];
        stage.threshold = (double)stagenode[ICV_HAAR_STAGE_THRESHOLD_NAME];
        FileNode weaks_seq = stagenode[ICV_HAAR_TREES_NAME];
        size_t nweaks = weaks_seq.size();
        stage.weaks.resize(nweaks);

        for( j = 0; j < nweaks; j++ )
        {
            HaarClassifier& weak = stage.weaks[j];
            FileNode weaknode = weaks_seq[j];
            size_t nnodes = weaknode.size();

            for( n = 0; n < nnodes; n++ )
            {
                FileNode nnode = weaknode[n];
                FileNode fnode = nnode[ICV_HAAR_FEATURE_NAME];
                HaarFeature f;
                HaarClassifierNode node;
                node.f = (int)features.size();
                f.tilted = (int)fnode[ICV_HAAR_TILTED_NAME] != 0;
                FileNode rects_seq = fnode[ICV_HAAR_RECTS_NAME];
                size_t nrects = rects_seq.size();

                for( k = 0; k < nrects; k++ )
                {
                    FileNode rnode = rects_seq[k];
                    f.rect[k].r.x = (int)rnode[0];
                    f.rect[k].r.y = (int)rnode[1];
                    f.rect[k].r.width = (int)rnode[2];
                    f.rect[k].r.height = (int)rnode[3];
                    f.rect[k].weight = (float)rnode[4];
                }
                features.push_back(f);
                node.threshold = nnode[ICV_HAAR_THRESHOLD_NAME];
                FileNode leftValNode = nnode[ICV_HAAR_LEFT_VAL_NAME];
                if( !leftValNode.empty() )
                {
                    node.left = -(int)weak.leaves.size();
                    weak.leaves.push_back((float)leftValNode);
                }
                else
                {
                    node.left = (int)nnode[ICV_HAAR_LEFT_NODE_NAME];
                }
                FileNode rightValNode = nnode[ICV_HAAR_RIGHT_VAL_NAME];
                if( !rightValNode.empty() )
                {
                    node.right = -(int)weak.leaves.size();
                    weak.leaves.push_back((float)rightValNode);
                }
                else
                {
                    node.right = (int)nnode[ICV_HAAR_RIGHT_NODE_NAME];
                }
                weak.nodes.push_back(node);
            }
        }
    }

    FileStorage newfs(newcascade, FileStorage::WRITE);
    if( !newfs.isOpened() )
        return false;

    size_t maxWeakCount = 0, nfeatures = features.size();
    for( i = 0; i < nstages; i++ )
        maxWeakCount = std::max(maxWeakCount, stages[i].weaks.size());

    newfs << "cascade" << "{:opencv-cascade-classifier"
          << "stageType" << "BOOST"
          << "featureType" << "HAAR"
<< "height" << cascadesize.width
|
||||
<< "width" << cascadesize.height
|
||||
<< "stageParams" << "{"
|
||||
<< "maxWeakCount" << (int)maxWeakCount
|
||||
<< "}"
|
||||
<< "featureParams" << "{"
|
||||
<< "maxCatCount" << 0
|
||||
<< "}"
|
||||
<< "stageNum" << (int)nstages
|
||||
<< "stages" << "[";
|
||||
|
||||
for( i = 0; i < nstages; i++ )
|
||||
{
|
||||
size_t nweaks = stages[i].weaks.size();
|
||||
newfs << "{" << "maxWeakCount" << (int)nweaks
|
||||
<< "stageThreshold" << stages[i].threshold
|
||||
<< "weakClassifiers" << "[";
|
||||
for( j = 0; j < nweaks; j++ )
|
||||
{
|
||||
const HaarClassifier& c = stages[i].weaks[j];
|
||||
newfs << "{" << "internalNodes" << "[";
|
||||
size_t nnodes = c.nodes.size(), nleaves = c.leaves.size();
|
||||
for( k = 0; k < nnodes; k++ )
|
||||
newfs << c.nodes[k].left << c.nodes[k].right
|
||||
<< c.nodes[k].f << c.nodes[k].threshold;
|
||||
newfs << "]" << "leafValues" << "[";
|
||||
for( k = 0; k < nleaves; k++ )
|
||||
newfs << c.leaves[k];
|
||||
newfs << "]" << "}";
|
||||
}
|
||||
newfs << "]" << "}";
|
||||
}
|
||||
|
||||
newfs << "]"
|
||||
<< "features" << "[";
|
||||
|
||||
for( i = 0; i < nfeatures; i++ )
|
||||
{
|
||||
const HaarFeature& f = features[i];
|
||||
newfs << "{" << "rects" << "[";
|
||||
for( j = 0; j < (size_t)HaarFeature::RECT_NUM; j++ )
|
||||
{
|
||||
if( j >= 2 && fabs(f.rect[j].weight) < FLT_EPSILON )
|
||||
break;
|
||||
newfs << "[" << f.rect[j].r.x << f.rect[j].r.y <<
|
||||
f.rect[j].r.width << f.rect[j].r.height << f.rect[j].weight << "]";
|
||||
}
|
||||
newfs << "]";
|
||||
if( f.tilted )
|
||||
newfs << "tilted" << 1;
|
||||
newfs << "}";
|
||||
}
|
||||
|
||||
newfs << "]" << "}";
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
bool CascadeClassifier::convert(const String& oldcascade, const String& newcascade)
|
||||
{
|
||||
bool ok = haar_cvt::convert(oldcascade, newcascade);
|
||||
if( !ok && newcascade.size() > 0 )
|
||||
remove(newcascade.c_str());
|
||||
return ok;
|
||||
}
|
||||
|
||||
}
|
modules/objdetect/src/opencl/cascadedetect.cl (new file, 186 lines)
@@ -0,0 +1,186 @@
///////////////////////////// OpenCL kernels for face detection //////////////////////////////
////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////

typedef struct __attribute__((aligned(4))) OptFeature
{
    int4 ofs[3] __attribute__((aligned (4)));
    float4 weight __attribute__((aligned (4)));
}
OptFeature;

typedef struct __attribute__((aligned(4))) Stump
{
    int featureIdx __attribute__((aligned (4)));
    float threshold __attribute__((aligned (4))); // for ordered features only
    float left __attribute__((aligned (4)));
    float right __attribute__((aligned (4)));
}
Stump;

typedef struct __attribute__((aligned (4))) Stage
{
    int first __attribute__((aligned (4)));
    int ntrees __attribute__((aligned (4)));
    float threshold __attribute__((aligned (4)));
}
Stage;

__kernel void runHaarClassifierStump(
    __global const int* sum,
    int sumstep, int sumoffset,
    __global const int* sqsum,
    int sqsumstep, int sqsumoffset,
    __global const OptFeature* optfeatures,

    int nstages,
    __global const Stage* stages,
    __global const Stump* stumps,

    volatile __global int* facepos,
    int2 imgsize, int xyscale, float factor,
    int4 normrect, int2 windowsize, int maxFaces)
{
    int ix = get_global_id(0)*xyscale;
    int iy = get_global_id(1)*xyscale;
    sumstep /= sizeof(int);
    sqsumstep /= sizeof(int);

    if( ix < imgsize.x && iy < imgsize.y )
    {
        int ntrees;
        int stageIdx, i;
        float s = 0.f;
        __global const Stump* stump = stumps;
        __global const OptFeature* f;

        __global const int* psum = sum + mad24(iy, sumstep, ix);
        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
        int normarea = normrect.z * normrect.w;
        float invarea = 1.f/normarea;
        float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
                      pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
        float4 weight, vsval;
        int4 ofs, ofs0, ofs1, ofs2;
        nf = nf > 0 ? nf : 1.f;

        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
        {
            ntrees = stages[stageIdx].ntrees;
            s = 0.f;
            for( i = 0; i < ntrees; i++, stump++ )
            {
                f = optfeatures + stump->featureIdx;
                weight = f->weight;

                ofs = f->ofs[0];
                sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
                ofs = f->ofs[1];
                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
                if( weight.z > 0 )
                {
                    ofs = f->ofs[2];
                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
                }

                s += (sval < stump->threshold*nf) ? stump->left : stump->right;
            }

            if( s < stages[stageIdx].threshold )
                break;
        }

        if( stageIdx == nstages )
        {
            int nfaces = atomic_inc(facepos);
            if( nfaces < maxFaces )
            {
                volatile __global int* face = facepos + 1 + nfaces*4;
                face[0] = convert_int_rte(ix*factor);
                face[1] = convert_int_rte(iy*factor);
                face[2] = convert_int_rte(windowsize.x*factor);
                face[3] = convert_int_rte(windowsize.y*factor);
            }
        }
    }
}

#if 0
__kernel void runLBPClassifierStump(
    __global const int* sum,
    int sumstep, int sumoffset,
    __global const int* sqsum,
    int sqsumstep, int sqsumoffset,
    __global const OptFeature* optfeatures,

    int nstages,
    __global const Stage* stages,
    __global const Stump* stumps,
    __global const int* bitsets,
    int bitsetSize,

    volatile __global int* facepos,
    int2 imgsize, int xyscale, float factor,
    int4 normrect, int2 windowsize, int maxFaces)
{
    int ix = get_global_id(0)*xyscale*VECTOR_SIZE;
    int iy = get_global_id(1)*xyscale;
    sumstep /= sizeof(int);
    sqsumstep /= sizeof(int);

    if( ix < imgsize.x && iy < imgsize.y )
    {
        int ntrees;
        int stageIdx, i;
        float s = 0.f;
        __global const Stump* stump = stumps;
        __global const int* bitset = bitsets;
        __global const OptFeature* f;

        __global const int* psum = sum + mad24(iy, sumstep, ix);
        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
        int normarea = normrect.z * normrect.w;
        float invarea = 1.f/normarea;
        float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
                      pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
        float4 weight;
        int4 ofs;
        nf = nf > 0 ? nf : 1.f;

        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
        {
            ntrees = stages[stageIdx].ntrees;
            s = 0.f;
            for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
            {
                f = optfeatures + stump->featureIdx;

                weight = f->weight;

                // compute LBP feature to val
                s += (bitset[val >> 5] & (1 << (val & 31))) ? stump->left : stump->right;
            }

            if( s < stages[stageIdx].threshold )
                break;
        }

        if( stageIdx == nstages )
        {
            int nfaces = atomic_inc(facepos);
            if( nfaces < maxFaces )
            {
                volatile __global int* face = facepos + 1 + nfaces*4;
                face[0] = convert_int_rte(ix*factor);
                face[1] = convert_int_rte(iy*factor);
                face[2] = convert_int_rte(windowsize.x*factor);
                face[3] = convert_int_rte(windowsize.y*factor);
            }
        }
    }
}
#endif