From d8513d627d68b9fad757606923b627c56f5f2b04 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Thu, 12 Dec 2013 21:58:42 +0400
Subject: [PATCH] continue adding OpenCL optimization to cascade classifier

---
 .../objdetect/include/opencv2/objdetect.hpp   |  13 +-
 modules/objdetect/src/cascadedetect.cpp       | 288 +++++++++++++-----
 modules/objdetect/src/cascadedetect.hpp       |  88 +++---
 3 files changed, 273 insertions(+), 116 deletions(-)
diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp
index b6d0fd44b..631264c7d 100644
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@@ -111,12 +111,15 @@ public:
 };
 
 CV_EXPORTS   void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps = 0.2);
-CV_EXPORTS_W void groupRectangles(CV_IN_OUT std::vector<Rect>& rectList, CV_OUT std::vector<int>& weights, int groupThreshold, double eps = 0.2);
-CV_EXPORTS   void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps, std::vector<int>* weights, std::vector<double>* levelWeights );
+CV_EXPORTS_W void groupRectangles(CV_IN_OUT std::vector<Rect>& rectList, CV_OUT std::vector<int>& weights,
+                                  int groupThreshold, double eps = 0.2);
+CV_EXPORTS   void groupRectangles(std::vector<Rect>& rectList, int groupThreshold,
+                                  double eps, std::vector<int>* weights, std::vector<double>* levelWeights );
 CV_EXPORTS   void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& rejectLevels,
                                   std::vector<double>& levelWeights, int groupThreshold, double eps = 0.2);
-CV_EXPORTS   void groupRectangles_meanshift(std::vector<Rect>& rectList, std::vector<double>& foundWeights, std::vector<double>& foundScales,
-                                          double detectThreshold = 0.0, Size winDetSize = Size(64, 128));
+CV_EXPORTS   void groupRectangles_meanshift(std::vector<Rect>& rectList, std::vector<double>& foundWeights,
+                                            std::vector<double>& foundScales,
+                                            double detectThreshold = 0.0, Size winDetSize = Size(64, 128));
 
 class CV_EXPORTS FeatureEvaluator
 {
@@ -132,7 +135,7 @@ public:
     virtual Ptr<FeatureEvaluator> clone() const;
     virtual int getFeatureType() const;
 
-    virtual bool setImage(const Mat& img, Size origWinSize);
+    virtual bool setImage(InputArray img, Size origWinSize);
     virtual bool setWindow(Point p);
 
     virtual double calcOrd(int featureIdx) const;
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index a72f90fef..41c0e65f7 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -7,10 +7,10 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2013, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -23,13 +23,13 @@
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of Itseez Inc. may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
+// In no event shall the copyright holders or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
@@ -434,7 +434,7 @@ FeatureEvaluator::~FeatureEvaluator() {}
 bool FeatureEvaluator::read(const FileNode&) {return true;}
 Ptr<FeatureEvaluator> FeatureEvaluator::clone() const { return Ptr<FeatureEvaluator>(); }
 int FeatureEvaluator::getFeatureType() const {return -1;}
-bool FeatureEvaluator::setImage(const Mat&, Size) {return true;}
+bool FeatureEvaluator::setImage(InputArray, Size) {return true;}
 bool FeatureEvaluator::setWindow(Point) { return true; }
 double FeatureEvaluator::calcOrd(int) const { return 0.; }
 int FeatureEvaluator::calcCat(int) const { return 0; }
@@ -466,7 +466,9 @@ bool HaarEvaluator::Feature :: read( const FileNode& node )
 
 HaarEvaluator::HaarEvaluator()
 {
-    features = makePtr<std::vector<Feature> >();
+    optfeaturesPtr = 0;
+    pwin = 0;
+    pqwin = 0;
 }
 HaarEvaluator::~HaarEvaluator()
 {
@@ -476,16 +478,16 @@ bool HaarEvaluator::read(const FileNode& node)
 {
     size_t i, n = node.size();
     CV_Assert(n > 0);
-    features.resize(n);
-    featuresPtr = &features[0];
+    features->resize(n);
     FileNodeIterator it = node.begin();
     hasTiltedFeatures = false;
+    std::vector<Feature> ff = *features;
 
     for(i = 0; i < n; i++, ++it)
     {
-        if(!features[i].read(*it))
+        if(!ff[i].read(*it))
             return false;
-        if( features[i].tilted )
+        if( ff[i].tilted )
             hasTiltedFeatures = true;
     }
     return true;
@@ -496,55 +498,60 @@ Ptr<FeatureEvaluator> HaarEvaluator::clone() const
     Ptr<HaarEvaluator> ret = makePtr<HaarEvaluator>();
     ret->origWinSize = origWinSize;
     ret->features = features;
+    ret->optfeatures = optfeatures;
+    ret->optfeaturesPtr = optfeatures->empty() ? 0 : &(*(ret->optfeatures))[0];
     ret->hasTiltedFeatures = hasTiltedFeatures;
-    ret->sum0 = sum0, ret->sqsum0 = sqsum0, ret->tilted0 = tilted0;
-    ret->sum = sum, ret->sqsum = sqsum, ret->tilted = tilted;
+    ret->sum0 = sum0; ret->sqsum0 = sqsum0;
+    ret->sum = sum; ret->sqsum = sqsum; ret->tilted = tilted;
     ret->normrect = normrect;
-    memcpy( ret->p, p, 4*sizeof(p[0]) );
-    memcpy( ret->pq, pq, 4*sizeof(pq[0]) );
-    ret->offset = offset;
+    memcpy( ret->nofs, nofs, 4*sizeof(nofs[0]) );
+    memcpy( ret->nqofs, nqofs, 4*sizeof(nqofs[0]) );
+    ret->pwin = pwin; ret->pqwin = pqwin;
     ret->varianceNormFactor = varianceNormFactor;
     return ret;
 }
 
-bool HaarEvaluator::setImage( const Mat &image, Size _origWinSize )
+bool HaarEvaluator::setImage( InputArray _image, Size _origWinSize )
 {
-    int rn = image.rows+1, cn = image.cols+1;
+    Size imgsz = _image.size();
+    int rn = imgsz.height+1, cn = imgsz.width+1, rnt = rn;
     origWinSize = _origWinSize;
     normrect = Rect(1, 1, origWinSize.width-2, origWinSize.height-2);
 
-    if (image.cols < origWinSize.width || image.rows < origWinSize.height)
+    if (imgsz.width < origWinSize.width || imgsz.height < origWinSize.height)
         return false;
 
-    if( sum0.rows < rn || sum0.cols < cn )
+    if( hasTiltedFeatures )
+        rnt = rn*2;
+    
+    if( sum0.rows < rnt || sum0.cols < cn )
     {
-        sum0.create(rn, cn, CV_32S);
+        sum0.create(rnt, cn, CV_32S);
         sqsum0.create(rn, cn, CV_64F);
-        if (hasTiltedFeatures)
-            tilted0.create( rn, cn, CV_32S);
     }
     sum = Mat(rn, cn, CV_32S, sum0.data);
     sqsum = Mat(rn, cn, CV_64F, sqsum0.data);
-
     if( hasTiltedFeatures )
     {
-        tilted = Mat(rn, cn, CV_32S, tilted0.data);
-        integral(image, sum, sqsum, tilted);
+        tilted = Mat(rn, cn, CV_32S, sum0.data + rn*sum.step);
+        integral(_image, sum, sqsum, tilted);
     }
     else
-        integral(image, sum, sqsum);
-    const int* sdata = (const int*)sum.data;
-    const double* sqdata = (const double*)sqsum.data;
-    size_t sumStep = sum.step/sizeof(sdata[0]);
-    size_t sqsumStep = sqsum.step/sizeof(sqdata[0]);
+        integral(_image, sum, sqsum);
+    int sumStep = (int)(sum.step/sum.elemSize());
+    int sqsumStep = (int)(sqsum.step/sqsum.elemSize());
+    int tofs = hasTiltedFeatures ? sumStep*rn : 0;
 
-    CV_SUM_PTRS( p[0], p[1], p[2], p[3], sdata, normrect, sumStep );
-    CV_SUM_PTRS( pq[0], pq[1], pq[2], pq[3], sqdata, normrect, sqsumStep );
+    CV_SUM_OFS( nofs[0], nofs[1], nofs[2], nofs[3], 0, normrect, sumStep );
+    CV_SUM_OFS( nqofs[0], nqofs[1], nqofs[2], nqofs[3], 0, normrect, sqsumStep );
 
-    size_t fi, nfeatures = features.size();
+    size_t fi, nfeatures = features->size();
 
+    optfeatures->resize(nfeatures);
+    optfeaturesPtr = &(*optfeatures)[0];
+    const std::vector<Feature>& ff = *features;
     for( fi = 0; fi < nfeatures; fi++ )
-        optfeaturesPtr[fi].updatePtrs( !featuresPtr[fi].tilted ? sum : tilted );
+        optfeaturesPtr[fi].setOffsets( ff[fi], sumStep, tofs );
     return true;
 }
 
@@ -555,10 +562,10 @@ bool  HaarEvaluator::setWindow( Point pt )
         pt.y + origWinSize.height >= sum.rows )
         return false;
 
-    size_t pOffset = pt.y * (sum.step/sizeof(int)) + pt.x;
-    size_t pqOffset = pt.y * (sqsum.step/sizeof(double)) + pt.x;
-    int valsum = CALC_SUM(p, pOffset);
-    double valsqsum = CALC_SUM(pq, pqOffset);
+    const int* p = &sum.at<int>(pt);
+    const double* pq = &sqsum.at<double>(pt);
+    int valsum = CALC_SUM_OFS(nofs, p);
+    double valsqsum = CALC_SUM_OFS(nqofs, pq);
 
     double nf = (double)normrect.area() * valsqsum - (double)valsum * valsum;
     if( nf > 0. )
@@ -566,7 +573,7 @@ bool  HaarEvaluator::setWindow( Point pt )
     else
         nf = 1.;
     varianceNormFactor = 1./nf;
-    offset = (int)pOffset;
+    pwin = p;
 
     return true;
 }
@@ -613,8 +620,9 @@ Ptr<FeatureEvaluator> LBPEvaluator::clone() const
     return ret;
 }
 
-bool LBPEvaluator::setImage( const Mat& image, Size _origWinSize )
+bool LBPEvaluator::setImage( InputArray _image, Size _origWinSize )
 {
+    Mat image = _image.getMat();
     int rn = image.rows+1, cn = image.cols+1;
     origWinSize = _origWinSize;
 
@@ -694,8 +702,9 @@ Ptr<FeatureEvaluator> HOGEvaluator::clone() const
     return ret;
 }
 
-bool HOGEvaluator::setImage( const Mat& image, Size winSize )
+bool HOGEvaluator::setImage( InputArray _image, Size winSize )
 {
+    Mat image = _image.getMat();
     int rows = image.rows + 1;
     int cols = image.cols + 1;
     origWinSize = winSize;
@@ -1011,11 +1020,11 @@ struct getRect { Rect operator ()(const CvAvgComp& e) const { return e.rect; } }
 struct getNeighbors { int operator ()(const CvAvgComp& e) const { return e.neighbors; } };
 
 
-bool CascadeClassifierImpl::detectSingleScale( const Mat& image, int stripCount, Size processingRectSize,
-                                           int stripSize, int yStep, double factor, std::vector<Rect>& candidates,
+bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processingRectSize,
+                                           int yStep, double factor, std::vector<Rect>& candidates,
                                            std::vector<int>& levels, std::vector<double>& weights, bool outputRejectLevels )
 {
-    if( !featureEvaluator->setImage( image, data.origWinSize ) )
+    if( !featureEvaluator->setImage( _image, data.origWinSize ) )
         return false;
 
 #if defined (LOG_CASCADE_STATISTIC)
@@ -1024,13 +1033,21 @@ bool CascadeClassifierImpl::detectSingleScale( const Mat& image, int stripCount,
 
     Mat currentMask;
     if (maskGenerator) {
+        Mat image = _image.getMat();
         currentMask=maskGenerator->generateMask(image);
     }
 
     std::vector<Rect> candidatesVector;
     std::vector<int> rejectLevels;
     std::vector<double> levelWeights;
-    Mutex mtx;
+    
+    int stripCount, stripSize;
+    
+    const int PTS_PER_THREAD = 1000;
+    stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
+    stripCount = std::min(std::max(stripCount, 1), 100);
+    stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;
+    
     if( outputRejectLevels )
     {
         parallel_for_(Range(0, stripCount), CascadeClassifierInvoker( *this, processingRectSize, stripSize, yStep, factor,
@@ -1052,6 +1069,70 @@ bool CascadeClassifierImpl::detectSingleScale( const Mat& image, int stripCount,
     return true;
 }
 
+    
+bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size processingRectSize,
+                                                   int yStep, double factor, std::vector<Rect>& candidates,
+                                                   std::vector<int>&, std::vector<double>&, bool )
+{
+    Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
+    if( haar.empty() )
+        return false;
+    
+    if( cascadeKernel.empty() )
+    {
+        //cascadeKernel.create(")
+        if( cascadeKernel.empty() )
+            return false;
+    }
+    
+    if( ustages.empty() )
+    {
+    #define UPLOAD_CASCADE_PART(NAME) \
+        Mat(1, (int)(data.NAME.size()*sizeof(data.NAME[0])), CV_8U, &data.NAME[0]).copyTo(u##NAME)
+        
+        UPLOAD_CASCADE_PART(stages);
+        UPLOAD_CASCADE_PART(classifiers);
+        UPLOAD_CASCADE_PART(nodes);
+        UPLOAD_CASCADE_PART(leaves);
+        ufacepos.create();
+    }
+    
+    haar->setUMat(_image, data.origWinSize, ugrayImage.size());
+    std::vector<UMat> bufs;
+    haar->getUMats(bufs);
+    CV_Assert(bufs.size() == 5);
+                        
+    size_t globalsize[] = { processingRectSize.width, processingRectSize.height };
+    
+    if(!cascadeKernel.args(ocl::KernelArg::PtrReadOnly(bufs[0]), // sum
+                       ocl::KernelArg::PtrReadOnly(bufs[1]), // sqsum
+                       ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
+                       
+                       // cascade classifier
+                       ocl::KernelArg::PtrReadOnly(ustages),
+                       ocl::KernelArg::PtrReadOnly(uclassifiers),
+                       ocl::KernelArg::PtrReadOnly(unodes),
+                       ocl::KernelArg::PtrReadOnly(uleaves),
+                       
+                       ocl::KernelArg::WriteOnly(ufacepos), // positions
+                       ocl::KernelArg::ReadWrite(umisc),
+                       processingRectSize.width,
+                       processingRectSize.height).run(2, globalsize, 0, false))
+        return false;
+    
+    Mat facepos = ufacepos.getMat(ACCESS_READ);
+    const int* fptr = facepos.ptr<int>();
+    int nfaces = fptr[0];
+    for( i = 0; i < nfaces; i++ )
+    {
+        int pos = fptr[i+1];
+        int x =
+        candidates.push_back(Rect()
+    return false;
+}
+    
+
+
 bool CascadeClassifierImpl::isOldFormatCascade() const
 {
     return !oldCascade.empty();
@@ -1097,28 +1178,56 @@ static void detectMultiScaleOldFormat( const Mat& image, Ptr<CvHaarClassifierCas
     std::transform(vecAvgComp.begin(), vecAvgComp.end(), objects.begin(), getRect());
 }
 
-void CascadeClassifierImpl::detectMultiScaleNoGrouping( const Mat& image, std::vector<Rect>& candidates,
+    
+void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::vector<Rect>& candidates,
                                                     std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
                                                     double scaleFactor, Size minObjectSize, Size maxObjectSize,
                                                     bool outputRejectLevels )
 {
+    Size imgsz = _image.size();
+    int imgtype = _image.type();
+    
+    Mat grayImage, imageBuffer;
+    
     candidates.clear();
-
-    if (maskGenerator)
-        maskGenerator->initializeMask(image);
-
+    rejectLevels.clear();
+    levelWeights.clear();
+    
     if( maxObjectSize.height == 0 || maxObjectSize.width == 0 )
-        maxObjectSize = image.size();
-
-    Mat grayImage = image;
-    if( grayImage.channels() > 1 )
+        maxObjectSize = imgsz;
+    
+    bool use_ocl = ocl::useOpenCL() &&
+        getFeatureType() == FeatureEvaluator::HAAR &&
+        !isOldFormatCascade() &&
+        maskGenerator.empty() &&
+        !outputRejectLevels &&
+        tryOpenCL;
+    
+    if( !use_ocl )
     {
-        Mat temp;
-        cvtColor(grayImage, temp, COLOR_BGR2GRAY);
-        grayImage = temp;
-    }
+        Mat image = _image.getMat();
+        if (maskGenerator)
+            maskGenerator->initializeMask(image);
 
-    Mat imageBuffer(image.rows + 1, image.cols + 1, CV_8U);
+        grayImage = image;
+        if( CV_MAT_CN(imgtype) > 1 )
+        {
+            Mat temp;
+            cvtColor(grayImage, temp, COLOR_BGR2GRAY);
+            grayImage = temp;
+        }
+
+        imageBuffer.create(imgsz.height + 1, imgsz.width + 1, CV_8U);
+    }
+    else
+    {
+        UMat uimage = _image.getUMat();
+        if( CV_MAT_CN(imgtype) > 1 )
+            cvtColor(uimage, ugrayImage, COLOR_BGR2GRAY);
+        else
+            uimage.copyTo(ugrayImage);
+        uimageBuffer.create(imgsz.height + 1, imgsz.width + 1, CV_8U);
+    }
 
     for( double factor = 1; ; factor *= scaleFactor )
     {
@@ -1126,7 +1235,8 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( const Mat& image, std::v
 
         Size windowSize( cvRound(originalWindowSize.width*factor), cvRound(originalWindowSize.height*factor) );
         Size scaledImageSize( cvRound( grayImage.cols/factor ), cvRound( grayImage.rows/factor ) );
-        Size processingRectSize( scaledImageSize.width - originalWindowSize.width, scaledImageSize.height - originalWindowSize.height );
+        Size processingRectSize( scaledImageSize.width - originalWindowSize.width,
+                                 scaledImageSize.height - originalWindowSize.height );
 
         if( processingRectSize.width <= 0 || processingRectSize.height <= 0 )
             break;
@@ -1134,10 +1244,7 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( const Mat& image, std::v
             break;
         if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
             continue;
-
-        Mat scaledImage( scaledImageSize, CV_8U, imageBuffer.data );
-        resize( grayImage, scaledImage, scaledImageSize, 0, 0, INTER_LINEAR );
-
+        
         int yStep;
         if( getFeatureType() == cv::FeatureEvaluator::HOG )
         {
@@ -1148,16 +1255,36 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( const Mat& image, std::v
             yStep = factor > 2. ? 1 : 2;
         }
 
-        int stripCount, stripSize;
-
-        const int PTS_PER_THREAD = 1000;
-        stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
-        stripCount = std::min(std::max(stripCount, 1), 100);
-        stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;
-
-        if( !detectSingleScale( scaledImage, stripCount, processingRectSize, stripSize, yStep, factor, candidates,
-            rejectLevels, levelWeights, outputRejectLevels ) )
-            break;
+        if( use_ocl )
+        {
+            UMat uscaledImage(uimageBuffer, Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
+            resize( ugrayImage, uscaledImage, scaledImageSize, 0, 0, INTER_LINEAR );
+            
+            if( ocl_detectSingleScale( uscaledImage, processingRectSize, yStep, factor, candidates,
+                                       rejectLevels, levelWeights, outputRejectLevels ) )
+                continue;
+            
+            /////// if the OpenCL branch has been executed but failed, fall back to CPU: /////
+            
+            tryOpenCL = false; // for this cascade do not try OpenCL anymore
+            
+            // since we may already have some partial results from OpenCL code (unlikely, but still),
+            // we just recursively call the function again, but with tryOpenCL==false it will
+            // go with CPU route, so there is no infinite recursion
+            detectMultiScaleNoGrouping( _image, candidates, rejectLevels, levelWeights,
+                                       scaleFactor, minObjectSize, maxObjectSize,
+                                       outputRejectLevels);
+            return;
+        }
+        else
+        {
+            Mat scaledImage( scaledImageSize, CV_8U, imageBuffer.data );
+            resize( grayImage, scaledImage, scaledImageSize, 0, 0, INTER_LINEAR );
+            
+            if( !detectSingleScale( scaledImage, processingRectSize, yStep, factor, candidates,
+                rejectLevels, levelWeights, outputRejectLevels ) )
+                break;
+        }
     }
 }
 
@@ -1168,21 +1295,21 @@ void CascadeClassifierImpl::detectMultiScale( InputArray _image, std::vector<Rec
                                           int flags, Size minObjectSize, Size maxObjectSize,
                                           bool outputRejectLevels )
 {
-    Mat image = _image.getMat();
-    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
+    CV_Assert( scaleFactor > 1 && _image.depth() == CV_8U );
 
     if( empty() )
         return;
 
     if( isOldFormatCascade() )
     {
+        Mat image = _image.getMat();
         std::vector<CvAvgComp> fakeVecAvgComp;
         detectMultiScaleOldFormat( image, oldCascade, objects, rejectLevels, levelWeights, fakeVecAvgComp, scaleFactor,
                                    minNeighbors, flags, minObjectSize, maxObjectSize, outputRejectLevels );
     }
     else
     {
-        detectMultiScaleNoGrouping( image, objects, rejectLevels, levelWeights, scaleFactor, minObjectSize, maxObjectSize,
+        detectMultiScaleNoGrouping( _image, objects, rejectLevels, levelWeights, scaleFactor, minObjectSize, maxObjectSize,
                                     outputRejectLevels );
         const double GROUP_EPS = 0.2;
         if( outputRejectLevels )
@@ -1346,8 +1473,15 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
     return true;
 }
 
+                       
 bool CascadeClassifierImpl::read_(const FileNode& root)
 {
+    tryOpenCL = true;
+    cascadeKernel = ocl::Kernel();
+    ustages.release();
+    uclassifiers.release();
+    unodes.release();
+    uleaves.release();
     if( !data.read(root) )
         return false;
 
diff --git a/modules/objdetect/src/cascadedetect.hpp b/modules/objdetect/src/cascadedetect.hpp
index 5ab17eb13..e2a9ffaf1 100644
--- a/modules/objdetect/src/cascadedetect.hpp
+++ b/modules/objdetect/src/cascadedetect.hpp
@@ -49,11 +49,17 @@ public:
     Ptr<MaskGenerator> getMaskGenerator();
 
 protected:
-    bool detectSingleScale( const Mat& image, int stripCount, Size processingRectSize,
-                           int stripSize, int yStep, double factor, std::vector<Rect>& candidates,
-                           std::vector<int>& rejectLevels, std::vector<double>& levelWeights, bool outputRejectLevels = false );
+    bool detectSingleScale( InputArray image, Size processingRectSize,
+                            int yStep, double factor, std::vector<Rect>& candidates,
+                            std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
+                            bool outputRejectLevels = false );
+    bool ocl_detectSingleScale( InputArray image, Size processingRectSize,
+                           int yStep, double factor, std::vector<Rect>& candidates,
+                           std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
+                           bool outputRejectLevels = false );
+    
 
-    void detectMultiScaleNoGrouping( const Mat& image, std::vector<Rect>& candidates,
+    void detectMultiScaleNoGrouping( InputArray image, std::vector<Rect>& candidates,
                                     std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
                                     double scaleFactor, Size minObjectSize, Size maxObjectSize,
                                     bool outputRejectLevels = false );
@@ -127,6 +133,12 @@ protected:
     Ptr<CvHaarClassifierCascade> oldCascade;
 
     Ptr<MaskGenerator> maskGenerator;
+    UMat ugrayImage, uimageBuffer;
+    UMat ufacepos, ustages, uclassifiers, unodes, uleaves, usubsets;
+    ocl::Kernel cascadeKernel;
+    bool tryOpenCL;
+    
+    Mutex mtx;
 };
 
 #define CC_CASCADE_PARAMS "cascadeParams"
@@ -212,6 +224,10 @@ protected:
 
 #define CALC_SUM(rect,offset) CALC_SUM_((rect)[0], (rect)[1], (rect)[2], (rect)[3], offset)
 
+#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
+((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
+    
+#define CALC_SUM_OFS(rect, ptr) CALC_SUM_OFS_((rect)[0], (rect)[1], (rect)[2], (rect)[3], ptr)
 
 //----------------------------------------------  HaarEvaluator ---------------------------------------
 class HaarEvaluator : public FeatureEvaluator
@@ -241,10 +257,10 @@ public:
         enum { RECT_NUM = Feature::RECT_NUM };
         float calc( const int* pwin ) const;
 
-        void setPtrs( const Mat& sum, const Feature& f );
+        void setOffsets( const Feature& _f, int step, int tofs );
 
         int ofs[RECT_NUM][4];
-        float weight[RECT_NUM];
+        float weight[4];
     };
 
     HaarEvaluator();
@@ -254,8 +270,11 @@ public:
     virtual Ptr<FeatureEvaluator> clone() const;
     virtual int getFeatureType() const { return FeatureEvaluator::HAAR; }
 
-    virtual bool setImage(const Mat&, Size origWinSize);
+    virtual bool setImage(InputArray, Size origWinSize);
     virtual bool setWindow(Point pt);
+    
+    virtual bool setUMat(InputArray, Size origWinSize, Size origImgSize);
+    virtual void getUMats(std::vector<UMat>& bufs);
 
     double operator()(int featureIdx) const
     { return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; }
@@ -263,22 +282,22 @@ public:
     { return (*this)(featureIdx); }
 
 protected:
-    Size origWinSize;
-    std::vector<Feature> features;
-    std::vector<OptFeature> optfeatures;
+    Size origWinSize, origImgSize;
+    Ptr<std::vector<Feature> > features;
+    Ptr<std::vector<OptFeature> > optfeatures;
     OptFeature* optfeaturesPtr; // optimization
     bool hasTiltedFeatures;
 
-    Mat sum0, sqsum0, tilted0;
+    Mat sum0, sqsum0;
     Mat sum, sqsum, tilted;
+    UMat usum, usqsum, fbuf;
 
     Rect normrect;
-    int p[4];
-    int pq[4];
+    int nofs[4];
+    int nqofs[4];
 
     const int* pwin;
     const double* pqwin;
-    int offset;
     double varianceNormFactor;
 };
 
@@ -298,34 +317,35 @@ inline HaarEvaluator::OptFeature :: OptFeature()
     ofs[2][0] = ofs[2][1] = ofs[2][2] = ofs[2][3] = 0;
 }
 
-/*inline float HaarEvaluator::Feature :: calc( int _offset ) const
+inline float HaarEvaluator::OptFeature :: calc( const int* ptr ) const
 {
-    float ret = rect[0].weight * CALC_SUM(p[0], _offset) + rect[1].weight * CALC_SUM(p[1], _offset);
+    float ret = weight[0] * CALC_SUM_OFS(ofs[0], ptr) +
+                weight[1] * CALC_SUM_OFS(ofs[1], ptr);
 
-    if( rect[2].weight != 0.0f )
-        ret += rect[2].weight * CALC_SUM(p[2], _offset);
+    if( weight[2] != 0.0f )
+        ret += weight[2] * CALC_SUM_OFS(ofs[2], ptr);
 
     return ret;
-}*/
+}
 
-inline void HaarEvaluator::OptFeature :: setPtrs( const Mat& _sum, const Feature& _f )
+inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step, int tofs )
 {
-    const int* ptr = (const int*)_sum.data;
-    size_t step = _sum.step/sizeof(ptr[0]);
-    size_t tiltedofs =
-    if (tilted)
+    weight[0] = _f.rect[0].weight;
+    weight[1] = _f.rect[1].weight;
+    weight[2] = _f.rect[2].weight;
+    if (_f.tilted)
     {
-        CV_TILTED_PTRS( p[0][0], p[0][1], p[0][2], p[0][3], ptr, rect[0].r, step );
-        CV_TILTED_PTRS( p[1][0], p[1][1], p[1][2], p[1][3], ptr, rect[1].r, step );
-        if (rect[2].weight)
-            CV_TILTED_PTRS( p[2][0], p[2][1], p[2][2], p[2][3], ptr, rect[2].r, step );
+        CV_TILTED_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], tofs, _f.rect[0].r, step );
+        CV_TILTED_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], tofs, _f.rect[1].r, step );
+        if (weight[2])
+            CV_TILTED_PTRS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], tofs, _f.rect[2].r, step );
     }
     else
     {
-        CV_SUM_PTRS( p[0][0], p[0][1], p[0][2], p[0][3], ptr, rect[0].r, step );
-        CV_SUM_PTRS( p[1][0], p[1][1], p[1][2], p[1][3], ptr, rect[1].r, step );
-        if (rect[2].weight)
-            CV_SUM_PTRS( p[2][0], p[2][1], p[2][2], p[2][3], ptr, rect[2].r, step );
+        CV_SUM_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], 0, _f.rect[0].r, step );
+        CV_SUM_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], 0, _f.rect[1].r, step );
+        if (weight[2])
+            CV_SUM_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], 0, _f.rect[2].r, step );
     }
 }
 
@@ -356,7 +376,7 @@ public:
     virtual Ptr<FeatureEvaluator> clone() const;
     virtual int getFeatureType() const { return FeatureEvaluator::LBP; }
 
-    virtual bool setImage(const Mat& image, Size _origWinSize);
+    virtual bool setImage(InputArray image, Size _origWinSize);
     virtual bool setWindow(Point pt);
 
     int operator()(int featureIdx) const
@@ -433,7 +453,7 @@ public:
     virtual bool read( const FileNode& node );
     virtual Ptr<FeatureEvaluator> clone() const;
     virtual int getFeatureType() const { return FeatureEvaluator::HOG; }
-    virtual bool setImage( const Mat& image, Size winSize );
+    virtual bool setImage( InputArray image, Size winSize );
     virtual bool setWindow( Point pt );
     double operator()(int featureIdx) const
     {