lot's of changes; nonfree & photo modules added; SIFT & SURF -> nonfree module; Inpainting -> photo; refactored features2d (ORB is still failing tests), optimized brute-force matcher and made it non-template.

2012-03-15 14:36:01 +00:00
parent 6300215b94
commit 957e80abbd
99 changed files with 6719 additions and 7240 deletions
--- a/modules/nonfree/src/precomp.cpp
+++ b/modules/nonfree/src/precomp.cpp
@@ -0,0 +1,44 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+/* End of file. */
--- a/modules/nonfree/src/precomp.hpp
+++ b/modules/nonfree/src/precomp.hpp
@@ -0,0 +1,58 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_PRECOMP_H__
+#define __OPENCV_PRECOMP_H__
+
+#if _MSC_VER >= 1200
+#pragma warning( disable: 4251 4512 4710 4711 4514 4996 )
+#endif
+
+#ifdef HAVE_CVCONFIG_H 
+#include "cvconfig.h"
+#endif
+
+#include "opencv2/nonfree/nonfree.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/core/internal.hpp"
+
+#endif
--- a/modules/nonfree/src/sift.cpp
+++ b/modules/nonfree/src/sift.cpp
@@ -0,0 +1,772 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/**********************************************************************************************\
+ Implementation of SIFT is based on the code from http://blogs.oregonstate.edu/hess/code/sift/
+ Below is the original copyright.
+ 
+//    Copyright (c) 2006-2010, Rob Hess <hess@eecs.oregonstate.edu>
+//    All rights reserved.
+
+//    The following patent has been issued for methods embodied in this
+//    software: "Method and apparatus for identifying scale invariant features
+//    in an image and use of same for locating an object in an image," David
+//    G. Lowe, US Patent 6,711,293 (March 23, 2004). Provisional application
+//    filed March 8, 1999. Asignee: The University of British Columbia. For
+//    further details, contact David Lowe (lowe@cs.ubc.ca) or the
+//    University-Industry Liaison Office of the University of British
+//    Columbia.
+
+//    Note that restrictions imposed by this patent (and possibly others)
+//    exist independently of and may be in conflict with the freedoms granted
+//    in this license, which refers to copyright of the program, not patents
+//    for any methods that it implements.  Both copyright and patent law must
+//    be obeyed to legally use and redistribute this program and it is not the
+//    purpose of this license to induce you to infringe any patents or other
+//    property right claims or to contest validity of any such claims.  If you
+//    redistribute or use the program, then this license merely protects you
+//    from committing copyright infringement.  It does not protect you from
+//    committing patent infringement.  So, before you do anything with this
+//    program, make sure that you have permission to do so not merely in terms
+//    of copyright, but also in terms of patent law.
+
+//    Please note that this license is not to be understood as a guarantee
+//    either.  If you use the program according to this license, but in
+//    conflict with patent law, it does not mean that the licensor will refund
+//    you for any losses that you incur if you are sued for your patent
+//    infringement.
+
+//    Redistribution and use in source and binary forms, with or without
+//    modification, are permitted provided that the following conditions are
+//    met:
+//        * Redistributions of source code must retain the above copyright and
+//          patent notices, this list of conditions and the following
+//          disclaimer.
+//        * Redistributions in binary form must reproduce the above copyright
+//          notice, this list of conditions and the following disclaimer in
+//          the documentation and/or other materials provided with the
+//          distribution.
+//        * Neither the name of Oregon State University nor the names of its
+//          contributors may be used to endorse or promote products derived
+//          from this software without specific prior written permission.
+
+//    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+//    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+//    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+//    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+//    HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+//    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+//    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+//    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+//    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+//    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+//    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+\**********************************************************************************************/
+ 
+#include "precomp.hpp"
+#include <iostream>
+#include <stdarg.h>
+
+namespace cv
+{
+
+/******************************* Defs and macros *****************************/
+
+// default number of sampled intervals per octave
+static const int SIFT_INTVLS = 3;
+
+// default sigma for initial gaussian smoothing
+static const float SIFT_SIGMA = 1.6f;
+
+// default threshold on keypoint contrast |D(x)|
+static const float SIFT_CONTR_THR = 0.04f;
+
+// default threshold on keypoint ratio of principle curvatures
+static const float SIFT_CURV_THR = 10.f;
+
+// double image size before pyramid construction?
+static const bool SIFT_IMG_DBL = true;
+
+// default width of descriptor histogram array
+static const int SIFT_DESCR_WIDTH = 4;
+
+// default number of bins per histogram in descriptor array
+static const int SIFT_DESCR_HIST_BINS = 8;
+
+// assumed gaussian blur for input image
+static const float SIFT_INIT_SIGMA = 0.5f;
+
+// width of border in which to ignore keypoints
+static const int SIFT_IMG_BORDER = 5;
+
+// maximum steps of keypoint interpolation before failure
+static const int SIFT_MAX_INTERP_STEPS = 5;
+
+// default number of bins in histogram for orientation assignment
+static const int SIFT_ORI_HIST_BINS = 36;
+
+// determines gaussian sigma for orientation assignment
+static const float SIFT_ORI_SIG_FCTR = 1.5f;
+
+// determines the radius of the region used in orientation assignment
+static const float SIFT_ORI_RADIUS = 3 * SIFT_ORI_SIG_FCTR;
+
+// orientation magnitude relative to max that results in new feature
+static const float SIFT_ORI_PEAK_RATIO = 0.8f;
+
+// determines the size of a single descriptor orientation histogram
+static const float SIFT_DESCR_SCL_FCTR = 3.f;
+
+// threshold on magnitude of elements of descriptor vector
+static const float SIFT_DESCR_MAG_THR = 0.2f;
+
+// factor used to convert floating-point descriptor to unsigned char
+static const float SIFT_INT_DESCR_FCTR = 512.f;
+    
+static const int SIFT_FIXPT_SCALE = 48;
+    
+    
+static Mat createInitialImage( const Mat& img, bool doubleImageSize, float sigma )
+{
+    Mat gray, gray_fpt;
+    if( img.channels() == 3 || img.channels() == 4 )
+        cvtColor(img, gray, COLOR_BGR2GRAY);
+    else
+        img.copyTo(gray);
+    gray.convertTo(gray_fpt, CV_16S, SIFT_FIXPT_SCALE, 0);
+    
+    float sig_diff;
+    
+    if( doubleImageSize )
+    {
+        sig_diff = sqrtf( std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA * 4, 0.01f) );
+        Mat dbl;
+        resize(gray_fpt, dbl, Size(gray.cols*2, gray.rows*2), 0, 0, INTER_LINEAR);
+        GaussianBlur(dbl, dbl, Size(), sig_diff, sig_diff);
+        return dbl;
+    }
+    else
+    {
+        sig_diff = sqrtf( std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA, 0.01f) );
+        GaussianBlur(gray_fpt, gray_fpt, Size(), sig_diff, sig_diff);
+        return gray_fpt;
+    }
+}
+    
+    
+void SIFT::buildGaussianPyramid( const Mat& base, vector<Mat>& pyr, int nOctaves ) const
+{
+    vector<double> sig(nOctaveLayers + 3);
+    pyr.resize(nOctaves*(nOctaveLayers + 3));
+    
+    // precompute Gaussian sigmas using the following formula:
+    //  \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
+    sig[0] = sigma;
+    double k = pow( 2., 1. / nOctaveLayers );
+    for( int i = 1; i < nOctaveLayers + 3; i++ )
+    {
+        double sig_prev = pow(k, (double)(i-1))*sigma;
+        double sig_total = sig_prev*k;
+        sig[i] = std::sqrt(sig_total*sig_total - sig_prev*sig_prev);
+    }
+    
+    for( int o = 0; o < nOctaves; o++ )
+    {
+        for( int i = 0; i < nOctaveLayers + 3; i++ )
+        {
+            Mat& dst = pyr[o*(nOctaveLayers + 3) + i];
+            if( o == 0  &&  i == 0 )
+                dst = base;
+            // base of new octave is halved image from end of previous octave
+            else if( i == 0 )
+            {
+                const Mat& src = pyr[(o-1)*(nOctaveLayers + 3) + nOctaveLayers];
+                resize(src, dst, Size(src.cols/2, src.rows/2),
+                       0, 0, INTER_NEAREST);
+            }
+            else
+            {
+                const Mat& src = pyr[o*(nOctaveLayers + 3) + i-1];
+                GaussianBlur(src, dst, Size(), sig[i], sig[i]);
+            }
+        }
+    }
+}
+
+
+void SIFT::buildDoGPyramid( const vector<Mat>& gpyr, vector<Mat>& dogpyr ) const
+{
+    int nOctaves = (int)gpyr.size()/(nOctaveLayers + 3);
+    dogpyr.resize( nOctaves*(nOctaveLayers + 2) );
+    
+    for( int o = 0; o < nOctaves; o++ )
+    {
+        for( int i = 0; i < nOctaveLayers + 2; i++ )
+        {
+            const Mat& src1 = gpyr[o*(nOctaveLayers + 3) + i];
+            const Mat& src2 = gpyr[o*(nOctaveLayers + 3) + i + 1];
+            Mat& dst = dogpyr[o*(nOctaveLayers + 2) + i];
+            subtract(src2, src1, dst, noArray(), CV_16S);
+        }
+    }
+}
+
+    
+// Computes a gradient orientation histogram at a specified pixel
+static float calcOrientationHist( const Mat& img, Point pt, int radius,
+                                  float sigma, float* hist, int n )
+{
+    int i, j, k, len = (radius*2+1)*(radius*2+1);
+    
+    float expf_scale = -1.f/(2.f * sigma * sigma);
+    AutoBuffer<float> buf(len*4 + n+4);
+    float *X = buf, *Y = X + len, *Mag = X, *Ori = Y + len, *W = Ori + len;
+    float* temphist = W + len + 2;
+    
+    for( i = 0; i < n; i++ )
+        temphist[i] = 0.f;
+    
+    for( i = -radius, k = 0; i <= radius; i++ )
+    {
+        int y = pt.y + i;
+        if( y <= 0 || y >= img.rows - 1 )
+            continue;
+        for( j = -radius; j <= radius; j++ )
+        {
+            int x = pt.x + j;
+            if( x <= 0 || x >= img.cols - 1 )
+                continue;
+            
+            float dx = img.at<short>(y, x+1) - img.at<short>(y, x-1);
+            float dy = img.at<short>(y-1, x) - img.at<short>(y+1, x);
+            
+            X[k] = dx; Y[k] = dy; W[k] = (i*i + j*j)*expf_scale;
+            k++;
+        }
+    }
+    
+    len = k;
+    
+    // compute gradient values, orientations and the weights over the pixel neighborhood
+    exp(W, W, len);
+    fastAtan2(Y, X, Ori, len, true);
+    magnitude(X, Y, Mag, len);
+    
+    for( k = 0; k < len; k++ )
+    {
+        int bin = cvRound((n/360.f)*Ori[k]);
+        if( bin >= n )
+            bin -= n;
+        if( bin < 0 )
+            bin += n;
+        temphist[bin] += W[k]*Mag[k];
+    }
+    
+    // smooth the histogram
+    temphist[-1] = temphist[n-1];
+    temphist[-2] = temphist[n-2];
+    temphist[n] = temphist[0];
+    temphist[n+1] = temphist[1];
+    for( i = 0; i < n; i++ )
+    {
+        hist[i] = (temphist[i-2] + temphist[i+2])*(1.f/16.f) +
+            (temphist[i-1] + temphist[i+1])*(4.f/16.f) +
+            temphist[i]*(6.f/16.f);
+    }
+    
+    float maxval = hist[0];
+    for( i = 1; i < n; i++ )
+        maxval = std::max(maxval, hist[i]);
+    
+    return maxval;
+}
+    
+
+//
+// Interpolates a scale-space extremum's location and scale to subpixel
+// accuracy to form an image feature.  Rejects features with low contrast.
+// Based on Section 4 of Lowe's paper.  
+static bool adjustLocalExtrema( const vector<Mat>& dog_pyr, KeyPoint& kpt, int octv,
+                                int& layer, int& r, int& c, int nOctaveLayers,
+                                float contrastThreshold, float edgeThreshold, float sigma )
+{
+    const float img_scale = 1.f/(255*SIFT_FIXPT_SCALE);
+    const float deriv_scale = img_scale*0.5f;
+    const float second_deriv_scale = img_scale;
+    const float cross_deriv_scale = img_scale*0.25f;
+    
+    float xi=0, xr=0, xc=0, contr;
+    int i = 0;
+    
+    for( ; i < SIFT_MAX_INTERP_STEPS; i++ )
+    {
+        int idx = octv*(nOctaveLayers+2) + layer;
+        const Mat& img = dog_pyr[idx];
+        const Mat& prev = dog_pyr[idx-1];
+        const Mat& next = dog_pyr[idx+1];
+        
+        Matx31f dD((img.at<short>(r, c+1) - img.at<short>(r, c-1))*deriv_scale,
+                   (img.at<short>(r+1, c) - img.at<short>(r-1, c))*deriv_scale,
+                   (next.at<short>(r, c) - prev.at<short>(r, c))*deriv_scale);
+        
+        float v2 = img.at<short>(r, c)*2;
+        float dxx = (img.at<short>(r, c+1) + img.at<short>(r, c-1) - v2)*second_deriv_scale;
+        float dyy = (img.at<short>(r+1, c) + img.at<short>(r-1, c) - v2)*second_deriv_scale;
+        float dss = (next.at<short>(r, c) + prev.at<short>(r, c) - v2)*second_deriv_scale;
+        float dxy = (img.at<short>(r+1, c+1) - img.at<short>(r+1, c-1) -
+                     img.at<short>(r-1, c+1) + img.at<short>(r-1, c-1))*cross_deriv_scale;
+        float dxs = (next.at<short>(r, c+1) - next.at<short>(r, c-1) -
+                     prev.at<short>(r, c+1) + prev.at<short>(r, c-1))*cross_deriv_scale;
+        float dys = (next.at<short>(r+1, c) - next.at<short>(r-1, c) -
+                     prev.at<short>(r+1, c) + prev.at<short>(r-1, c))*cross_deriv_scale;
+        
+        Matx33f H(dxx, dxy, dxs,
+                  dxy, dyy, dys,
+                  dxs, dys, dss);
+        
+        Matx31f X = H.solve<1>(dD, DECOMP_LU);
+        
+        xi = -X(2, 0);
+        xr = -X(1, 0);
+        xc = -X(0, 0);
+        
+        if( std::abs( xi ) < 0.5f  &&  std::abs( xr ) < 0.5f  &&  std::abs( xc ) < 0.5f )
+            break;
+        
+        c += cvRound( xc );
+        r += cvRound( xr );
+        layer += cvRound( xi );
+        
+        if( layer < 1 || layer > nOctaveLayers ||
+           c < SIFT_IMG_BORDER || c >= img.cols - SIFT_IMG_BORDER  ||
+           r < SIFT_IMG_BORDER || r >= img.rows - SIFT_IMG_BORDER )
+            return false;
+    }
+    
+    /* ensure convergence of interpolation */
+    if( i >= SIFT_MAX_INTERP_STEPS )
+        return false;
+    
+    {
+        int idx = octv*(nOctaveLayers+2) + layer;
+        const Mat& img = dog_pyr[idx];
+        const Mat& prev = dog_pyr[idx-1];
+        const Mat& next = dog_pyr[idx+1];
+        Matx31f dD((img.at<short>(r, c+1) - img.at<short>(r, c-1))*deriv_scale,
+                   (img.at<short>(r+1, c) - img.at<short>(r-1, c))*deriv_scale,
+                   (next.at<short>(r, c) - prev.at<short>(r, c))*deriv_scale);
+        float t = dD.dot(Matx31f(xc, xr, xi));
+        
+        contr = img.at<short>(r, c)*img_scale + t * 0.5f;
+        if( std::abs( contr ) * nOctaveLayers < contrastThreshold )
+            return false;
+        
+        /* principal curvatures are computed using the trace and det of Hessian */
+        float v2 = img.at<short>(r, c)*2;
+        float dxx = (img.at<short>(r, c+1) + img.at<short>(r, c-1) - v2)*second_deriv_scale;
+        float dyy = (img.at<short>(r+1, c) + img.at<short>(r-1, c) - v2)*second_deriv_scale;
+        float dxy = (img.at<short>(r+1, c+1) - img.at<short>(r+1, c-1) -
+                     img.at<short>(r-1, c+1) + img.at<short>(r-1, c-1)) * cross_deriv_scale;
+        float tr = dxx + dyy;
+        float det = dxx * dyy - dxy * dxy;
+        
+        if( det <= 0 || tr*tr*edgeThreshold >= (edgeThreshold + 1)*(edgeThreshold + 1)*det )
+            return false;
+    }
+    
+    kpt.pt.x = (c + xc) * (1 << octv);
+    kpt.pt.y = (r + xr) * (1 << octv);
+    kpt.octave = octv + (layer << 8) + (cvRound((xi + 0.5)*255) << 16);
+    kpt.size = sigma*powf(2.f, (layer + xi) / nOctaveLayers)*(1 << octv)*2;
+    
+    return true;
+}
+    
+    
+//
+// Detects features at extrema in DoG scale space.  Bad features are discarded
+// based on contrast and ratio of principal curvatures.
+void SIFT::findScaleSpaceExtrema( const vector<Mat>& gauss_pyr, const vector<Mat>& dog_pyr,
+                                  vector<KeyPoint>& keypoints ) const
+{
+    int nOctaves = (int)gauss_pyr.size()/(nOctaveLayers + 3);
+    int threshold = cvFloor(0.5 * contrastThreshold / nOctaveLayers * 255 * SIFT_FIXPT_SCALE);
+    const int n = SIFT_ORI_HIST_BINS;
+    float hist[n];
+    KeyPoint kpt;
+    
+    keypoints.clear();
+    
+    for( int o = 0; o < nOctaves; o++ )
+        for( int i = 1; i <= nOctaveLayers; i++ )
+        {
+            int idx = o*(nOctaveLayers+2)+i;
+            const Mat& img = dog_pyr[idx];
+            const Mat& prev = dog_pyr[idx-1];
+            const Mat& next = dog_pyr[idx+1];
+            int step = (int)img.step1();
+            int rows = img.rows, cols = img.cols;
+            
+            for( int r = SIFT_IMG_BORDER; r < rows-SIFT_IMG_BORDER; r++)
+            {
+                const short* currptr = img.ptr<short>(r);
+                const short* prevptr = prev.ptr<short>(r);
+                const short* nextptr = next.ptr<short>(r);
+                
+                for( int c = SIFT_IMG_BORDER; c < cols-SIFT_IMG_BORDER; c++)
+                {
+                    int val = currptr[c];
+                    
+                    // find local extrema with pixel accuracy
+                    if( std::abs(val) > threshold &&
+                       ((val > 0 && val >= currptr[c-1] && val >= currptr[c+1] &&
+                         val >= currptr[c-step-1] && val >= currptr[c-step] && val >= currptr[c-step+1] &&
+                         val >= currptr[c+step-1] && val >= currptr[c+step] && val >= currptr[c+step+1] &&
+                         val >= nextptr[c] && val >= nextptr[c-1] && val >= nextptr[c+1] &&
+                         val >= nextptr[c-step-1] && val >= nextptr[c-step] && val >= nextptr[c-step+1] &&
+                         val >= nextptr[c+step-1] && val >= nextptr[c+step] && val >= nextptr[c+step+1] &&
+                         val >= prevptr[c] && val >= prevptr[c-1] && val >= prevptr[c+1] &&
+                         val >= prevptr[c-step-1] && val >= prevptr[c-step] && val >= prevptr[c-step+1] &&
+                         val >= prevptr[c+step-1] && val >= prevptr[c+step] && val >= prevptr[c+step+1]) ||
+                        (val < 0 && val <= currptr[c-1] && val <= currptr[c+1] &&
+                         val <= currptr[c-step-1] && val <= currptr[c-step] && val <= currptr[c-step+1] &&
+                         val <= currptr[c+step-1] && val <= currptr[c+step] && val <= currptr[c+step+1] &&
+                         val <= nextptr[c] && val <= nextptr[c-1] && val <= nextptr[c+1] &&
+                         val <= nextptr[c-step-1] && val <= nextptr[c-step] && val <= nextptr[c-step+1] &&
+                         val <= nextptr[c+step-1] && val <= nextptr[c+step] && val <= nextptr[c+step+1] &&
+                         val <= prevptr[c] && val <= prevptr[c-1] && val <= prevptr[c+1] &&
+                         val <= prevptr[c-step-1] && val <= prevptr[c-step] && val <= prevptr[c-step+1] &&
+                         val <= prevptr[c+step-1] && val <= prevptr[c+step] && val <= prevptr[c+step+1])))
+                    {
+                        int r1 = r, c1 = c, layer = i;
+                        if( !adjustLocalExtrema(dog_pyr, kpt, o, layer, r1, c1, nOctaveLayers,
+                                                contrastThreshold, edgeThreshold, sigma) )
+                            continue;
+                        float scl_octv = kpt.size*0.5f/(1 << o);
+                        float omax = calcOrientationHist(gauss_pyr[o*(nOctaveLayers+3) + layer],
+                                                         Point(c1, r1),
+                                                         cvRound(SIFT_ORI_RADIUS * scl_octv),
+                                                         SIFT_ORI_SIG_FCTR * scl_octv,
+                                                         hist, n);
+                        float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
+                        for( int j = 0; j < n; j++ )
+                        {
+                            int l = j > 0 ? j - 1 : n - 1;
+                            int r = j < n-1 ? j + 1 : 0;
+                            
+                            if( hist[j] > hist[l]  &&  hist[j] > hist[r]  &&  hist[j] >= mag_thr )
+                            {
+                                float bin = j + 0.5f * (hist[l]-hist[r]) / (hist[l] - 2*hist[j] + hist[r]);
+                                bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
+                                kpt.angle = (float)((360.f/n) * bin);
+                                keypoints.push_back(kpt);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+}    
+    
+    
+static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float scl,
+                               int d, int n, float* dst )
+{
+    Point pt(cvRound(ptf.x), cvRound(ptf.y));
+    float cos_t = cosf(ori*(float)(CV_PI/180));
+    float sin_t = sinf(ori*(float)(CV_PI/180));
+    float bins_per_rad = n / 360.f;
+    float exp_scale = -1.f/(d * d * 0.5f);
+    float hist_width = SIFT_DESCR_SCL_FCTR * scl;
+    int radius = cvRound(hist_width * 1.4142135623730951f * (d + 1) * 0.5f);
+    cos_t /= hist_width;
+    sin_t /= hist_width;
+    
+    int i, j, k, len = (radius*2+1)*(radius*2+1), histlen = (d+2)*(d+2)*(n+2);
+    int rows = img.rows, cols = img.cols;
+    
+    AutoBuffer<float> buf(len*6 + histlen);
+    float *X = buf, *Y = X + len, *Mag = Y, *Ori = Mag + len, *W = Ori + len;
+    float *RBin = W + len, *CBin = RBin + len, *hist = CBin + len;
+    
+    for( i = 0; i < d+2; i++ )
+    {
+        for( j = 0; j < d+2; j++ )
+            for( k = 0; k < n+2; k++ )
+                hist[(i*(d+2) + j)*(n+2) + k] = 0.;
+    }
+    
+    for( i = -radius, k = 0; i <= radius; i++ )
+        for( j = -radius; j <= radius; j++ )
+        {
+            /*
+             Calculate sample's histogram array coords rotated relative to ori.
+             Subtract 0.5 so samples that fall e.g. in the center of row 1 (i.e.
+             r_rot = 1.5) have full weight placed in row 1 after interpolation.
+             */
+            float c_rot = j * cos_t - i * sin_t;
+            float r_rot = j * sin_t + i * cos_t;
+            float rbin = r_rot + d/2 - 0.5f;
+            float cbin = c_rot + d/2 - 0.5f;
+            int r = pt.y + i, c = pt.x + j;
+            
+            if( rbin > -1 && rbin < d && cbin > -1 && cbin < d &&
+               r > 0 && r < rows - 1 && c > 0 && c < cols - 1 )
+            {
+                float dx = img.at<short>(r, c+1) - img.at<short>(r, c-1);
+                float dy = img.at<short>(r-1, c) - img.at<short>(r+1, c);
+                X[k] = dx; Y[k] = dy; RBin[k] = rbin; CBin[k] = cbin;
+                W[k] = (c_rot * c_rot + r_rot * r_rot)*exp_scale;
+                k++;
+            }
+        }
+    
+    len = k;
+    fastAtan2(Y, X, Ori, len, true);
+    magnitude(X, Y, Mag, len);
+    exp(W, W, len);
+    
+    for( k = 0; k < len; k++ )
+    {
+        float rbin = RBin[k], cbin = CBin[k];
+        float obin = (Ori[k] - ori)*bins_per_rad;
+        float mag = Mag[k]*W[k];
+        
+        int r0 = cvFloor( rbin );
+        int c0 = cvFloor( cbin );
+        int o0 = cvFloor( obin );
+        rbin -= r0;
+        cbin -= c0;
+        obin -= o0;
+        
+        if( o0 < 0 )
+            o0 += n;
+        if( o0 >= n )
+            o0 -= n;
+        
+        // histogram update using tri-linear interpolation
+        float v_r1 = mag*rbin, v_r0 = mag - v_r1;
+        float v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
+        float v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
+        float v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
+        float v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
+        float v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
+        float v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
+        
+        int idx = ((r0+1)*(d+2) + c0+1)*(n+2) + o0;
+        hist[idx] += v_rco000;
+        hist[idx+1] += v_rco001;
+        hist[idx+(n+2)] += v_rco010;
+        hist[idx+(n+3)] += v_rco011;
+        hist[idx+(d+2)*(n+2)] += v_rco100;
+        hist[idx+(d+2)*(n+2)+1] += v_rco101;
+        hist[idx+(d+3)*(n+2)] += v_rco110;
+        hist[idx+(d+3)*(n+2)+1] += v_rco111;
+    }
+    
+    // finalize histogram, since the orientation histograms are circular
+    for( i = 0; i < d; i++ )
+        for( j = 0; j < d; j++ )
+        {
+            int idx = ((i+1)*(d+2) + (j+1))*(n+2);
+            hist[idx] += hist[idx+n];
+            hist[idx+1] += hist[idx+n+1];
+            for( k = 0; k < n; k++ )
+                dst[(i*d + j)*n + k] = hist[idx+k];
+        }
+    // copy histogram to the descriptor,
+    // apply hysteresis thresholding
+    // and scale the result, so that it can be easily converted
+    // to byte array
+    float nrm2 = 0;
+    len = d*d*n;
+    for( k = 0; k < len; k++ )
+        nrm2 += dst[k]*dst[k];
+    float thr = std::sqrt(nrm2)*SIFT_DESCR_MAG_THR;
+    for( i = 0, nrm2 = 0; i < k; i++ )
+    {
+        float val = std::min(dst[i], thr);
+        dst[i] = val;
+        nrm2 += val*val;
+    }
+    nrm2 = SIFT_INT_DESCR_FCTR/std::max(std::sqrt(nrm2), FLT_EPSILON);
+    for( k = 0; k < len; k++ )
+    {
+        dst[k] = saturate_cast<uchar>(dst[k]*nrm2);
+    }
+}
+    
+static void calcDescriptors(const vector<Mat>& gpyr, const vector<KeyPoint>& keypoints,
+                            Mat& descriptors, int nOctaveLayers )
+{
+    int d = SIFT_DESCR_WIDTH, n = SIFT_DESCR_HIST_BINS;
+    
+    for( size_t i = 0; i < keypoints.size(); i++ )
+    {
+        KeyPoint kpt = keypoints[i];
+        int octv=kpt.octave & 255, layer=(kpt.octave >> 8) & 255;
+        float scale = 1.f/(1 << octv);
+        float size=kpt.size*scale;
+        Point2f ptf(kpt.pt.x*scale, kpt.pt.y*scale);
+        const Mat& img = gpyr[octv*(nOctaveLayers + 3) + layer];
+        
+        calcSIFTDescriptor(img, ptf, kpt.angle, size*0.5f, d, n, descriptors.ptr<float>(i));
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+SIFT::SIFT( int _nfeatures, int _nOctaveLayers,
+           double _contrastThreshold, double _edgeThreshold, double _sigma )
+    : nfeatures(_nfeatures), nOctaveLayers(_nOctaveLayers),
+    contrastThreshold(_contrastThreshold), edgeThreshold(_edgeThreshold), sigma(_sigma)
+{
+}
+
+int SIFT::descriptorSize() const
+{
+    return SIFT_DESCR_WIDTH*SIFT_DESCR_WIDTH*SIFT_DESCR_HIST_BINS;
+}
+
+int SIFT::descriptorType() const
+{
+    return CV_32F;
+}
+
+static Algorithm* createSIFT()
+{
+    return new SIFT;
+}
+static AlgorithmInfo sift_info("Feature2D.SIFT", createSIFT);
+    
+AlgorithmInfo* SIFT::info() const
+{
+    static volatile bool initialized = false;
+    if( !initialized )
+    {
+        sift_info.addParam(this, "nFeatures", nfeatures);
+        sift_info.addParam(this, "nOctaveLayers", nOctaveLayers);
+        sift_info.addParam(this, "contrastThreshold", contrastThreshold);
+        sift_info.addParam(this, "edgeThreshold", edgeThreshold);
+        sift_info.addParam(this, "sigma", sigma);
+        
+        initialized = true;
+    }
+    return &sift_info;
+}
+
+
+void SIFT::operator()(InputArray _image, InputArray _mask,
+                      vector<KeyPoint>& keypoints) const
+{
+    (*this)(_image, _mask, keypoints, noArray());
+}
+
+
+void SIFT::operator()(InputArray _image, InputArray _mask,
+                      vector<KeyPoint>& keypoints,
+                      OutputArray _descriptors,
+                      bool useProvidedKeypoints) const
+{
+    Mat image = _image.getMat(), mask = _mask.getMat();
+    
+    if( image.empty() || image.depth() != CV_8U )
+        CV_Error( CV_StsBadArg, "image is empty or has incorrect depth (!=CV_8U)" );
+    
+    if( !mask.empty() && mask.type() != CV_8UC1 )
+        CV_Error( CV_StsBadArg, "mask has incorrect type (!=CV_8UC1)" );
+    
+    Mat base = createInitialImage(image, false, sigma);
+    vector<Mat> gpyr, dogpyr;
+    int nOctaves = cvRound(log( (double)std::min( base.cols, base.rows ) ) / log(2.) - 2);
+    
+    //double t, tf = getTickFrequency();
+    //t = (double)getTickCount();
+    buildGaussianPyramid(base, gpyr, nOctaves);
+    buildDoGPyramid(gpyr, dogpyr);
+    
+    //t = (double)getTickCount() - t;
+    //printf("pyramid construction time: %g\n", t*1000./tf);
+    
+    if( !useProvidedKeypoints )
+    {
+        //t = (double)getTickCount();
+        findScaleSpaceExtrema(gpyr, dogpyr, keypoints);
+        KeyPointsFilter::removeDuplicated( keypoints );
+        
+        if( !mask.empty() )
+            KeyPointsFilter::runByPixelsMask( keypoints, mask );
+        
+        if( nfeatures > 0 )
+            KeyPointsFilter::retainBest(keypoints, nfeatures);
+        //t = (double)getTickCount() - t;
+        //printf("keypoint detection time: %g\n", t*1000./tf);
+    }
+    else
+    {
+        // filter keypoints by mask
+        //KeyPointsFilter::runByPixelsMask( keypoints, mask );
+    }
+    
+    if( _descriptors.needed() )
+    {
+        //t = (double)getTickCount();
+        int dsize = descriptorSize();
+        _descriptors.create((int)keypoints.size(), dsize, CV_32F);
+        Mat descriptors = _descriptors.getMat();
+        
+        calcDescriptors(gpyr, keypoints, descriptors, nOctaveLayers);
+        //t = (double)getTickCount() - t;
+        //printf("descriptor extraction time: %g\n", t*1000./tf);
+    }
+}
+
+void SIFT::detectImpl( const Mat& image, vector<KeyPoint>& keypoints, const Mat& mask) const
+{
+    (*this)(image, mask, keypoints, noArray());
+}    
+    
+void SIFT::computeImpl( const Mat& image, vector<KeyPoint>& keypoints, Mat& descriptors) const
+{
+    (*this)(image, Mat(), keypoints, descriptors, true);
+}
+
+}
--- a/modules/nonfree/src/surf.cpp
+++ b/modules/nonfree/src/surf.cpp
@@ -0,0 +1,981 @@
+/* Original code has been submitted by Liu Liu. Here is the copyright.
+----------------------------------------------------------------------------------
+ * An OpenCV Implementation of SURF
+ * Further Information Refer to "SURF: Speed-Up Robust Feature"
+ * Author: Liu Liu
+ * liuliu.1987+opencv@gmail.com
+ *
+ * There are still serveral lacks for this experimental implementation:
+ * 1.The interpolation of sub-pixel mentioned in article was not implemented yet;
+ * 2.A comparision with original libSurf.so shows that the hessian detector is not a 100% match to their implementation;
+ * 3.Due to above reasons, I recommanded the original one for study and reuse;
+ *
+ * However, the speed of this implementation is something comparable to original one.
+ *
+ * Copyright© 2008, Liu Liu All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ * 	Redistributions of source code must retain the above
+ * 	copyright notice, this list of conditions and the following
+ * 	disclaimer.
+ * 	Redistributions in binary form must reproduce the above
+ * 	copyright notice, this list of conditions and the following
+ * 	disclaimer in the documentation and/or other materials
+ * 	provided with the distribution.
+ * 	The name of Contributor may not be used to endorse or
+ * 	promote products derived from this software without
+ * 	specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+ * OF SUCH DAMAGE.
+ */
+
+/* 
+   The following changes have been made, comparing to the original contribution:
+   1. A lot of small optimizations, less memory allocations, got rid of global buffers
+   2. Reversed order of cvGetQuadrangleSubPix and cvResize calls; probably less accurate, but much faster
+   3. The descriptor computing part (which is most expensive) is threaded using OpenMP
+   (subpixel-accurate keypoint localization and scale estimation are still TBD)
+*/
+
+/*
+KeyPoint position and scale interpolation has been implemented as described in
+the Brown and Lowe paper cited by the SURF paper.
+
+The sampling step along the x and y axes of the image for the determinant of the
+Hessian is now the same for each layer in an octave. While this increases the
+computation time, it ensures that a true 3x3x3 neighbourhood exists, with
+samples calculated at the same position in the layers above and below. This
+results in improved maxima detection and non-maxima suppression, and I think it
+is consistent with the description in the SURF paper.
+
+The wavelet size sampling interval has also been made consistent. The wavelet
+size at the first layer of the first octave is now 9 instead of 7. Along with
+regular position sampling steps, this makes location and scale interpolation
+easy. I think this is consistent with the SURF paper and original
+implementation.
+
+The scaling of the wavelet parameters has been fixed to ensure that the patterns
+are symmetric around the centre. Previously the truncation caused by integer
+division in the scaling ratio caused a bias towards the top left of the wavelet,
+resulting in inconsistent keypoint positions.
+
+The matrices for the determinant and trace of the Hessian are now reused in each
+octave.
+
+The extraction of the patch of pixels surrounding a keypoint used to build a
+descriptor has been simplified.
+
+KeyPoint descriptor normalisation has been changed from normalising each 4x4 
+cell (resulting in a descriptor of magnitude 16) to normalising the entire 
+descriptor to magnitude 1.
+
+The default number of octaves has been increased from 3 to 4 to match the
+original SURF binary default. The increase in computation time is minimal since
+the higher octaves are sampled sparsely.
+
+The default number of layers per octave has been reduced from 3 to 2, to prevent
+redundant calculation of similar sizes in consecutive octaves.  This decreases 
+computation time. The number of features extracted may be less, however the 
+additional features were mostly redundant.
+
+The radius of the circle of gradient samples used to assign an orientation has
+been increased from 4 to 6 to match the description in the SURF paper. This is 
+now defined by ORI_RADIUS, and could be made into a parameter.
+
+The size of the sliding window used in orientation assignment has been reduced
+from 120 to 60 degrees to match the description in the SURF paper. This is now
+defined by ORI_WIN, and could be made into a parameter.
+
+Other options like  HAAR_SIZE0, HAAR_SIZE_INC, SAMPLE_STEP0, ORI_SEARCH_INC, 
+ORI_SIGMA and DESC_SIGMA have been separated from the code and documented. 
+These could also be made into parameters.
+
+Modifications by Ian Mahon
+
+*/
+#include "precomp.hpp"
+
+bool cv::initModule_nonfree(void) { return true; }
+
+namespace cv
+{
+
+static const int   SURF_ORI_SEARCH_INC = 5;
+static const float SURF_ORI_SIGMA      = 2.5f;
+static const float SURF_DESC_SIGMA     = 3.3f;
+
+// Wavelet size at first layer of first octave.
+static const int SURF_HAAR_SIZE0 = 9;
+
+// Wavelet size increment between layers. This should be an even number,
+// such that the wavelet sizes in an octave are either all even or all odd.
+// This ensures that when looking for the neighbours of a sample, the layers
+// above and below are aligned correctly.
+static const int SURF_HAAR_SIZE_INC = 6;
+    
+    
+struct SurfHF
+{
+    int p0, p1, p2, p3;
+    float w;
+};
+
+inline float calcHaarPattern( const int* origin, const SurfHF* f, int n )
+{
+    double d = 0;
+    for( int k = 0; k < n; k++ )
+        d += (origin[f[k].p0] + origin[f[k].p3] - origin[f[k].p1] - origin[f[k].p2])*f[k].w;
+    return (float)d;
+}
+
+static void
+resizeHaarPattern( const int src[][5], SurfHF* dst, int n, int oldSize, int newSize, int widthStep )
+{
+    float ratio = (float)newSize/oldSize;
+    for( int k = 0; k < n; k++ )
+    {
+        int dx1 = cvRound( ratio*src[k][0] );
+        int dy1 = cvRound( ratio*src[k][1] );
+        int dx2 = cvRound( ratio*src[k][2] );
+        int dy2 = cvRound( ratio*src[k][3] );
+        dst[k].p0 = dy1*widthStep + dx1;
+        dst[k].p1 = dy2*widthStep + dx1;
+        dst[k].p2 = dy1*widthStep + dx2;
+        dst[k].p3 = dy2*widthStep + dx2;
+        dst[k].w = src[k][4]/((float)(dx2-dx1)*(dy2-dy1));
+    }
+}
+
+/*
+ * Calculate the determinant and trace of the Hessian for a layer of the
+ * scale-space pyramid
+ */
+static void calcLayerDetAndTrace( const Mat& sum, int size, int sampleStep,
+                                  Mat& det, Mat& trace )
+{
+    const int NX=3, NY=3, NXY=4;
+    const int dx_s[NX][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
+    const int dy_s[NY][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
+    const int dxy_s[NXY][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
+
+    SurfHF Dx[NX], Dy[NY], Dxy[NXY];
+
+    if( size > sum.rows-1 || size > sum.cols-1 )
+       return;
+
+    resizeHaarPattern( dx_s , Dx , NX , 9, size, sum.cols );
+    resizeHaarPattern( dy_s , Dy , NY , 9, size, sum.cols );
+    resizeHaarPattern( dxy_s, Dxy, NXY, 9, size, sum.cols );
+
+    /* The integral image 'sum' is one pixel bigger than the source image */
+    int samples_i = 1+(sum.rows-1-size)/sampleStep;
+    int samples_j = 1+(sum.cols-1-size)/sampleStep;
+
+    /* Ignore pixels where some of the kernel is outside the image */
+    int margin = (size/2)/sampleStep;
+
+    for( int i = 0; i < samples_i; i++ )
+    {
+        const int* sum_ptr = sum.ptr<int>(i*sampleStep);
+        float* det_ptr = &det.at<float>(i+margin, margin);
+        float* trace_ptr = &trace.at<float>(i+margin, margin);
+        for( int j = 0; j < samples_j; j++ )
+        {
+            float dx  = calcHaarPattern( sum_ptr, Dx , 3 );
+            float dy  = calcHaarPattern( sum_ptr, Dy , 3 );
+            float dxy = calcHaarPattern( sum_ptr, Dxy, 4 );
+            sum_ptr += sampleStep;
+            det_ptr[j] = dx*dy - 0.81f*dxy*dxy;
+            trace_ptr[j] = dx + dy;
+        }
+    }
+}
+
+
+/*
+ * Maxima location interpolation as described in "Invariant Features from
+ * Interest Point Groups" by Matthew Brown and David Lowe. This is performed by
+ * fitting a 3D quadratic to a set of neighbouring samples.
+ * 
+ * The gradient vector and Hessian matrix at the initial keypoint location are 
+ * approximated using central differences. The linear system Ax = b is then
+ * solved, where A is the Hessian, b is the negative gradient, and x is the 
+ * offset of the interpolated maxima coordinates from the initial estimate.
+ * This is equivalent to an iteration of Netwon's optimisation algorithm.
+ *
+ * N9 contains the samples in the 3x3x3 neighbourhood of the maxima
+ * dx is the sampling step in x
+ * dy is the sampling step in y
+ * ds is the sampling step in size
+ * point contains the keypoint coordinates and scale to be modified
+ *
+ * Return value is 1 if interpolation was successful, 0 on failure.
+ */
+static int
+interpolateKeypoint( float N9[3][9], int dx, int dy, int ds, KeyPoint& kpt )
+{
+    Matx31f b(-(N9[1][5]-N9[1][3])/2,  // Negative 1st deriv with respect to x
+              -(N9[1][7]-N9[1][1])/2,  // Negative 1st deriv with respect to y
+              -(N9[2][4]-N9[0][4])/2); // Negative 1st deriv with respect to s
+
+    Matx33f A(
+        N9[1][3]-2*N9[1][4]+N9[1][5],            // 2nd deriv x, x
+        (N9[1][8]-N9[1][6]-N9[1][2]+N9[1][0])/4, // 2nd deriv x, y
+        (N9[2][5]-N9[2][3]-N9[0][5]+N9[0][3])/4, // 2nd deriv x, s
+        (N9[1][8]-N9[1][6]-N9[1][2]+N9[1][0])/4, // 2nd deriv x, y 
+        N9[1][1]-2*N9[1][4]+N9[1][7],            // 2nd deriv y, y 
+        (N9[2][7]-N9[2][1]-N9[0][7]+N9[0][1])/4, // 2nd deriv y, s 
+        (N9[2][5]-N9[2][3]-N9[0][5]+N9[0][3])/4, // 2nd deriv x, s
+        (N9[2][7]-N9[2][1]-N9[0][7]+N9[0][1])/4, // 2nd deriv y, s
+        N9[0][4]-2*N9[1][4]+N9[2][4]);           // 2nd deriv s, s
+
+    Matx31f x = A.solve<1>(b, DECOMP_LU);
+    
+    bool ok = (x(0,0) != 0 || x(1,0) != 0 || x(2,0) != 0) &&
+        std::abs(x(0,0)) <= 1 && std::abs(x(1,0)) <= 1 && std::abs(x(2,0)) <= 1;
+    
+    if( ok )
+    {
+        kpt.pt.x += x(0,0)*dx;
+        kpt.pt.y += x(1,0)*dy;
+        kpt.size = cvRound( kpt.size + x(2,0)*ds );
+    }
+    return ok;
+}
+
+/*
+ * Find the maxima in the determinant of the Hessian in a layer of the 
+ * scale-space pyramid
+ */ 
+static void
+findMaximaInLayer( const Mat& sum, const Mat& mask_sum,
+                   const vector<Mat>& dets, const vector<Mat>& traces,
+                   const vector<int>& sizes, vector<KeyPoint>& keypoints,
+                   int octave, int layer, float hessianThreshold, int sampleStep )
+{
+    // Wavelet Data
+    const int NM=1;
+    const int dm[NM][5] = { {0, 0, 9, 9, 1} };
+    SurfHF Dm;
+
+    int size = sizes[layer];
+
+    // The integral image 'sum' is one pixel bigger than the source image
+    int layer_rows = (sum.rows-1)/sampleStep;
+    int layer_cols = (sum.cols-1)/sampleStep;
+
+    // Ignore pixels without a 3x3x3 neighbourhood in the layer above
+    int margin = (sizes[layer+1]/2)/sampleStep+1; 
+
+    if( !mask_sum.empty() )
+       resizeHaarPattern( dm, &Dm, NM, 9, size, mask_sum.cols );
+
+    int step = (int)(dets[layer].step/dets[layer].elemSize());
+    
+    for( int i = margin; i < layer_rows - margin; i++ )
+    {
+        const float* det_ptr = dets[layer].ptr<float>(i);
+        const float* trace_ptr = traces[layer].ptr<float>(i);
+        for( int j = margin; j < layer_cols-margin; j++ )
+        {
+            float val0 = det_ptr[j];
+            if( val0 > hessianThreshold )
+            {
+                /* Coordinates for the start of the wavelet in the sum image. There   
+                   is some integer division involved, so don't try to simplify this
+                   (cancel out sampleStep) without checking the result is the same */
+                int sum_i = sampleStep*(i-(size/2)/sampleStep);
+                int sum_j = sampleStep*(j-(size/2)/sampleStep);
+
+                /* The 3x3x3 neighbouring samples around the maxima. 
+                   The maxima is included at N9[1][4] */
+                
+                const float *det1 = &dets[layer-1].at<float>(i, j);
+                const float *det2 = &dets[layer].at<float>(i, j);
+                const float *det3 = &dets[layer+1].at<float>(i, j);
+                float N9[3][9] = { { det1[-step-1], det1[-step], det1[-step+1],
+                                     det1[-1]  , det1[0] , det1[1],
+                                     det1[step-1] , det1[step] , det1[step+1]  },
+                                   { det2[-step-1], det2[-step], det2[-step+1],
+                                     det2[-1]  , det2[0] , det2[1],
+                                     det2[step-1] , det2[step] , det2[step+1]  },
+                                   { det3[-step-1], det3[-step], det3[-step+1],
+                                     det3[-1]  , det3[0] , det3[1],
+                                     det3[step-1] , det3[step] , det3[step+1]  } };
+
+                /* Check the mask - why not just check the mask at the center of the wavelet? */
+                if( !mask_sum.empty() )
+                {
+                    const int* mask_ptr = &mask_sum.at<int>(sum_i, sum_j);
+                    float mval = calcHaarPattern( mask_ptr, &Dm, 1 );
+                    if( mval < 0.5 )
+                        continue;
+                }
+
+                /* Non-maxima suppression. val0 is at N9[1][4]*/
+                if( val0 > N9[0][0] && val0 > N9[0][1] && val0 > N9[0][2] &&
+                    val0 > N9[0][3] && val0 > N9[0][4] && val0 > N9[0][5] &&
+                    val0 > N9[0][6] && val0 > N9[0][7] && val0 > N9[0][8] &&
+                    val0 > N9[1][0] && val0 > N9[1][1] && val0 > N9[1][2] &&
+                    val0 > N9[1][3]                    && val0 > N9[1][5] &&
+                    val0 > N9[1][6] && val0 > N9[1][7] && val0 > N9[1][8] &&
+                    val0 > N9[2][0] && val0 > N9[2][1] && val0 > N9[2][2] &&
+                    val0 > N9[2][3] && val0 > N9[2][4] && val0 > N9[2][5] &&
+                    val0 > N9[2][6] && val0 > N9[2][7] && val0 > N9[2][8] )
+                {
+                    /* Calculate the wavelet center coordinates for the maxima */
+                    float center_i = sum_i + (size-1)*0.5f;
+                    float center_j = sum_j + (size-1)*0.5f;
+
+                    KeyPoint kpt( center_j, center_i, sizes[layer], -1, val0, octave, CV_SIGN(trace_ptr[j]) );
+
+                    /* Interpolate maxima location within the 3x3x3 neighbourhood  */
+                    int ds = size - sizes[layer-1];
+                    int interp_ok = interpolateKeypoint( N9, sampleStep, sampleStep, ds, kpt );
+
+                    /* Sometimes the interpolation step gives a negative size etc. */
+                    if( interp_ok  )
+                    {
+                        /*printf( "KeyPoint %f %f %d\n", point.pt.x, point.pt.y, point.size );*/
+                    #ifdef HAVE_TBB
+                        static tbb::mutex m;
+                        tbb::mutex::scoped_lock lock(m);
+                    #endif
+                        keypoints.push_back(kpt);
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+// Multi-threaded construction of the scale-space pyramid
+struct SURFBuildInvoker
+{
+    SURFBuildInvoker( const Mat& _sum, const vector<int>& _sizes,
+                      const vector<int>& _sampleSteps,
+                      vector<Mat>& _dets, vector<Mat>& _traces )
+    {
+        sum = &_sum;
+        sizes = &_sizes;
+        sampleSteps = &_sampleSteps;
+        dets = &_dets;
+        traces = &_traces;
+    }
+
+    void operator()(const BlockedRange& range) const
+    {
+        for( int i=range.begin(); i<range.end(); i++ )
+            calcLayerDetAndTrace( *sum, (*sizes)[i], (*sampleSteps)[i], (*dets)[i], (*traces)[i] );
+    }
+
+    const Mat *sum;
+    const vector<int> *sizes;
+    const vector<int> *sampleSteps;
+    vector<Mat>* dets;
+    vector<Mat>* traces;
+};
+
+// Multi-threaded search of the scale-space pyramid for keypoints
+struct SURFFindInvoker
+{
+    SURFFindInvoker( const Mat& _sum, const Mat& _mask_sum,
+                     const vector<Mat>& _dets, const vector<Mat>& _traces,
+                     const vector<int>& _sizes, const vector<int>& _sampleSteps,
+                     const vector<int>& _middleIndices, vector<KeyPoint>& _keypoints,
+                     int _nOctaveLayers, float _hessianThreshold )
+    {
+        sum = &_sum;
+        mask_sum = &_mask_sum;
+        dets = &_dets;
+        traces = &_traces;
+        sizes = &_sizes;
+        sampleSteps = &_sampleSteps;
+        middleIndices = &_middleIndices;
+        keypoints = &_keypoints;
+        nOctaveLayers = _nOctaveLayers;
+        hessianThreshold = _hessianThreshold;
+    }
+
+    void operator()(const BlockedRange& range) const
+    {
+        for( int i=range.begin(); i<range.end(); i++ )
+        {
+            int layer = (*middleIndices)[i];
+            int octave = i % nOctaveLayers;
+            findMaximaInLayer( *sum, *mask_sum, *dets, *traces, *sizes, 
+                               *keypoints, octave, layer, hessianThreshold,
+                               (*sampleSteps)[layer] );
+        }
+    }
+
+    const Mat *sum;
+    const Mat *mask_sum;
+    const vector<Mat>* dets;
+    const vector<Mat>* traces;
+    const vector<int>* sizes;
+    const vector<int>* sampleSteps;
+    const vector<int>* middleIndices;
+    vector<KeyPoint>* keypoints;
+    int nOctaveLayers;
+    float hessianThreshold;
+};
+
+    
+static void fastHessianDetector( const Mat& sum, const Mat& mask_sum, vector<KeyPoint>& keypoints,
+                                 int nOctaves, int nOctaveLayers, float hessianThreshold )
+{
+    /* Sampling step along image x and y axes at first octave. This is doubled
+       for each additional octave. WARNING: Increasing this improves speed,
+       however keypoint extraction becomes unreliable. */
+    const int SAMPLE_STEP0 = 1;
+
+    int nTotalLayers = (nOctaveLayers+2)*nOctaves;
+    int nMiddleLayers = nOctaveLayers*nOctaves;
+
+    vector<Mat> dets(nTotalLayers);
+    vector<Mat> traces(nTotalLayers);
+    vector<int> sizes(nTotalLayers);
+    vector<int> sampleSteps(nTotalLayers);
+    vector<int> middleIndices(nMiddleLayers);
+
+    // Allocate space and calculate properties of each layer
+    int index = 0, middleIndex = 0, step = SAMPLE_STEP0;
+    
+    for( int octave = 0; octave < nOctaves; octave++ )
+    {
+        for( int layer = 0; layer < nOctaveLayers+2; layer++ )
+        {
+            /* The integral image sum is one pixel bigger than the source image*/
+            dets[index].create( (sum.rows-1)/step, (sum.cols-1)/step, CV_32F );
+            traces[index].create( (sum.rows-1)/step, (sum.cols-1)/step, CV_32F );
+            sizes[index] = (SURF_HAAR_SIZE0 + SURF_HAAR_SIZE_INC*layer) << octave;
+            sampleSteps[index] = step;
+
+            if( 0 < layer && layer <= nOctaveLayers )
+                middleIndices[middleIndex++] = index;
+            index++;
+        }
+        step *= 2;
+    }
+
+    // Calculate hessian determinant and trace samples in each layer
+    parallel_for( BlockedRange(0, nTotalLayers),
+                      SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );
+
+    // Find maxima in the determinant of the hessian
+    parallel_for( BlockedRange(0, nMiddleLayers),
+                      SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
+                                      sampleSteps, middleIndices, keypoints,
+                                      nOctaveLayers, hessianThreshold) );
+}
+
+
+struct SURFInvoker
+{
+    enum { ORI_RADIUS = 6, ORI_WIN = 60, PATCH_SZ = 20 };
+
+    SURFInvoker( const Mat& _img, const Mat& _sum,
+                 vector<KeyPoint>& _keypoints, Mat& _descriptors,
+                 bool _extended, bool _upright )
+    {
+        keypoints = &_keypoints;
+        descriptors = &_descriptors;
+        img = &_img;
+        sum = &_sum;
+        extended = _extended;
+        upright = _upright;
+
+        // Simple bound for number of grid points in circle of radius ORI_RADIUS
+        const int nOriSampleBound = (2*ORI_RADIUS+1)*(2*ORI_RADIUS+1);
+
+        // Allocate arrays
+        apt.resize(nOriSampleBound);
+        aptw.resize(nOriSampleBound);
+        DW.resize(PATCH_SZ*PATCH_SZ);
+
+        /* Coordinates and weights of samples used to calculate orientation */
+        Mat G_ori = getGaussianKernel( 2*ORI_RADIUS+1, SURF_ORI_SIGMA, CV_32F );
+        nOriSamples = 0;
+        for( int i = -ORI_RADIUS; i <= ORI_RADIUS; i++ )
+        {
+            for( int j = -ORI_RADIUS; j <= ORI_RADIUS; j++ )
+            {
+                if( i*i + j*j <= ORI_RADIUS*ORI_RADIUS )
+                {
+                    apt[nOriSamples] = cvPoint(i,j);
+                    aptw[nOriSamples++] = G_ori.at<float>(i+ORI_RADIUS,0) * G_ori.at<float>(j+ORI_RADIUS,0);
+                }
+            }
+        }
+        CV_Assert( nOriSamples <= nOriSampleBound );
+
+        /* Gaussian used to weight descriptor samples */
+        Mat G_desc = getGaussianKernel( PATCH_SZ, SURF_DESC_SIGMA, CV_32F );
+        for( int i = 0; i < PATCH_SZ; i++ )
+        {
+            for( int j = 0; j < PATCH_SZ; j++ )
+                DW[i*PATCH_SZ+j] = G_desc.at<float>(i,0) * G_desc.at<float>(j,0);
+        }
+    }
+
+    void operator()(const BlockedRange& range) const
+    {
+        /* X and Y gradient wavelet data */
+        const int NX=2, NY=2;
+        const int dx_s[NX][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
+        const int dy_s[NY][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
+
+        // Optimisation is better using nOriSampleBound than nOriSamples for 
+        // array lengths.  Maybe because it is a constant known at compile time
+        const int nOriSampleBound =(2*ORI_RADIUS+1)*(2*ORI_RADIUS+1);
+
+        float X[nOriSampleBound], Y[nOriSampleBound], angle[nOriSampleBound];
+        uchar PATCH[PATCH_SZ+1][PATCH_SZ+1];
+        float DX[PATCH_SZ][PATCH_SZ], DY[PATCH_SZ][PATCH_SZ];
+        CvMat matX = cvMat(1, nOriSampleBound, CV_32F, X);
+        CvMat matY = cvMat(1, nOriSampleBound, CV_32F, Y);
+        CvMat _angle = cvMat(1, nOriSampleBound, CV_32F, angle);
+        Mat _patch(PATCH_SZ+1, PATCH_SZ+1, CV_8U, PATCH);
+
+        int dsize = extended ? 128 : 64;
+        
+        int k, k1 = range.begin(), k2 = range.end();
+        float maxSize = 0;
+        for( k = k1; k < k2; k++ )
+        {
+            maxSize = std::max(maxSize, (*keypoints)[k].size);
+        }
+        maxSize = cvCeil((PATCH_SZ+1)*maxSize*1.2f/9.0f);
+        Ptr<CvMat> winbuf = cvCreateMat( 1, maxSize > 0 ? maxSize*maxSize : 1, CV_8U );
+        for( k = k1; k < k2; k++ )
+        {
+            int i, j, kk, x, y, nangle;
+            float* vec;
+            SurfHF dx_t[NX], dy_t[NY];
+            KeyPoint& kp = (*keypoints)[k];
+            float size = kp.size;
+            Point2f center = kp.pt;
+            /* The sampling intervals and wavelet sized for selecting an orientation
+             and building the keypoint descriptor are defined relative to 's' */
+            float s = size*1.2f/9.0f;
+            /* To find the dominant orientation, the gradients in x and y are
+             sampled in a circle of radius 6s using wavelets of size 4s.
+             We ensure the gradient wavelet size is even to ensure the 
+             wavelet pattern is balanced and symmetric around its center */
+            int grad_wav_size = 2*cvRound( 2*s );
+            if( sum->rows < grad_wav_size || sum->cols < grad_wav_size )
+            {
+                /* when grad_wav_size is too big,
+                 * the sampling of gradient will be meaningless
+                 * mark keypoint for deletion. */
+                kp.size = -1;
+                continue;
+            }
+
+            float descriptor_dir = 90.f;
+            if (upright == 0)
+            {
+                resizeHaarPattern( dx_s, dx_t, NX, 4, grad_wav_size, sum->cols );
+                resizeHaarPattern( dy_s, dy_t, NY, 4, grad_wav_size, sum->cols );
+                for( kk = 0, nangle = 0; kk < nOriSamples; kk++ )
+                {
+                    x = cvRound( center.x + apt[kk].x*s - (float)(grad_wav_size-1)/2 );
+                    y = cvRound( center.y + apt[kk].y*s - (float)(grad_wav_size-1)/2 );
+                    if( y < 0 || y >= sum->rows - grad_wav_size ||
+                        x < 0 || x >= sum->cols - grad_wav_size )
+                        continue;
+                    const int* ptr = &sum->at<int>(y, x);
+                    float vx = calcHaarPattern( ptr, dx_t, 2 );
+                    float vy = calcHaarPattern( ptr, dy_t, 2 );
+                    X[nangle] = vx*aptw[kk];
+                    Y[nangle] = vy*aptw[kk];
+                    nangle++;
+                }
+                if( nangle == 0 )
+                {
+                    // No gradient could be sampled because the keypoint is too
+                    // near too one or more of the sides of the image. As we
+                    // therefore cannot find a dominant direction, we skip this
+                    // keypoint and mark it for later deletion from the sequence.
+                    kp.size = -1;
+                    continue;
+                }
+                matX.cols = matY.cols = _angle.cols = nangle;
+                cvCartToPolar( &matX, &matY, 0, &_angle, 1 );
+
+                float bestx = 0, besty = 0, descriptor_mod = 0;
+                for( i = 0; i < 360; i += SURF_ORI_SEARCH_INC )
+                {
+                    float sumx = 0, sumy = 0, temp_mod;
+                    for( j = 0; j < nangle; j++ )
+                    {
+                        int d = std::abs(cvRound(angle[j]) - i);
+                        if( d < ORI_WIN/2 || d > 360-ORI_WIN/2 )
+                        {
+                            sumx += X[j];
+                            sumy += Y[j];
+                        }
+                    }
+                    temp_mod = sumx*sumx + sumy*sumy;
+                    if( temp_mod > descriptor_mod )
+                    {
+                        descriptor_mod = temp_mod;
+                        bestx = sumx;
+                        besty = sumy;
+                    }
+                }
+                descriptor_dir = fastAtan2( besty, bestx );
+            }
+            kp.angle = descriptor_dir;
+            if( !descriptors || !descriptors->data )
+                continue;
+            
+            /* Extract a window of pixels around the keypoint of size 20s */
+            int win_size = (int)((PATCH_SZ+1)*s);
+            CV_Assert( winbuf->cols >= win_size*win_size );
+            Mat win(win_size, win_size, CV_8U, winbuf->data.ptr);
+
+            if( !upright )
+            {
+            	descriptor_dir *= (float)(CV_PI/180);
+                float sin_dir = std::sin(descriptor_dir);
+                float cos_dir = std::cos(descriptor_dir);
+
+                /* Subpixel interpolation version (slower). Subpixel not required since
+                the pixels will all get averaged when we scale down to 20 pixels */
+                /*  
+                float w[] = { cos_dir, sin_dir, center.x,
+                -sin_dir, cos_dir , center.y };
+                CvMat W = cvMat(2, 3, CV_32F, w);
+                cvGetQuadrangleSubPix( img, &win, &W );
+                */
+
+                // Nearest neighbour version (faster)
+                float win_offset = -(float)(win_size-1)/2;
+                float start_x = center.x + win_offset*cos_dir + win_offset*sin_dir;
+                float start_y = center.y - win_offset*sin_dir + win_offset*cos_dir;
+                uchar* WIN = win.data;
+                for( i = 0; i < win_size; i++, start_x += sin_dir, start_y += cos_dir )
+                {
+                    float pixel_x = start_x;
+                    float pixel_y = start_y;
+                    for( j = 0; j < win_size; j++, pixel_x += cos_dir, pixel_y -= sin_dir )
+                    {
+                        int x = std::min(std::max(cvRound(pixel_x), 0), img->cols-1);
+                        int y = std::min(std::max(cvRound(pixel_y), 0), img->rows-1);
+                        WIN[i*win_size + j] = img->at<uchar>(y, x);
+                    }
+                }
+            }
+            else
+            {
+                // extract rect - slightly optimized version of the code above
+                // TODO: find faster code, as this is simply an extract rect operation, 
+                //       e.g. by using cvGetSubRect, problem is the border processing
+                // descriptor_dir == 90 grad
+                // sin_dir == 1
+                // cos_dir == 0
+                
+                float win_offset = -(float)(win_size-1)/2;
+                int start_x = cvRound(center.x + win_offset);
+                int start_y = cvRound(center.y - win_offset);
+                uchar* WIN = win.data;
+                for( i = 0; i < win_size; i++, start_x++ )
+                {
+                    int pixel_x = start_x;
+                    int pixel_y = start_y;
+                    for( j = 0; j < win_size; j++, pixel_y-- )
+                    {
+                        x = MAX( pixel_x, 0 );
+                        y = MAX( pixel_y, 0 );
+                        x = MIN( x, img->cols-1 );
+                        y = MIN( y, img->rows-1 );
+                        WIN[i*win_size + j] = img->at<uchar>(y, x);
+                    }
+                }               
+            }
+            // Scale the window to size PATCH_SZ so each pixel's size is s. This
+            // makes calculating the gradients with wavelets of size 2s easy
+            resize(win, _patch, _patch.size(), 0, 0, INTER_AREA);
+
+            // Calculate gradients in x and y with wavelets of size 2s
+            for( i = 0; i < PATCH_SZ; i++ )
+                for( j = 0; j < PATCH_SZ; j++ )
+                {
+                    float dw = DW[i*PATCH_SZ + j];
+                    float vx = (PATCH[i][j+1] - PATCH[i][j] + PATCH[i+1][j+1] - PATCH[i+1][j])*dw;
+                    float vy = (PATCH[i+1][j] - PATCH[i][j] + PATCH[i+1][j+1] - PATCH[i][j+1])*dw;
+                    DX[i][j] = vx;
+                    DY[i][j] = vy;
+                }
+
+            // Construct the descriptor
+            vec = descriptors->ptr<float>(k);
+            for( kk = 0; kk < dsize; kk++ )
+                vec[kk] = 0;
+            double square_mag = 0;
+            if( extended )
+            {
+                // 128-bin descriptor
+                for( i = 0; i < 4; i++ )
+                    for( j = 0; j < 4; j++ )
+                    {
+                        for( y = i*5; y < i*5+5; y++ )
+                        {
+                            for( x = j*5; x < j*5+5; x++ )
+                            {
+                                float tx = DX[y][x], ty = DY[y][x];
+                                if( ty >= 0 )
+                                {
+                                    vec[0] += tx;
+                                    vec[1] += (float)fabs(tx);
+                                } else {
+                                    vec[2] += tx;
+                                    vec[3] += (float)fabs(tx);
+                                }
+                                if ( tx >= 0 )
+                                {
+                                    vec[4] += ty;
+                                    vec[5] += (float)fabs(ty);
+                                } else {
+                                    vec[6] += ty;
+                                    vec[7] += (float)fabs(ty);
+                                }
+                            }
+                        }
+                        for( kk = 0; kk < 8; kk++ )
+                            square_mag += vec[kk]*vec[kk];
+                        vec += 8;
+                    }
+            }
+            else
+            {
+                // 64-bin descriptor
+                for( i = 0; i < 4; i++ )
+                    for( j = 0; j < 4; j++ )
+                    {
+                        for( y = i*5; y < i*5+5; y++ )
+                        {
+                            for( x = j*5; x < j*5+5; x++ )
+                            {
+                                float tx = DX[y][x], ty = DY[y][x];
+                                vec[0] += tx; vec[1] += ty;
+                                vec[2] += (float)fabs(tx); vec[3] += (float)fabs(ty);
+                            }
+                        }
+                        for( kk = 0; kk < 4; kk++ )
+                            square_mag += vec[kk]*vec[kk];
+                        vec+=4;
+                    }
+            }
+
+            // unit vector is essential for contrast invariance
+            vec = descriptors->ptr<float>(k);
+            float scale = (float)(1./(sqrt(square_mag) + DBL_EPSILON));
+            for( kk = 0; kk < dsize; kk++ )
+                vec[kk] *= scale;
+        }
+    }
+
+    // Parameters
+    const Mat* img;
+    const Mat* sum;
+    vector<KeyPoint>* keypoints;
+    Mat* descriptors;
+    bool extended;
+    bool upright;
+
+    // Pre-calculated values
+    int nOriSamples;
+    vector<Point> apt;
+    vector<float> aptw;
+    vector<float> DW;
+};
+
+
+SURF::SURF()
+{
+    hessianThreshold = 100;
+    extended = true;
+    upright = false;
+    nOctaves = 4;
+    nOctaveLayers = 2;
+}
+
+SURF::SURF(double _threshold, bool _extended, bool _upright, int _nOctaves, int _nOctaveLayers)
+{
+    hessianThreshold = _threshold;
+    extended = _extended;
+    upright = _upright;
+    nOctaves = _nOctaves;
+    nOctaveLayers = _nOctaveLayers;
+}
+
+int SURF::descriptorSize() const { return extended ? 128 : 64; }
+int SURF::descriptorType() const { return CV_32F; }
+
+void SURF::operator()(InputArray imgarg, InputArray maskarg,
+                      CV_OUT vector<KeyPoint>& keypoints) const
+{
+    (*this)(imgarg, maskarg, keypoints, noArray(), false);
+}
+    
+void SURF::operator()(InputArray _img, InputArray _mask,
+                      CV_OUT vector<KeyPoint>& keypoints,
+                      OutputArray _descriptors,
+                      bool useProvidedKeypoints) const
+{
+    Mat img = _img.getMat(), mask = _mask.getMat(), mask1, sum, msum;
+    bool doDescriptors = _descriptors.needed();
+    
+    CV_Assert(!img.empty() && img.depth() == CV_8U);
+    if( img.channels() > 1 )
+        cvtColor(img, img, COLOR_BGR2GRAY);
+    
+    CV_Assert(mask.empty() || (mask.type() == CV_8U && mask.size() == img.size()));
+    CV_Assert(hessianThreshold >= 0);
+    CV_Assert(nOctaves > 0);
+    CV_Assert(nOctaveLayers > 0);
+    
+    integral(img, sum, CV_32S);
+    
+    // Compute keypoints only if we are not asked for evaluating the descriptors are some given locations:
+    if( !useProvidedKeypoints )
+    {
+        if( !mask.empty() )
+        {
+            cv::min(mask, 1, mask1);
+            integral(mask1, msum, CV_32S);
+        }
+        fastHessianDetector( sum, msum, keypoints, nOctaves, nOctaveLayers, hessianThreshold );
+    }
+    
+    int i, j, N = (int)keypoints.size();
+    if( N > 0 )
+    {
+        Mat descriptors;
+        if( doDescriptors )
+        {
+            _descriptors.create((int)keypoints.size(), (extended ? 128 : 64), CV_32F);
+            descriptors = _descriptors.getMat();
+        }
+        
+        parallel_for(BlockedRange(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );
+        
+        size_t dsize = descriptors.cols*descriptors.elemSize();
+    
+        // remove keypoints that were marked for deletion
+        for( i = j = 0; i < N; i++ )
+        {
+            if( keypoints[i].size > 0 )
+            {
+                if( i > j )
+                {
+                    keypoints[j] = keypoints[i];
+                    if( doDescriptors )
+                        memcpy( descriptors.ptr(j), descriptors.ptr(i), dsize);
+                }
+                j++;
+            }
+        }
+        if( N > j )
+        {
+            N = j;
+            keypoints.resize(N);
+            if( doDescriptors )
+            {
+                Mat d = descriptors.rowRange(0, N);
+                d.copyTo(_descriptors);
+            }
+        }
+    }
+}
+
+
+void SURF::detectImpl( const Mat& image, vector<KeyPoint>& keypoints, const Mat& mask) const
+{
+    (*this)(image, mask, keypoints, noArray(), false);
+}    
+
+void SURF::computeImpl( const Mat& image, vector<KeyPoint>& keypoints, Mat& descriptors) const
+{
+    (*this)(image, Mat(), keypoints, descriptors, true);
+}    
+
+static Algorithm* createSURF()
+{
+    return new SURF;
+}
+static AlgorithmInfo surf_info("Feature2D.SURF", createSURF);
+
+AlgorithmInfo* SURF::info() const
+{
+    static volatile bool initialized = false;
+    if( !initialized )
+    {
+        surf_info.addParam(this, "hessianThreshold", hessianThreshold);
+        surf_info.addParam(this, "nOctaves", nOctaves);
+        surf_info.addParam(this, "nOctaveLayers", nOctaveLayers);
+        surf_info.addParam(this, "extended", extended);
+        surf_info.addParam(this, "upright", upright);
+        
+        initialized = true;
+    }
+    return &surf_info;
+}
+    
+/*
+ 
+ // SurfFeatureDetector
+    SurfFeatureDetector::SurfFeatureDetector( double hessianThreshold, int octaves, int octaveLayers, bool upright )
+    : surf(hessianThreshold, octaves, octaveLayers, false, upright)
+    {}
+    
+    void SurfFeatureDetector::read (const FileNode& fn)
+    {
+        double hessianThreshold = fn["hessianThreshold"];
+        int octaves = fn["octaves"];
+        int octaveLayers = fn["octaveLayers"];
+        bool upright = (int)fn["upright"] != 0;
+        
+        surf = SURF( hessianThreshold, octaves, octaveLayers, false, upright );
+    }
+    
+    void SurfFeatureDetector::write (FileStorage& fs) const
+    {
+        //fs << "algorithm" << getAlgorithmName ();
+        
+        fs << "hessianThreshold" << surf.hessianThreshold;
+        fs << "octaves" << surf.nOctaves;
+        fs << "octaveLayers" << surf.nOctaveLayers;
+        fs << "upright" << surf.upright;
+    }
+    
+    void SurfFeatureDetector::detectImpl( const Mat& image, vector<KeyPoint>& keypoints, const Mat& mask ) const
+    {
+        Mat grayImage = image;
+        if( image.type() != CV_8U ) cvtColor( image, grayImage, CV_BGR2GRAY );
+        
+        surf(grayImage, mask, keypoints);
+    }
+    
+    
+ 
+*/
+}