From 47f72b538f3a03c9244d6fdb8aef5a46f6bdf576 Mon Sep 17 00:00:00 2001
From: Andrey Kamaev
Date: Mon, 21 May 2012 13:07:53 +0000
Subject: [PATCH] Added new performance tests for calcOpticalFlowPyrLK and
 buildOpticalFlowPyramid; extracted private header from lkpyramid.cpp

---
 modules/video/perf/perf_optflowpyrlk.cpp | 124 ++++-
 modules/video/src/lkpyramid.cpp          | 631 +++++++++++------------
 modules/video/src/lkpyramid.hpp          |  36 ++
 3 files changed, 467 insertions(+), 324 deletions(-)
 create mode 100644 modules/video/src/lkpyramid.hpp

diff --git a/modules/video/perf/perf_optflowpyrlk.cpp b/modules/video/perf/perf_optflowpyrlk.cpp
index 75fcb926b..2ef5f3c13 100644
--- a/modules/video/perf/perf_optflowpyrlk.cpp
+++ b/modules/video/perf/perf_optflowpyrlk.cpp
@@ -28,12 +28,12 @@ void FormTrackingPointsArray(vector<Point2f>& points, int width, int height, int
     }
 }
 
-PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK, testing::Combine(
+PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK_full, testing::Combine(
                 testing::Values("cv/optflow/frames/VGA_%02d.png", "cv/optflow/frames/720p_%02d.jpg"),
-                testing::Range(0, 3),
+                testing::Range(1, 3),
                 testing::Values(1, 3, 4),
                 testing::Values(make_tuple(9, 9), make_tuple(15, 15)),
-                testing::Values(7, 11, 21, 25)
+                testing::Values(7, 11, 25)
                 )
             )
 {
@@ -48,6 +48,7 @@ PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK, testing::Combine(
     int nPointsX = min(get<0>(get<3>(GetParam())), img1.cols);
     int nPointsY = min(get<1>(get<3>(GetParam())), img1.rows);
     int winSize = get<4>(GetParam());
+    int maxLevel = 2;
 
     TermCriteria criteria(CV_TERMCRIT_ITER|CV_TERMCRIT_EPS, 7, 0.001);
     int flags = 0;
@@ -91,3 +92,120 @@ PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK, testing::Combine(
                                 flags, minEigThreshold);
     }
 }
+
+typedef tr1::tuple<std::string, int, int, tr1::tuple<int, int>, int, bool> Path_Idx_Cn_NPoints_WSize_Deriv_t;
+typedef TestBaseWithParam<Path_Idx_Cn_NPoints_WSize_Deriv_t> Path_Idx_Cn_NPoints_WSize_Deriv;
+
+PERF_TEST_P(Path_Idx_Cn_NPoints_WSize_Deriv, OpticalFlowPyrLK_self, testing::Combine(
+                testing::Values("cv/optflow/frames/VGA_%02d.png", "cv/optflow/frames/720p_%02d.jpg"),
+                testing::Range(1, 3),
+                testing::Values(1, 3, 4),
+                testing::Values(make_tuple(9, 9), make_tuple(15, 15)),
+                testing::Values(7, 11, 25),
+                testing::Bool()
+                )
+            )
+{
+    string filename1 = getDataPath(cv::format(get<0>(GetParam()).c_str(), get<1>(GetParam())));
+    string filename2 = getDataPath(cv::format(get<0>(GetParam()).c_str(), get<1>(GetParam()) + 1));
+    Mat img1 = imread(filename1);
+    Mat img2 = imread(filename2);
+    if (img1.empty()) FAIL() << "Unable to load source image " << filename1;
+    if (img2.empty()) FAIL() << "Unable to load source image " << filename2;
+
+    int cn = get<2>(GetParam());
+    int nPointsX = min(get<0>(get<3>(GetParam())), img1.cols);
+    int nPointsY = min(get<1>(get<3>(GetParam())), img1.rows);
+    int winSize = get<4>(GetParam());
+    bool withDerivatives = get<5>(GetParam());
+
+    int maxLevel = 2;
+    TermCriteria criteria(CV_TERMCRIT_ITER|CV_TERMCRIT_EPS, 7, 0.001);
+    int flags = 0;
+    double minEigThreshold = 1e-4;
+
+    Mat frame1, frame2;
+    switch(cn)
+    {
+    case 1:
+        cvtColor(img1, frame1, COLOR_BGR2GRAY, cn);
+        cvtColor(img2, frame2, COLOR_BGR2GRAY, cn);
+        break;
+    case 3:
+        frame1 = img1;
+        frame2 = img2;
+        break;
+    case 4:
+        cvtColor(img1, frame1, COLOR_BGR2BGRA, cn);
+        cvtColor(img2, frame2, COLOR_BGR2BGRA, cn);
+        break;
+    default:
+        FAIL() << "Unexpected number of channels: " << cn;
+    }
+
+    vector<Point2f> inPoints;
+    vector<Point2f> outPoints;
+    vector<uchar> status;
+    vector<float> err;
+
+    FormTrackingPointsArray(inPoints, frame1.cols, frame1.rows, nPointsX, nPointsY);
+    outPoints.resize(inPoints.size());
+    status.resize(inPoints.size());
+    err.resize(inPoints.size());
+
+    std::vector<Mat> pyramid1, pyramid2;
+
+    maxLevel = buildOpticalFlowPyramid(frame1, pyramid1, Size(winSize, winSize), maxLevel, withDerivatives);
+    maxLevel = buildOpticalFlowPyramid(frame2, pyramid2, Size(winSize, winSize), maxLevel, withDerivatives);
+
+    declare.in(pyramid1, pyramid2, inPoints).out(outPoints);
+
+    TEST_CYCLE()
+    {
+        calcOpticalFlowPyrLK(pyramid1, pyramid2, inPoints, outPoints, status, err,
+                             Size(winSize, winSize), maxLevel, criteria,
+                             flags, minEigThreshold);
+    }
+}
+
+CV_ENUM(PyrBorderMode, BORDER_DEFAULT, BORDER_TRANSPARENT);
+typedef tr1::tuple<std::string, int, bool, PyrBorderMode, bool> Path_Win_Deriv_Border_Reuse_t;
+typedef TestBaseWithParam<Path_Win_Deriv_Border_Reuse_t> Path_Win_Deriv_Border_Reuse;
+
+PERF_TEST_P(Path_Win_Deriv_Border_Reuse, OpticalFlowPyrLK_pyr, testing::Combine(
+                testing::Values("cv/optflow/frames/720p_01.jpg"),
+                testing::Values(7, 11),
+                testing::Bool(),
+                testing::ValuesIn(PyrBorderMode::all()),
+                testing::Bool()
+                )
+            )
+{
+    string filename = getDataPath(get<0>(GetParam()));
+    Mat img = imread(filename);
+    Size winSize(get<1>(GetParam()), get<1>(GetParam()));
+    bool withDerivatives = get<2>(GetParam());
+    int derivBorder = get<3>(GetParam());
+    int pyrBorder = derivBorder;
+    if(derivBorder != BORDER_TRANSPARENT)
+    {
+        derivBorder = BORDER_CONSTANT;
+        pyrBorder = BORDER_REFLECT_101;
+    }
+    bool tryReuseInputImage = get<4>(GetParam());
+    std::vector<Mat> pyramid;
+
+    img.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
+
+    int maxLevel = buildOpticalFlowPyramid(img, pyramid, winSize, 1000, withDerivatives, BORDER_CONSTANT, BORDER_CONSTANT, tryReuseInputImage);
+
+    declare.in(img).out(pyramid);
+
+    TEST_CYCLE()
+    {
+        buildOpticalFlowPyramid(img, pyramid, winSize, maxLevel, withDerivatives, pyrBorder, derivBorder, tryReuseInputImage);
+    }
+
+    SANITY_CHECK(pyramid);
+}
\ No newline at end of file
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 0a4f44b62..8c4370673 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -41,17 +41,22 @@
 #include "precomp.hpp"
 #include <float.h>
 #include <stdio.h>
+#include "lkpyramid.hpp"
 
-namespace cv
+namespace
 {
-
-typedef short deriv_type;
-
-static void calcSharrDeriv(const Mat& src, Mat& dst)
+static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
 {
+    using namespace cv;
+    using cv::detail::deriv_type;
     int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
     CV_Assert(depth == CV_8U);
     dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
+
+#ifdef HAVE_TEGRA_OPTIMIZATION
+    if (tegra::calcSharrDeriv(src, dst))
+        return;
+#endif
 
     int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
     AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
@@ -127,373 +132,355 @@ static void calcSharrDeriv(const Mat& src, Mat& dst)
     }
 }
 
+}//namespace
 
-struct LKTrackerInvoker
-{
-    LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
+cv::detail::LKTrackerInvoker::LKTrackerInvoker(
+                      const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                       const Point2f* _prevPts, Point2f* _nextPts,
                       uchar* _status, float* _err,
                       Size _winSize, TermCriteria _criteria,
                       int _level, int _maxLevel, int _flags, float _minEigThreshold )
-    {
-        prevImg = &_prevImg;
-        prevDeriv = &_prevDeriv;
-        nextImg = &_nextImg;
-        prevPts = _prevPts;
-        nextPts = _nextPts;
-        status = _status;
-        err = _err;
-        winSize = _winSize;
-        criteria = _criteria;
-        level = _level;
-        maxLevel = _maxLevel;
-        flags
= _flags; - minEigThreshold = _minEigThreshold; - } +{ + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + winSize = _winSize; + criteria = _criteria; + level = _level; + maxLevel = _maxLevel; + flags = _flags; + minEigThreshold = _minEigThreshold; +} + +void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const +{ + Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f); + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; - void operator()(const BlockedRange& range) const + int j, cn = I.channels(), cn2 = cn*2; + cv::AutoBuffer _buf(winSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf); + Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn); + + for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ ) { - Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f); - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - - int j, cn = I.channels(), cn2 = cn*2; - cv::AutoBuffer _buf(winSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; - - Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf); - Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn); - - for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ ) + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( flags & OPTFLOW_USE_INITIAL_FLOW ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } + if( flags & OPTFLOW_USE_INITIAL_FLOW ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) { - if( level == 0 ) - { - if( status ) - status[ptidx] = false; - if( err ) - err[ptidx] = 0; - } - continue; + if( status ) + status[ptidx] = false; + if( err ) + err[ptidx] = 0; } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + const float FLT_SCALE = 1.f/(1 << 20); + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + +#if CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = 
_mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const deriv_type* dsrc = (const deriv_type*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - const float FLT_SCALE = 1.f/(1 << 20); - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + deriv_type* Iptr = (deriv_type*)(IWinBuf.data + y*IWinBuf.step); + deriv_type* dIptr = (deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step); - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; + x = 0; #if CV_SSE2 - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i v00, v01, v10, v11, t0, t1; + + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
+ + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } +#endif + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + } + } + +#if CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winSize.width*winSize.height); + + if( err && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) != 0 ) + err[ptidx] = (float)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = false; + continue; + } + + D = 1.f/D; + + nextPt -= halfWin; + Point2f prevDelta; + + for( j = 0; j < criteria.maxCount; j++ ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = false; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1 = 0, b2 = 0; +#if CV_SSE2 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; for( y = 0; y < winSize.height; y++ ) { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const deriv_type* dsrc = (const deriv_type*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - deriv_type* Iptr = (deriv_type*)(IWinBuf.data + y*IWinBuf.step); - deriv_type* dIptr = (deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step); + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn; + const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step); + const deriv_type* dIptr = (const deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step); x = 0; #if CV_SSE2 - for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 ) { - __m128i v00, v01, v10, v11, t0, t1; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; + __m128i v00 = 
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - - _mm_storeu_si128((__m128i*)dIptr, v00); - t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); - qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); - qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + diff1 = _mm_unpackhi_epi16(diff0, diff0); + diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ... + v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
+ v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + v10 = _mm_mullo_epi16(v00, diff0); + v11 = _mm_mulhi_epi16(v00, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + v10 = _mm_mullo_epi16(v01, diff1); + v11 = _mm_mulhi_epi16(v01, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); } #endif - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - - A11 += (float)(ixval*ixval); - A12 += (float)(ixval*iyval); - A22 += (float)(iyval*iyval); + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5) - Iptr[x]; + b1 += (float)(diff*dIptr[0]); + b2 += (float)(diff*dIptr[1]); } } #if CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; - _mm_store_ps(A11buf, qA11); - _mm_store_ps(A12buf, qA12); - _mm_store_ps(A22buf, qA22); - A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; #endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; + Point2f delta( (float)((A12*b2 - A22*b1) * D), + (float)((A12*b1 - A11*b2) * D)); + //delta = -delta; - float D = A11*A22 - A12*A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winSize.width*winSize.height); + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; - if( err && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) != 0 ) - err[ptidx] = (float)minEig; + if( delta.ddot(delta) <= criteria.epsilon ) + break; - if( minEig < minEigThreshold || D < FLT_EPSILON ) + if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 && + std::abs(delta.y + prevDelta.y) < 0.01 ) { - if( level == 0 && status ) + nextPts[ptidx] -= delta*0.5f; + break; + } + prevDelta = delta; + } + + if( status[ptidx] && err && level == 0 && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) == 0 ) + { + Point2f nextPt = nextPts[ptidx] - halfWin; + Point inextPt; + + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( status ) status[ptidx] = false; continue; } - D = 1.f/D; + float a = nextPt.x - inextPt.x; + float b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float errval = 0.f; - nextPt -= halfWin; - Point2f prevDelta; - - for( j = 0; j < criteria.maxCount; j++ ) + for( y = 0; y < winSize.height; y++ ) { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + const uchar* Jptr = (const 
uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn; + const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step); - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) + for( x = 0; x < winSize.width*cn; x++ ) { - if( level == 0 && status ) - status[ptidx] = false; - break; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5) - Iptr[x]; + errval += std::abs((float)diff); } - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - float b1 = 0, b2 = 0; -#if CV_SSE2 - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); -#endif - - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn; - const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step); - const deriv_type* dIptr = (const deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step); - - x = 0; - -#if CV_SSE2 - for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 ) - { - __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); - diff1 = _mm_unpackhi_epi16(diff0, diff0); - diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ... - v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
- v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - v10 = _mm_mullo_epi16(v00, diff0); - v11 = _mm_mulhi_epi16(v00, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(v01, diff1); - v11 = _mm_mulhi_epi16(v01, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - } -#endif - - for( ; x < winSize.width*cn; x++, dIptr += 2 ) - { - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5) - Iptr[x]; - b1 += (float)(diff*dIptr[0]); - b2 += (float)(diff*dIptr[1]); - } - } - -#if CV_SSE2 - float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 += bbuf[0] + bbuf[2]; - b2 += bbuf[1] + bbuf[3]; -#endif - - b1 *= FLT_SCALE; - b2 *= FLT_SCALE; - - Point2f delta( (float)((A12*b2 - A22*b1) * D), - (float)((A12*b1 - A11*b2) * D)); - //delta = -delta; - - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - - if( delta.ddot(delta) <= criteria.epsilon ) - break; - - if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 && - std::abs(delta.y + prevDelta.y) < 0.01 ) - { - nextPts[ptidx] -= delta*0.5f; - break; - } - prevDelta = delta; - } - - if( status[ptidx] && err && level == 0 && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) == 0 ) - { - Point2f nextPt = nextPts[ptidx] - halfWin; - Point inextPt; - - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( status ) - status[ptidx] = false; - continue; - } - - float a = nextPt.x - inextPt.x; - float b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - float errval = 0.f; - - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn; - const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step); - - for( x = 0; x < winSize.width*cn; x++ ) - { - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5) - Iptr[x]; - errval += std::abs((float)diff); - } - } - err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height); } + err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height); } } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - Size winSize; - TermCriteria criteria; - int level; - int maxLevel; - int flags; - float minEigThreshold; -}; - } - - + int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, int pyrBorder, int derivBorder, bool tryReuseInputImage) { @@ -503,7 +490,7 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true, 0); - int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); //level 0 bool lvl0IsSet = false; @@ -602,7 +589,7 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg, return; #endif Mat 
prevPtsMat = _prevPts.getMat();
-    const int derivDepth = DataType<deriv_type>::depth;
+    const int derivDepth = DataType<cv::detail::deriv_type>::depth;
 
     CV_Assert( maxLevel >= 0 && winSize.width > 2 && winSize.height > 2 );
 
@@ -744,6 +731,8 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
         CV_Assert(prevPyr[level * lvlStep1].size() == nextPyr[level * lvlStep2].size());
         CV_Assert(prevPyr[level * lvlStep1].type() == nextPyr[level * lvlStep2].type());
 
+        typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
+
         parallel_for(BlockedRange(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
                                                                 nextPyr[level * lvlStep2], prevPts, nextPts,
                                                                 status, err,
diff --git a/modules/video/src/lkpyramid.hpp b/modules/video/src/lkpyramid.hpp
new file mode 100644
index 000000000..03a51af37
--- /dev/null
+++ b/modules/video/src/lkpyramid.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+namespace cv
+{
+namespace detail
+{
+
+    typedef short deriv_type;
+
+    struct LKTrackerInvoker
+    {
+        LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
+                          const Point2f* _prevPts, Point2f* _nextPts,
+                          uchar* _status, float* _err,
+                          Size _winSize, TermCriteria _criteria,
+                          int _level, int _maxLevel, int _flags, float _minEigThreshold );
+
+        void operator()(const BlockedRange& range) const;
+
+        const Mat* prevImg;
+        const Mat* nextImg;
+        const Mat* prevDeriv;
+        const Point2f* prevPts;
+        Point2f* nextPts;
+        uchar* status;
+        float* err;
+        Size winSize;
+        TermCriteria criteria;
+        int level;
+        int maxLevel;
+        int flags;
+        float minEigThreshold;
+    };
+
+}// namespace detail
+}// namespace cv
\ No newline at end of file
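Note (editor's addition, not part of the patch): the new OpticalFlowPyrLK_self test measures the path where the caller builds the image pyramids once with buildOpticalFlowPyramid and passes them directly to calcOpticalFlowPyrLK, so pyramid construction stays out of the timed loop. The sketch below shows that usage outside the perf framework; the input file names, the grid spacing of tracked points, and the window size are illustrative placeholders rather than values taken from the patch.

// Minimal sketch (OpenCV 2.4-era API): precompute pyramids, then track with them.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/video/tracking.hpp>
#include <vector>

int main()
{
    // Placeholder frame names; any two consecutive frames will do.
    cv::Mat frame1 = cv::imread("frame_a.png", 0);   // 0 = load as grayscale
    cv::Mat frame2 = cv::imread("frame_b.png", 0);
    if (frame1.empty() || frame2.empty())
        return 1;

    // Regular grid of points to track, analogous to FormTrackingPointsArray in the test.
    std::vector<cv::Point2f> inPoints;
    for (int y = 8; y < frame1.rows; y += 16)
        for (int x = 8; x < frame1.cols; x += 16)
            inPoints.push_back(cv::Point2f((float)x, (float)y));

    cv::Size winSize(11, 11);
    int maxLevel = 2;
    bool withDerivatives = true;   // also precompute the per-level derivatives

    // Build both pyramids once; the return value is the number of levels actually
    // built, and it is fed back so both pyramids and the tracker agree on maxLevel.
    std::vector<cv::Mat> pyr1, pyr2;
    maxLevel = cv::buildOpticalFlowPyramid(frame1, pyr1, winSize, maxLevel, withDerivatives);
    maxLevel = cv::buildOpticalFlowPyramid(frame2, pyr2, winSize, maxLevel, withDerivatives);

    // Track using the prebuilt pyramids; the original frames are not touched again.
    std::vector<cv::Point2f> outPoints;
    std::vector<uchar> status;
    std::vector<float> err;
    cv::calcOpticalFlowPyrLK(pyr1, pyr2, inPoints, outPoints, status, err,
                             winSize, maxLevel,
                             cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 7, 0.001),
                             0 /*flags*/, 1e-4 /*minEigThreshold*/);
    return 0;
}

When the same frame serves as the "next" image of one pair and the "previous" image of the following pair, its pyramid can be reused across both calls; that saving is what the _self variant is meant to expose relative to the _full variant, which hands raw frames to calcOpticalFlowPyrLK.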