From 47f72b538f3a03c9244d6fdb8aef5a46f6bdf576 Mon Sep 17 00:00:00 2001
From: Andrey Kamaev
Date: Mon, 21 May 2012 13:07:53 +0000
Subject: [PATCH] Added new performance tests for calcOpticalFlowPyrLK and
 buildOpticalFlowPyramid; extracted private header from lkpyramid.cpp

---
 modules/video/perf/perf_optflowpyrlk.cpp | 124 ++++-
 modules/video/src/lkpyramid.cpp          | 631 +++++++++++------------
 modules/video/src/lkpyramid.hpp          |  36 ++
 3 files changed, 467 insertions(+), 324 deletions(-)
 create mode 100644 modules/video/src/lkpyramid.hpp

diff --git a/modules/video/perf/perf_optflowpyrlk.cpp b/modules/video/perf/perf_optflowpyrlk.cpp
index 75fcb926b..2ef5f3c13 100644
--- a/modules/video/perf/perf_optflowpyrlk.cpp
+++ b/modules/video/perf/perf_optflowpyrlk.cpp
@@ -28,12 +28,12 @@ void FormTrackingPointsArray(vector<Point2f>& points, int width, int height, int
     }
 }
 
-PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK, testing::Combine(
+PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK_full, testing::Combine(
                 testing::Values("cv/optflow/frames/VGA_%02d.png", "cv/optflow/frames/720p_%02d.jpg"),
-                testing::Range(0, 3),
+                testing::Range(1, 3),
                 testing::Values(1, 3, 4),
                 testing::Values(make_tuple(9, 9), make_tuple(15, 15)),
-                testing::Values(7, 11, 21, 25)
+                testing::Values(7, 11, 25)
                 )
             )
 {
@@ -48,6 +48,7 @@ PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK, testing::Combine(
     int nPointsX = min(get<0>(get<3>(GetParam())), img1.cols);
     int nPointsY = min(get<1>(get<3>(GetParam())), img1.rows);
     int winSize = get<4>(GetParam());
+    int maxLevel = 2;
 
     TermCriteria criteria(CV_TERMCRIT_ITER|CV_TERMCRIT_EPS, 7, 0.001);
     int flags = 0;
@@ -91,3 +92,120 @@ PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK, testing::Combine(
                                 flags, minEigThreshold);
     }
 }
+
+typedef tr1::tuple<std::string, int, int, tr1::tuple<int, int>, int, bool> Path_Idx_Cn_NPoints_WSize_Deriv_t;
+typedef TestBaseWithParam<Path_Idx_Cn_NPoints_WSize_Deriv_t> Path_Idx_Cn_NPoints_WSize_Deriv;
+
+PERF_TEST_P(Path_Idx_Cn_NPoints_WSize_Deriv, OpticalFlowPyrLK_self, testing::Combine(
+                testing::Values("cv/optflow/frames/VGA_%02d.png", "cv/optflow/frames/720p_%02d.jpg"),
+                testing::Range(1, 3),
+                testing::Values(1, 3, 4),
+                testing::Values(make_tuple(9, 9), make_tuple(15, 15)),
+                testing::Values(7, 11, 25),
+                testing::Bool()
+                )
+            )
+{
+    string filename1 = getDataPath(cv::format(get<0>(GetParam()).c_str(), get<1>(GetParam())));
+    string filename2 = getDataPath(cv::format(get<0>(GetParam()).c_str(), get<1>(GetParam()) + 1));
+    Mat img1 = imread(filename1);
+    Mat img2 = imread(filename2);
+    if (img1.empty()) FAIL() << "Unable to load source image " << filename1;
+    if (img2.empty()) FAIL() << "Unable to load source image " << filename2;
+
+    int cn = get<2>(GetParam());
+    int nPointsX = min(get<0>(get<3>(GetParam())), img1.cols);
+    int nPointsY = min(get<1>(get<3>(GetParam())), img1.rows);
+    int winSize = get<4>(GetParam());
+    bool withDerivatives = get<5>(GetParam());
+
+    int maxLevel = 2;
+    TermCriteria criteria(CV_TERMCRIT_ITER|CV_TERMCRIT_EPS, 7, 0.001);
+    int flags = 0;
+    double minEigThreshold = 1e-4;
+
+    Mat frame1, frame2;
+    switch(cn)
+    {
+    case 1:
+        cvtColor(img1, frame1, COLOR_BGR2GRAY, cn);
+        cvtColor(img2, frame2, COLOR_BGR2GRAY, cn);
+        break;
+    case 3:
+        frame1 = img1;
+        frame2 = img2;
+        break;
+    case 4:
+        cvtColor(img1, frame1, COLOR_BGR2BGRA, cn);
+        cvtColor(img2, frame2, COLOR_BGR2BGRA, cn);
+        break;
+    default:
+        FAIL() << "Unexpected number of channels: " << cn;
+    }
+
+    vector<Point2f> inPoints;
+    vector<Point2f> outPoints;
+    vector<uchar> status;
+    vector<float> err;
+
+    FormTrackingPointsArray(inPoints, frame1.cols, frame1.rows, nPointsX, nPointsY);
+    outPoints.resize(inPoints.size());
+    status.resize(inPoints.size());
+    err.resize(inPoints.size());
+
+    std::vector<Mat> pyramid1, pyramid2;
+
+    maxLevel = buildOpticalFlowPyramid(frame1, pyramid1, Size(winSize, winSize), maxLevel, withDerivatives);
+    maxLevel = buildOpticalFlowPyramid(frame2, pyramid2, Size(winSize, winSize), maxLevel, withDerivatives);
+
+    declare.in(pyramid1, pyramid2, inPoints).out(outPoints);
+
+    TEST_CYCLE()
+    {
+        calcOpticalFlowPyrLK(pyramid1, pyramid2, inPoints, outPoints, status, err,
+                             Size(winSize, winSize), maxLevel, criteria,
+                             flags, minEigThreshold);
+    }
+}
+
+CV_ENUM(PyrBorderMode, BORDER_DEFAULT, BORDER_TRANSPARENT);
+typedef tr1::tuple<std::string, int, bool, PyrBorderMode, bool> Path_Win_Deriv_Border_Reuse_t;
+typedef TestBaseWithParam<Path_Win_Deriv_Border_Reuse_t> Path_Win_Deriv_Border_Reuse;
+
+PERF_TEST_P(Path_Win_Deriv_Border_Reuse, OpticalFlowPyrLK_pyr, testing::Combine(
+                testing::Values("cv/optflow/frames/720p_01.jpg"),
+                testing::Values(7, 11),
+                testing::Bool(),
+                testing::ValuesIn(PyrBorderMode::all()),
+                testing::Bool()
+                )
+            )
+{
+    string filename = getDataPath(get<0>(GetParam()));
+    Mat img = imread(filename);
+    Size winSize(get<1>(GetParam()), get<1>(GetParam()));
+    bool withDerivatives = get<2>(GetParam());
+    int derivBorder = get<3>(GetParam());
+    int pyrBorder = derivBorder;
+    if(derivBorder != BORDER_TRANSPARENT)
+    {
+        derivBorder = BORDER_CONSTANT;
+        pyrBorder = BORDER_REFLECT_101;
+    }
+    bool tryReuseInputImage = get<4>(GetParam());
+    std::vector<Mat> pyramid;
+
+    img.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
+
+    int maxLevel = buildOpticalFlowPyramid(img, pyramid, winSize, 1000, withDerivatives, BORDER_CONSTANT, BORDER_CONSTANT, tryReuseInputImage);
+
+    declare.in(img).out(pyramid);
+
+    TEST_CYCLE()
+    {
+        buildOpticalFlowPyramid(img, pyramid, winSize, maxLevel, withDerivatives, pyrBorder, derivBorder, tryReuseInputImage);
+    }
+
+    SANITY_CHECK(pyramid);
+}
\ No newline at end of file
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 0a4f44b62..8c4370673 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -41,17 +41,22 @@
 #include "precomp.hpp"
 #include <float.h>
 #include <stdio.h>
+#include "lkpyramid.hpp"
 
-namespace cv
+namespace
 {
-
-typedef short deriv_type;
-
-static void calcSharrDeriv(const Mat& src, Mat& dst)
+static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
 {
+    using namespace cv;
+    using cv::detail::deriv_type;
     int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
     CV_Assert(depth == CV_8U);
     dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
+
+#ifdef HAVE_TEGRA_OPTIMIZATION
+    if (tegra::calcSharrDeriv(src, dst))
+        return;
+#endif
 
     int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
     AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
@@ -127,373 +132,355 @@ static void calcSharrDeriv(const Mat& src, Mat& dst)
     }
 }
 
+}//namespace
 
-struct LKTrackerInvoker
-{
-    LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
+cv::detail::LKTrackerInvoker::LKTrackerInvoker(
+                      const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                       const Point2f* _prevPts, Point2f* _nextPts,
                       uchar* _status, float* _err,
                       Size _winSize, TermCriteria _criteria,
                       int _level, int _maxLevel, int _flags, float _minEigThreshold )
-    {
-        prevImg = &_prevImg;
-        prevDeriv = &_prevDeriv;
-        nextImg = &_nextImg;
-        prevPts = _prevPts;
-        nextPts = _nextPts;
-        status = _status;
-        err = _err;
-        winSize = _winSize;
-        criteria = _criteria;
-        level = _level;
-        maxLevel = _maxLevel;
-        flags
= _flags; - minEigThreshold = _minEigThreshold; - } +{ + prevImg = &_prevImg; + prevDeriv = &_prevDeriv; + nextImg = &_nextImg; + prevPts = _prevPts; + nextPts = _nextPts; + status = _status; + err = _err; + winSize = _winSize; + criteria = _criteria; + level = _level; + maxLevel = _maxLevel; + flags = _flags; + minEigThreshold = _minEigThreshold; +} + +void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const +{ + Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f); + const Mat& I = *prevImg; + const Mat& J = *nextImg; + const Mat& derivI = *prevDeriv; - void operator()(const BlockedRange& range) const + int j, cn = I.channels(), cn2 = cn*2; + cv::AutoBuffer _buf(winSize.area()*(cn + cn2)); + int derivDepth = DataType::depth; + + Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf); + Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn); + + for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ ) { - Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f); - const Mat& I = *prevImg; - const Mat& J = *nextImg; - const Mat& derivI = *prevDeriv; - - int j, cn = I.channels(), cn2 = cn*2; - cv::AutoBuffer _buf(winSize.area()*(cn + cn2)); - int derivDepth = DataType::depth; - - Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf); - Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn); - - for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ ) + Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); + Point2f nextPt; + if( level == maxLevel ) { - Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level)); - Point2f nextPt; - if( level == maxLevel ) - { - if( flags & OPTFLOW_USE_INITIAL_FLOW ) - nextPt = nextPts[ptidx]*(float)(1./(1 << level)); - else - nextPt = prevPt; - } + if( flags & OPTFLOW_USE_INITIAL_FLOW ) + nextPt = nextPts[ptidx]*(float)(1./(1 << level)); else - nextPt = nextPts[ptidx]*2.f; - nextPts[ptidx] = nextPt; - - Point2i iprevPt, inextPt; - prevPt -= halfWin; - iprevPt.x = cvFloor(prevPt.x); - iprevPt.y = cvFloor(prevPt.y); - - if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || - iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + nextPt = prevPt; + } + else + nextPt = nextPts[ptidx]*2.f; + nextPts[ptidx] = nextPt; + + Point2i iprevPt, inextPt; + prevPt -= halfWin; + iprevPt.x = cvFloor(prevPt.x); + iprevPt.y = cvFloor(prevPt.y); + + if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols || + iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows ) + { + if( level == 0 ) { - if( level == 0 ) - { - if( status ) - status[ptidx] = false; - if( err ) - err[ptidx] = 0; - } - continue; + if( status ) + status[ptidx] = false; + if( err ) + err[ptidx] = 0; } + continue; + } + + float a = prevPt.x - iprevPt.x; + float b = prevPt.y - iprevPt.y; + const int W_BITS = 14, W_BITS1 = 14; + const float FLT_SCALE = 1.f/(1 << 20); + int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + int dstep = (int)(derivI.step/derivI.elemSize1()); + int step = (int)(I.step/I.elemSize1()); + CV_Assert( step == (int)(J.step/J.elemSize1()) ); + float A11 = 0, A12 = 0, A22 = 0; + +#if CV_SSE2 + __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128i z = _mm_setzero_si128(); + __m128i qdelta_d = 
_mm_set1_epi32(1 << (W_BITS1-1)); + __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); + __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); +#endif + + // extract the patch from the first image, compute covariation matrix of derivatives + int x, y; + for( y = 0; y < winSize.height; y++ ) + { + const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; + const deriv_type* dsrc = (const deriv_type*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - float a = prevPt.x - iprevPt.x; - float b = prevPt.y - iprevPt.y; - const int W_BITS = 14, W_BITS1 = 14; - const float FLT_SCALE = 1.f/(1 << 20); - int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - int iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + deriv_type* Iptr = (deriv_type*)(IWinBuf.data + y*IWinBuf.step); + deriv_type* dIptr = (deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step); - int dstep = (int)(derivI.step/derivI.elemSize1()); - int step = (int)(I.step/I.elemSize1()); - CV_Assert( step == (int)(J.step/J.elemSize1()) ); - float A11 = 0, A12 = 0, A22 = 0; + x = 0; #if CV_SSE2 - __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128i z = _mm_setzero_si128(); - __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1)); - __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1)); - __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps(); + for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { + __m128i v00, v01, v10, v11, t0, t1; + + v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); + v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); + v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); + v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); + _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); + + v00 = _mm_loadu_si128((const __m128i*)(dsrc)); + v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); + v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); + v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); + + t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); + t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); + v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
+ + _mm_storeu_si128((__m128i*)dIptr, v00); + t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 + t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 + + __m128 fy = _mm_cvtepi32_ps(t0); + __m128 fx = _mm_cvtepi32_ps(t1); + + qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); + qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); + qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + } +#endif + + for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + { + int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); + int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + + Iptr[x] = (short)ival; + dIptr[0] = (short)ixval; + dIptr[1] = (short)iyval; + + A11 += (float)(ixval*ixval); + A12 += (float)(ixval*iyval); + A22 += (float)(iyval*iyval); + } + } + +#if CV_SSE2 + float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; + _mm_store_ps(A11buf, qA11); + _mm_store_ps(A12buf, qA12); + _mm_store_ps(A22buf, qA22); + A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; + A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; + A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; +#endif + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + float D = A11*A22 - A12*A12; + float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winSize.width*winSize.height); + + if( err && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) != 0 ) + err[ptidx] = (float)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = false; + continue; + } + + D = 1.f/D; + + nextPt -= halfWin; + Point2f prevDelta; + + for( j = 0; j < criteria.maxCount; j++ ) + { + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( level == 0 && status ) + status[ptidx] = false; + break; + } + + a = nextPt.x - inextPt.x; + b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float b1 = 0, b2 = 0; +#if CV_SSE2 + qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); + qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); + __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); #endif - // extract the patch from the first image, compute covariation matrix of derivatives - int x, y; for( y = 0; y < winSize.height; y++ ) { - const uchar* src = (const uchar*)I.data + (y + iprevPt.y)*step + iprevPt.x*cn; - const deriv_type* dsrc = (const deriv_type*)derivI.data + (y + iprevPt.y)*dstep + iprevPt.x*cn2; - - deriv_type* Iptr = (deriv_type*)(IWinBuf.data + y*IWinBuf.step); - deriv_type* dIptr = (deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step); + const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn; + const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step); + const deriv_type* dIptr = (const deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step); x = 0; #if CV_SSE2 - for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 ) { - __m128i v00, v01, v10, v11, t0, t1; + __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; + __m128i v00 = 
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); + __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); + __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); + __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z); - v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z); - v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step)), z); - v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + step + cn)), z); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); + __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), + _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0)); - - v00 = _mm_loadu_si128((const __m128i*)(dsrc)); - v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2)); - v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep)); - v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2)); - - t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1); - v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - - _mm_storeu_si128((__m128i*)dIptr, v00); - t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3 - t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3 - - __m128 fy = _mm_cvtepi32_ps(t0); - __m128 fx = _mm_cvtepi32_ps(t1); - - qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy)); - qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy)); - qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx)); + t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); + diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); + diff1 = _mm_unpackhi_epi16(diff0, diff0); + diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ... + v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
+ v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); + v10 = _mm_mullo_epi16(v00, diff0); + v11 = _mm_mulhi_epi16(v00, diff0); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); + v10 = _mm_mullo_epi16(v01, diff1); + v11 = _mm_mulhi_epi16(v01, diff1); + v00 = _mm_unpacklo_epi16(v10, v11); + v10 = _mm_unpackhi_epi16(v10, v11); + qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); + qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); } #endif - for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 ) + for( ; x < winSize.width*cn; x++, dIptr += 2 ) { - int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + - src[x+step]*iw10 + src[x+step+cn]*iw11, W_BITS1-5); - int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + - dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); - int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + - dsrc[dstep+cn2+1]*iw11, W_BITS1); - - Iptr[x] = (short)ival; - dIptr[0] = (short)ixval; - dIptr[1] = (short)iyval; - - A11 += (float)(ixval*ixval); - A12 += (float)(ixval*iyval); - A22 += (float)(iyval*iyval); + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5) - Iptr[x]; + b1 += (float)(diff*dIptr[0]); + b2 += (float)(diff*dIptr[1]); } } #if CV_SSE2 - float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4]; - _mm_store_ps(A11buf, qA11); - _mm_store_ps(A12buf, qA12); - _mm_store_ps(A22buf, qA22); - A11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3]; - A12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3]; - A22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3]; + float CV_DECL_ALIGNED(16) bbuf[4]; + _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); + b1 += bbuf[0] + bbuf[2]; + b2 += bbuf[1] + bbuf[3]; #endif + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; - A11 *= FLT_SCALE; - A12 *= FLT_SCALE; - A22 *= FLT_SCALE; + Point2f delta( (float)((A12*b2 - A22*b1) * D), + (float)((A12*b1 - A11*b2) * D)); + //delta = -delta; - float D = A11*A22 - A12*A12; - float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + - 4.f*A12*A12))/(2*winSize.width*winSize.height); + nextPt += delta; + nextPts[ptidx] = nextPt + halfWin; - if( err && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) != 0 ) - err[ptidx] = (float)minEig; + if( delta.ddot(delta) <= criteria.epsilon ) + break; - if( minEig < minEigThreshold || D < FLT_EPSILON ) + if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 && + std::abs(delta.y + prevDelta.y) < 0.01 ) { - if( level == 0 && status ) + nextPts[ptidx] -= delta*0.5f; + break; + } + prevDelta = delta; + } + + if( status[ptidx] && err && level == 0 && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) == 0 ) + { + Point2f nextPt = nextPts[ptidx] - halfWin; + Point inextPt; + + inextPt.x = cvFloor(nextPt.x); + inextPt.y = cvFloor(nextPt.y); + + if( inextPt.x < -winSize.width || inextPt.x >= J.cols || + inextPt.y < -winSize.height || inextPt.y >= J.rows ) + { + if( status ) status[ptidx] = false; continue; } - D = 1.f/D; + float a = nextPt.x - inextPt.x; + float b = nextPt.y - inextPt.y; + iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); + iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + float errval = 0.f; - nextPt -= halfWin; - Point2f prevDelta; - - for( j = 0; j < criteria.maxCount; j++ ) + for( y = 0; y < winSize.height; y++ ) { - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); + const uchar* Jptr = (const 
uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn; + const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step); - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) + for( x = 0; x < winSize.width*cn; x++ ) { - if( level == 0 && status ) - status[ptidx] = false; - break; + int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, + W_BITS1-5) - Iptr[x]; + errval += std::abs((float)diff); } - - a = nextPt.x - inextPt.x; - b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - float b1 = 0, b2 = 0; -#if CV_SSE2 - qw0 = _mm_set1_epi32(iw00 + (iw01 << 16)); - qw1 = _mm_set1_epi32(iw10 + (iw11 << 16)); - __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps(); -#endif - - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn; - const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step); - const deriv_type* dIptr = (const deriv_type*)(derivIWinBuf.data + y*derivIWinBuf.step); - - x = 0; - -#if CV_SSE2 - for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 ) - { - __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1; - __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z); - __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z); - __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step)), z); - __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + step + cn)), z); - - __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1)); - __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0), - _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1)); - t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5); - t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5); - diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0); - diff1 = _mm_unpackhi_epi16(diff0, diff0); - diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ... - v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
- v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8)); - v10 = _mm_mullo_epi16(v00, diff0); - v11 = _mm_mulhi_epi16(v00, diff0); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - v10 = _mm_mullo_epi16(v01, diff1); - v11 = _mm_mulhi_epi16(v01, diff1); - v00 = _mm_unpacklo_epi16(v10, v11); - v10 = _mm_unpackhi_epi16(v10, v11); - qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00)); - qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10)); - } -#endif - - for( ; x < winSize.width*cn; x++, dIptr += 2 ) - { - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5) - Iptr[x]; - b1 += (float)(diff*dIptr[0]); - b2 += (float)(diff*dIptr[1]); - } - } - -#if CV_SSE2 - float CV_DECL_ALIGNED(16) bbuf[4]; - _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1)); - b1 += bbuf[0] + bbuf[2]; - b2 += bbuf[1] + bbuf[3]; -#endif - - b1 *= FLT_SCALE; - b2 *= FLT_SCALE; - - Point2f delta( (float)((A12*b2 - A22*b1) * D), - (float)((A12*b1 - A11*b2) * D)); - //delta = -delta; - - nextPt += delta; - nextPts[ptidx] = nextPt + halfWin; - - if( delta.ddot(delta) <= criteria.epsilon ) - break; - - if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 && - std::abs(delta.y + prevDelta.y) < 0.01 ) - { - nextPts[ptidx] -= delta*0.5f; - break; - } - prevDelta = delta; - } - - if( status[ptidx] && err && level == 0 && (flags & CV_LKFLOW_GET_MIN_EIGENVALS) == 0 ) - { - Point2f nextPt = nextPts[ptidx] - halfWin; - Point inextPt; - - inextPt.x = cvFloor(nextPt.x); - inextPt.y = cvFloor(nextPt.y); - - if( inextPt.x < -winSize.width || inextPt.x >= J.cols || - inextPt.y < -winSize.height || inextPt.y >= J.rows ) - { - if( status ) - status[ptidx] = false; - continue; - } - - float a = nextPt.x - inextPt.x; - float b = nextPt.y - inextPt.y; - iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS)); - iw01 = cvRound(a*(1.f - b)*(1 << W_BITS)); - iw10 = cvRound((1.f - a)*b*(1 << W_BITS)); - iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; - float errval = 0.f; - - for( y = 0; y < winSize.height; y++ ) - { - const uchar* Jptr = (const uchar*)J.data + (y + inextPt.y)*step + inextPt.x*cn; - const deriv_type* Iptr = (const deriv_type*)(IWinBuf.data + y*IWinBuf.step); - - for( x = 0; x < winSize.width*cn; x++ ) - { - int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + - Jptr[x+step]*iw10 + Jptr[x+step+cn]*iw11, - W_BITS1-5) - Iptr[x]; - errval += std::abs((float)diff); - } - } - err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height); } + err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height); } } - - const Mat* prevImg; - const Mat* nextImg; - const Mat* prevDeriv; - const Point2f* prevPts; - Point2f* nextPts; - uchar* status; - float* err; - Size winSize; - TermCriteria criteria; - int level; - int maxLevel; - int flags; - float minEigThreshold; -}; - } - - + int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives, int pyrBorder, int derivBorder, bool tryReuseInputImage) { @@ -503,7 +490,7 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true, 0); - int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); + int derivType = CV_MAKETYPE(DataType::depth, img.channels() * 2); //level 0 bool lvl0IsSet = false; @@ -602,7 +589,7 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg, return; #endif Mat 
prevPtsMat = _prevPts.getMat();
-    const int derivDepth = DataType<deriv_type>::depth;
+    const int derivDepth = DataType<cv::detail::deriv_type>::depth;
 
     CV_Assert( maxLevel >= 0 && winSize.width > 2 && winSize.height > 2 );
 
@@ -744,6 +731,8 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
         CV_Assert(prevPyr[level * lvlStep1].size() == nextPyr[level * lvlStep2].size());
         CV_Assert(prevPyr[level * lvlStep1].type() == nextPyr[level * lvlStep2].type());
 
+        typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
+
         parallel_for(BlockedRange(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
                                                                 nextPyr[level * lvlStep2], prevPts, nextPts,
                                                                 status, err,
diff --git a/modules/video/src/lkpyramid.hpp b/modules/video/src/lkpyramid.hpp
new file mode 100644
index 000000000..03a51af37
--- /dev/null
+++ b/modules/video/src/lkpyramid.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+namespace cv
+{
+namespace detail
+{
+
+    typedef short deriv_type;
+
+    struct LKTrackerInvoker
+    {
+        LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
+                          const Point2f* _prevPts, Point2f* _nextPts,
+                          uchar* _status, float* _err,
+                          Size _winSize, TermCriteria _criteria,
+                          int _level, int _maxLevel, int _flags, float _minEigThreshold );
+
+        void operator()(const BlockedRange& range) const;
+
+        const Mat* prevImg;
+        const Mat* nextImg;
+        const Mat* prevDeriv;
+        const Point2f* prevPts;
+        Point2f* nextPts;
+        uchar* status;
+        float* err;
+        Size winSize;
+        TermCriteria criteria;
+        int level;
+        int maxLevel;
+        int flags;
+        float minEigThreshold;
+    };
+
+}// namespace detail
+}// namespace cv
\ No newline at end of file
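Note (editor's addition, not part of the patch): the new OpticalFlowPyrLK_self test measures the path where the caller builds the image pyramids once with buildOpticalFlowPyramid and passes them directly to calcOpticalFlowPyrLK, so pyramid construction stays out of the timed loop. The sketch below shows that usage outside the perf framework; the input file names, the grid spacing of tracked points, and the window size are illustrative placeholders rather than values taken from the patch.

// Minimal sketch (OpenCV 2.4-era API): precompute pyramids, then track with them.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/video/tracking.hpp>
#include <vector>

int main()
{
    // Placeholder frame names; any two consecutive frames will do.
    cv::Mat frame1 = cv::imread("frame_a.png", 0);   // 0 = load as grayscale
    cv::Mat frame2 = cv::imread("frame_b.png", 0);
    if (frame1.empty() || frame2.empty())
        return 1;

    // Regular grid of points to track, analogous to FormTrackingPointsArray in the test.
    std::vector<cv::Point2f> inPoints;
    for (int y = 8; y < frame1.rows; y += 16)
        for (int x = 8; x < frame1.cols; x += 16)
            inPoints.push_back(cv::Point2f((float)x, (float)y));

    cv::Size winSize(11, 11);
    int maxLevel = 2;
    bool withDerivatives = true;   // also precompute the per-level derivatives

    // Build both pyramids once; the return value is the number of levels actually
    // built, and it is fed back so both pyramids and the tracker agree on maxLevel.
    std::vector<cv::Mat> pyr1, pyr2;
    maxLevel = cv::buildOpticalFlowPyramid(frame1, pyr1, winSize, maxLevel, withDerivatives);
    maxLevel = cv::buildOpticalFlowPyramid(frame2, pyr2, winSize, maxLevel, withDerivatives);

    // Track using the prebuilt pyramids; the original frames are not touched again.
    std::vector<cv::Point2f> outPoints;
    std::vector<uchar> status;
    std::vector<float> err;
    cv::calcOpticalFlowPyrLK(pyr1, pyr2, inPoints, outPoints, status, err,
                             winSize, maxLevel,
                             cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 7, 0.001),
                             0 /*flags*/, 1e-4 /*minEigThreshold*/);
    return 0;
}

When the same frame serves as the "next" image of one pair and the "previous" image of the following pair, its pyramid can be reused across both calls; that saving is what the _self variant is meant to expose relative to the _full variant, which hands raw frames to calcOpticalFlowPyrLK.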