From e4965b143fcd431136c7992735f204476dd11501 Mon Sep 17 00:00:00 2001
From: Andrey Kamaev <no@email>
Date: Thu, 28 Jun 2012 11:08:11 +0000
Subject: [PATCH] Fixed discrepancy between SSE and regular branches in FREAK

---
 modules/core/include/opencv2/core/types_c.h |   4 +-
 modules/core/src/matrix.cpp                 |   2 +-
 modules/features2d/src/freak.cpp            | 101 ++++++++++++--------
 3 files changed, 66 insertions(+), 41 deletions(-)
diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h
index 40f5da4e6..cbc7872e6 100644
--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@@ -83,7 +83,7 @@
 #  if defined WIN32
 #    include <intrin.h>
 #  endif
-#  if __SSE2__ || !defined __GNUC__
+#  if defined __SSE2__ || !defined __GNUC__
 #    include <emmintrin.h>
 #  endif
 #endif
@@ -304,7 +304,7 @@ enum {
 
 CV_INLINE  int  cvRound( double value )
 {
-#if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && __SSE2__ && !defined __APPLE__)
+#if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__ && !defined __APPLE__)
     __m128d t = _mm_set_sd( value );
     return _mm_cvtsd_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index df90d6cad..e86bb0e53 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -150,7 +150,7 @@ static void updateContinuityFlag(Mat& m)
             break;
     }
 
-    int64 t = (int64)m.step[0]*m.size[0];
+    uint64 t = (uint64)m.step[0]*m.size[0];
     if( j <= i && t == (size_t)t )
         m.flags |= Mat::CONTINUOUS_FLAG;
     else
diff --git a/modules/features2d/src/freak.cpp b/modules/features2d/src/freak.cpp
index f5ef837f9..c06c35141 100644
--- a/modules/features2d/src/freak.cpp
+++ b/modules/features2d/src/freak.cpp
@@ -44,16 +44,6 @@
 namespace cv
 {
 
-#if CV_SSSE3
-// binary: 10000000 => char: 128 or hex: 0x80
-static const __m128i binMask = _mm_set_epi8(0x80, 0x80, 0x80,
-                                            0x80, 0x80, 0x80,
-                                            0x80, 0x80, 0x80,
-                                            0x80, 0x80, 0x80,
-                                            0x80, 0x80, 0x80,
-                                            0x80);
-#endif
-
 static const double FREAK_SQRT2 = 1.4142135623731;
 static const double FREAK_INV_SQRT2 = 1.0 / FREAK_SQRT2;
 static const double FREAK_LOG2 = 0.693147180559945;
@@ -234,13 +224,6 @@ void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat
 
     ((FREAK*)this)->buildPattern();
 
-#if CV_SSSE3
-    register __m128i operand1;
-    register __m128i operand2;
-    register __m128i workReg;
-    register __m128i result128;
-#endif
-
     Mat imgIntegral;
     integral(image, imgIntegral);
     std::vector<int> kpScaleIdx(keypoints.size()); // used to save pattern scale index corresponding to each keypoints
@@ -292,8 +275,13 @@ void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat
     if( !extAll ) {
         // extract the best comparisons only
         descriptors = cv::Mat::zeros(keypoints.size(), FREAK_NB_PAIRS/8, CV_8U);
-#if CV_SSSE3
+#if CV_SSE2
         __m128i* ptr= (__m128i*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]);
+        // binary: 10000000 => char: 128 or hex: 0x80
+        const __m128i binMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80,
+                                             0x80, 0x80, 0x80, 0x80,
+                                             0x80, 0x80, 0x80, 0x80,
+                                             0x80, 0x80, 0x80, 0x80);
 #else
         std::bitset<FREAK_NB_PAIRS>* ptr = (std::bitset<FREAK_NB_PAIRS>*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]);
 #endif
@@ -329,25 +317,52 @@ void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat
             for( int i = FREAK_NB_POINTS; i--; ) {
                 pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], thetaIdx, i);
             }
-#if CV_SSSE3
-            // extracting descriptor by blocks of 128 bits using SSE instructions
+#if CV_SSE2
             // note that comparisons order is modified in each block (but first 128 comparisons remain globally the same-->does not affect the 128,384 bits segmanted matching strategy)
-            int cnt(0);
-            for( int n = 4; n-- ; ) {
-                result128 = _mm_setzero_si128();
-                for( int m = 8; m--; cnt+=16 ) {
-                    operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt].i],pointsValue[descriptionPairs[cnt+1].i],pointsValue[descriptionPairs[cnt+2].i],pointsValue[descriptionPairs[cnt+3].i],
-                                          pointsValue[descriptionPairs[cnt+4].i],pointsValue[descriptionPairs[cnt+5].i],pointsValue[descriptionPairs[cnt+6].i],pointsValue[descriptionPairs[cnt+7].i],
-                                          pointsValue[descriptionPairs[cnt+8].i],pointsValue[descriptionPairs[cnt+9].i],pointsValue[descriptionPairs[cnt+10].i],pointsValue[descriptionPairs[cnt+11].i],
-                                          pointsValue[descriptionPairs[cnt+12].i],pointsValue[descriptionPairs[cnt+13].i],pointsValue[descriptionPairs[cnt+14].i],pointsValue[descriptionPairs[cnt+15].i]);
+            int cnt = 0;
+            for( int n = FREAK_NB_PAIRS/128; n-- ; )
+            {
+                __m128i result128 = _mm_setzero_si128();
+                for( int m = 128/16; m--; cnt += 16 )
+                {
+                    __m128i operand1 = _mm_set_epi8(
+                        pointsValue[descriptionPairs[cnt+0].i],
+                        pointsValue[descriptionPairs[cnt+1].i],
+                        pointsValue[descriptionPairs[cnt+2].i],
+                        pointsValue[descriptionPairs[cnt+3].i],
+                        pointsValue[descriptionPairs[cnt+4].i],
+                        pointsValue[descriptionPairs[cnt+5].i],
+                        pointsValue[descriptionPairs[cnt+6].i],
+                        pointsValue[descriptionPairs[cnt+7].i],
+                        pointsValue[descriptionPairs[cnt+8].i],
+                        pointsValue[descriptionPairs[cnt+9].i],
+                        pointsValue[descriptionPairs[cnt+10].i],
+                        pointsValue[descriptionPairs[cnt+11].i],
+                        pointsValue[descriptionPairs[cnt+12].i],
+                        pointsValue[descriptionPairs[cnt+13].i],
+                        pointsValue[descriptionPairs[cnt+14].i],
+                        pointsValue[descriptionPairs[cnt+15].i]);
 
-                    operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt].j],pointsValue[descriptionPairs[cnt+1].j],pointsValue[descriptionPairs[cnt+2].j],pointsValue[descriptionPairs[cnt+3].j],
-                                          pointsValue[descriptionPairs[cnt+4].j],pointsValue[descriptionPairs[cnt+5].j],pointsValue[descriptionPairs[cnt+6].j],pointsValue[descriptionPairs[cnt+7].j],
-                                          pointsValue[descriptionPairs[cnt+8].j],pointsValue[descriptionPairs[cnt+9].j],pointsValue[descriptionPairs[cnt+10].j],pointsValue[descriptionPairs[cnt+11].j],
-                                          pointsValue[descriptionPairs[cnt+12].j],pointsValue[descriptionPairs[cnt+13].j],pointsValue[descriptionPairs[cnt+14].j],pointsValue[descriptionPairs[cnt+15].j]);
+                    __m128i operand2 = _mm_set_epi8(
+                        pointsValue[descriptionPairs[cnt+0].j],
+                        pointsValue[descriptionPairs[cnt+1].j],
+                        pointsValue[descriptionPairs[cnt+2].j],
+                        pointsValue[descriptionPairs[cnt+3].j],
+                        pointsValue[descriptionPairs[cnt+4].j],
+                        pointsValue[descriptionPairs[cnt+5].j],
+                        pointsValue[descriptionPairs[cnt+6].j],
+                        pointsValue[descriptionPairs[cnt+7].j],
+                        pointsValue[descriptionPairs[cnt+8].j],
+                        pointsValue[descriptionPairs[cnt+9].j],
+                        pointsValue[descriptionPairs[cnt+10].j],
+                        pointsValue[descriptionPairs[cnt+11].j],
+                        pointsValue[descriptionPairs[cnt+12].j],
+                        pointsValue[descriptionPairs[cnt+13].j],
+                        pointsValue[descriptionPairs[cnt+14].j],
+                        pointsValue[descriptionPairs[cnt+15].j]);
 
-                    workReg = _mm_min_epu8(operand1, operand2); // emulated "greater than" for UNSIGNED int
-                    workReg = _mm_cmpeq_epi8(workReg, operand2); // emulated "greater than" for UNSIGNED int
+                    __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers
+                    workReg = _mm_cmpeq_epi8(workReg, operand2);        // emulated "not less than" for 8-bit UNSIGNED integers
 
                     workReg = _mm_and_si128(_mm_srli_epi16(binMask, m), workReg); // merge the last 16 bits with the 128bits std::vector until full
                     result128 = _mm_or_si128(result128, workReg);
@@ -355,10 +370,20 @@ void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat
                 (*ptr) = result128;
                 ++ptr;
             }
-            ptr-=8;
+            ptr -= 8;
 #else
-            for( int m = FREAK_NB_PAIRS; m--; ) {
-                ptr->set(m, pointsValue[descriptionPairs[m].i]>  pointsValue[descriptionPairs[m].j ] );
+            // extracting descriptor preserving the order of SSE version
+            int cnt = 0;
+            for( int n = 7; n < FREAK_NB_PAIRS; n += 128)
+            {
+                for( int m = 8; m--; )
+                {
+                    int nm = n-m;
+                    for(int kk = nm+15*8; kk >= nm; kk-=8, ++cnt)
+                    {
+                        ptr->set(kk, pointsValue[descriptionPairs[cnt].i] >= pointsValue[descriptionPairs[cnt].j]);
+                    }
+                }
             }
             --ptr;
 #endif
@@ -407,7 +432,7 @@ void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat
             for( int i = 1; i < FREAK_NB_POINTS; ++i ) {
                 //(generate all the pairs)
                 for( int j = 0; j < i; ++j ) {
-                    ptr->set(cnt, pointsValue[i]>pointsValue[j] );
+                    ptr->set(cnt, pointsValue[i] >= pointsValue[j] );
                     ++cnt;
                 }
             }