unified norm computing; added generalized Hamming distance

2011-10-11 15:13:53 +00:00
parent c1277b6147
commit b74116e694
12 changed files with 485 additions and 589 deletions
--- a/modules/core/src/cmdparser.cpp
+++ b/modules/core/src/cmdparser.cpp
@@ -282,7 +282,7 @@ template<typename _Tp>

 		cout << setw(col_p-2) << left << buf;

-		if (buf.length() > col_p-2) 
+		if ((int)buf.length() > col_p-2) 
 		{
 			cout << endl << "  ";
 			cout << setw(col_p-2) << left << " ";
@@ -293,7 +293,7 @@ template<typename _Tp>

 		while (true)
 		{
-			bool tr = (buf.length() > col_d-2) ? true: false;
+			bool tr = ((int)buf.length() > col_d-2) ? true: false;
 			int pos;

 			if (tr)
@@ -301,7 +301,8 @@ template<typename _Tp>
 				pos = buf.find_first_of(' ');
 				while (true)
 				{
-					if (buf.find_first_of(' ', pos + 1 ) < col_d-2 && buf.find_first_of(' ', pos + 1 ) != std::string::npos)
+					if ((int)buf.find_first_of(' ', pos + 1 ) < col_d-2 &&
+                        (int)buf.find_first_of(' ', pos + 1 ) != (int)std::string::npos)
 						pos = buf.find_first_of(' ', pos + 1);
 					else
 						break;
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2161,43 +2161,6 @@ static void generateRandomCenter(const vector<Vec2f>& box, float* center, RNG& r
 }


-static inline float distance(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        float CV_DECL_ALIGNED(16) buf[4];
-        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-
-        for( ; j <= n - 8; j += 8 )
-        {
-            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-        }
-        _mm_store_ps(buf, _mm_add_ps(d0, d1));
-        d = buf[0] + buf[1] + buf[2] + buf[3];
-    }
-    else
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
-
-    for( ; j < n; j++ )
-    {
-        float t = a[j] - b[j];
-        d += t*t;
-    }
-    return d;
-}
-
 /*
 k-means center initialization using the following algorithm:
 Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
@@ -2218,7 +2181,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,

    for( i = 0; i < N; i++ )
    {
-        dist[i] = distance(data + step*i, data + step*centers[0], dims);
+        dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
        sum0 += dist[i];
    }
    
@@ -2236,7 +2199,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
            int ci = i;
            for( i = 0; i < N; i++ )
            {
-                tdist2[i] = std::min(distance(data + step*i, data + step*ci, dims), dist[i]);
+                tdist2[i] = std::min(normL2Sqr_(data + step*i, data + step*ci, dims), dist[i]);
                s += tdist2[i];
            }
            
@@ -2434,7 +2397,7 @@ double cv::kmeans( InputArray _data, int K,
                for( k = 0; k < K; k++ )
                {
                    const float* center = centers.ptr<float>(k);
-                    double dist = distance(sample, center, dims);
+                    double dist = normL2Sqr_(sample, center, dims);

                    if( min_dist > dist )
                    {
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -810,15 +810,218 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
 namespace cv
 {

+float normL2Sqr_(const float* a, const float* b, int n)  // Returns the sum over j in [0, n) of (a[j] - b[j])^2; no square root is taken.
+{
+    int j = 0; float d = 0.f;  // j: next unprocessed index; d: running sum of squared differences
+#if CV_SSE
+    if( USE_SSE2 )
+    {
+        float CV_DECL_ALIGNED(16) buf[4];  // scratch target for _mm_store_ps below (presumably 16-byte aligned via CV_DECL_ALIGNED -- macro not visible here)
+        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();  // two independent 4-lane accumulators
+        
+        for( ; j <= n - 8; j += 8 )  // 8 floats per iteration, read with unaligned loads
+        {
+            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
+            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
+        }
+        _mm_store_ps(buf, _mm_add_ps(d0, d1));  // merge the two accumulators lane by lane
+        d = buf[0] + buf[1] + buf[2] + buf[3];  // horizontal sum of the four lanes
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 )  // scalar path, unrolled four elements at a time
+        {
+            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
+            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
+        }
+    }
+    
+    for( ; j < n; j++ )  // tail: elements left over by whichever path ran above
+    {
+        float t = a[j] - b[j];
+        d += t*t;
+    }
+    return d;
+}
+
+    
+float normL1_(const float* a, const float* b, int n)  // Returns the sum over j in [0, n) of |a[j] - b[j]|.
+{
+    int j = 0; float d = 0.f;  // j: next unprocessed index; d: running sum of absolute differences
+#if CV_SSE
+    if( USE_SSE2 )
+    {  // vector path: |x| is obtained by AND-ing away the IEEE-754 sign bit of every lane
+        float CV_DECL_ALIGNED(16) buf[4];  // scratch target for the final _mm_store_ps
+        static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};  // must be int: used as a float initialiser, 0x7fffffff is value-converted to 2147483648.0f (bits 0x4f000000), which is not a sign-clearing mask
+        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();  // two independent 4-lane accumulators
+        __m128 absmask = _mm_load_ps((const float*)absbuf);  // load the integer bit pattern unchanged into a float vector
+        
+        for( ; j <= n - 8; j += 8 )  // 8 floats per iteration, read with unaligned loads
+        {
+            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+            d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));  // clear the sign bit => |t0| per lane
+            d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
+        }
+        _mm_store_ps(buf, _mm_add_ps(d0, d1));  // merge the two accumulators lane by lane
+        d = buf[0] + buf[1] + buf[2] + buf[3];  // horizontal sum of the four lanes
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 )  // scalar path, unrolled four elements at a time
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+                std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+    
+    for( ; j < n; j++ )  // tail: elements left over by whichever path ran above
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+int normL1_(const uchar* a, const uchar* b, int n)  // Returns the sum over j in [0, n) of |a[j] - b[j]| as an int.
+{
+    int j = 0, d = 0;  // j: next unprocessed index; d: running total
+#if CV_SSE
+    if( USE_SSE2 )
+    {
+        __m128i d0 = _mm_setzero_si128();  // vector accumulator for the partial sums
+        
+        for( ; j <= n - 16; j += 16 )  // 16 bytes per iteration, read with unaligned loads
+        {
+            __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
+            __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
+            
+            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));  // _mm_sad_epu8 yields one sum of absolute byte differences per 64-bit half
+        }
+
+        for( ; j <= n - 4; j += 4 )  // then 4 bytes at a time
+        {
+            __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));  // NOTE(review): reads 4 bytes through an int pointer; nothing visible here guarantees a + j is int-aligned -- confirm this is acceptable on all targets
+            __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
+            
+            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));  // the upper lanes of t0/t1 are zero, so only these 4 bytes contribute
+        }
+        d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));  // fold the upper 64-bit half onto the lower one, then take the low 32 bits
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 )  // scalar path, unrolled four elements at a time
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+                std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+    
+    for( ; j < n; j++ )  // tail: elements left over by whichever path ran above
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+static const uchar popCountTable[] =  // 256 entries: popCountTable[v] is the number of set bits in the byte value v
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+};
+    
+static const uchar popCountTable2[] =  // 256 entries: popCountTable2[v] is how many of the four 2-bit cells of byte v are non-zero
+{
+    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+};
+    
+static const uchar popCountTable4[] =  // 256 entries: popCountTable4[v] is how many of the two 4-bit cells (nibbles) of byte v are non-zero
+{
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+    
+int normHamming(const uchar* a, const uchar* b, int n)  // Returns the number of bit positions in which the n-byte arrays a and b differ.
+{
+    int i = 0, result = 0;  // i: next unprocessed byte; result: running count of differing bits
+#if defined __GNUC__ && CV_NEON
+    if (CPU_HAS_NEON_FEATURE)
+    {
+        result = 0;  // redundant with the initialiser above
+        for( ; i <= n - 16; i += 16 )  // 16 bytes per iteration
+        {
+            uint8x16_t A_vec = vld1q_u8 (a + i);
+            uint8x16_t B_vec = vld1q_u8 (b + i);
+            //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
+            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);  // a set bit marks a position where a and b differ
+            
+            uint8x16_t bitsSet = vcntq_u8 (AxorB);  // per-byte count of set bits
+            //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
+            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);  // pairwise widening adds: 16 x u8 -> 8 x u16
+            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);  // 8 x u16 -> 4 x u32
+            
+            uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);  // 4 x u32 -> 2 x u64
+            result += vgetq_lane_u64 (bitSet2,0);  // add both 64-bit lanes into the int total
+            result += vgetq_lane_u64 (bitSet2,1);
+        }
+    }
+    else
+#endif
+        for( ; i <= n - 4; i += 4 )  // table path: the else-branch when the NEON block is compiled in, unconditional otherwise
+            result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
+                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
+    for( ; i < n; i++ )  // tail: always runs, finishing whatever bytes the chosen path left over
+        result += popCountTable[a[i] ^ b[i]];
+    return result;
+}
+    
+int normHamming(const uchar* a, const uchar* b, int n, int cellSize)  // Counts the cellSize-bit cells that differ between the n-byte arrays a and b; cellSize must be 1, 2 or 4.
+{
+    if( cellSize == 1 )
+        return normHamming(a, b, n);  // 1-bit cells: defer to the three-argument overload
+    const uchar* tab = 0;  // per-byte lookup table chosen by cell size
+    if( cellSize == 2 )
+        tab = popCountTable2;
+    else if( cellSize == 4 )
+        tab = popCountTable4;
+    else
+        CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );  // presumably does not return (tab would otherwise stay null below) -- confirm against CV_Error's definition
+    int i = 0, result = 0;  // i: next unprocessed byte; result: running count of differing cells
+    for( ; i <= n - 4; i += 4 )  // four bytes per iteration; a non-zero cell in a[i] ^ b[i] is a differing cell
+        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
+            tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
+    for( ; i < n; i++ )  // tail: remaining bytes
+        result += tab[a[i] ^ b[i]];
+    return result;
+}
+    
+    
 template<typename T, typename ST> int
 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 {
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result = std::max(result, ST(std::abs(src[i])));
+        result = std::max(result, normInf<T, ST>(src, len*cn));
    }
    else
    {
@@ -826,7 +1029,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result = std::max(result, ST(std::abs(src[k])));
+                    result = std::max(result, ST(fast_abs(src[k])));
            }
    }
    *_result = result;
@@ -839,9 +1042,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result += std::abs(src[i]);
+        result += normL1<T, ST>(src, len*cn);
    }
    else
    {
@@ -849,7 +1050,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result += std::abs(src[k]);
+                    result += fast_abs(src[k]);
            }
    }
    *_result = result;
@@ -862,12 +1063,7 @@ normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-        {
-            T v = src[i];
-            result += (ST)v*v;
-        }
+        result += normL2Sqr<T, ST>(src, len*cn);
    }
    else
    {
@@ -891,9 +1087,7 @@ normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int l
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result = std::max(result, (ST)std::abs(src1[i] - src2[i]));
+        result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
    }
    else
    {
@@ -914,9 +1108,7 @@ normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result += std::abs(src1[i] - src2[i]);
+        result += normL1<T, ST>(src1, src2, len*cn);
    }
    else
    {
@@ -937,12 +1129,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-        {
-            ST v = src1[i] - src2[i];
-            result += v*v;
-        }
+        result += normL2Sqr<T, ST>(src1, src2, len*cn);
    }
    else
    {