unified norm computing; added generalized Hamming distance

2011-10-11 15:13:53 +00:00
parent c1277b6147
commit b74116e694
12 changed files with 485 additions and 589 deletions
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -598,6 +598,9 @@ public:
    //! per-element multiplication
    Vec mul(const Vec<_Tp, cn>& v) const;
    
+    //! conjugation (makes sense for complex numbers and quaternions)
+    Vec conj() const;
+    
    /*!
      cross product of the two 3D vectors.
    
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -81,7 +81,7 @@
  #define CV_XADD(addr,delta) InterlockedExchangeAdd((long volatile*)(addr), (delta))
 #else

-  template<typename _Tp> static inline _Tp CV_XADD(_Tp* addr, _Tp delta)
+  static inline int CV_XADD(int* addr, int delta) // fallback: plain (unsynchronized) add of delta to *addr; returns the value *addr held before the add
  { int tmp = *addr; *addr += delta; return tmp; }
    
 #endif
@@ -179,7 +179,14 @@ template<> inline int saturate_cast<int>(double v) { return cvRound(v); }
 // we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
 template<> inline unsigned saturate_cast<unsigned>(float v){ return cvRound(v); }
 template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
-
+    
+inline int fast_abs(uchar v) { return v; } // value is returned as-is (converted to int); no abs() call is made for this type
+inline int fast_abs(schar v) { return std::abs((int)v); } // widen to int first, then take the absolute value
+inline int fast_abs(ushort v) { return v; } // value is returned as-is (converted to int); no abs() call is made for this type
+inline int fast_abs(short v) { return std::abs((int)v); } // widen to int first, then take the absolute value
+inline int fast_abs(int v) { return std::abs(v); } // plain std::abs on int
+inline float fast_abs(float v) { return std::abs(v); } // result stays float
+inline double fast_abs(double v) { return std::abs(v); } // result stays double

 //////////////////////////////// Matx /////////////////////////////////

@@ -891,38 +898,152 @@ Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) c
    return ok ? x : Matx<_Tp, n, l>::zeros();
 }

+    
+template<typename _Tp, typename _AccTp> static inline // returns the sum of squares of a[0..n-1], accumulated in _AccTp (no square root is taken)
+_AccTp normL2Sqr(const _Tp* a, int n)
+{
+    _AccTp s = 0;
+    int i;
+    for( i = 0; i <= n - 4; i += 4 ) // main loop: four elements per iteration
+    {
+        _AccTp v0 = a[i], v1 = a[i+1], v2 = a[i+2], v3 = a[i+3]; // convert to _AccTp before multiplying
+        s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
+    }
+    for( ; i < n; i++ ) // tail: the remaining 0..3 elements
+    {
+        _AccTp v = a[i];
+        s += v*v;
+    }
+    return s;
+}
+
+
+template<typename _Tp, typename _AccTp> static inline // returns the sum of fast_abs(a[i]) over a[0..n-1], accumulated in _AccTp
+_AccTp normL1(const _Tp* a, int n)
+{
+    _AccTp s = 0;
+    int i;
+    for( i = 0; i <= n - 4; i += 4 ) // main loop: four elements per iteration
+    {
+        s += (_AccTp)fast_abs(a[i]) + (_AccTp)fast_abs(a[i+1]) + // each term is cast to _AccTp before the additions
+            (_AccTp)fast_abs(a[i+2]) + (_AccTp)fast_abs(a[i+3]);
+    }
+    for( ; i < n; i++ ) // tail: the remaining 0..3 elements
+        s += fast_abs(a[i]);
+    return s;
+}
+
+
+template<typename _Tp, typename _AccTp> static inline // returns the largest fast_abs(a[i]) over a[0..n-1] as _AccTp
+_AccTp normInf(const _Tp* a, int n)
+{
+    _AccTp s = 0; // starting value; also the result when n <= 0
+    for( int i = 0; i < n; i++ )
+        s = std::max(s, (_AccTp)fast_abs(a[i])); // cast so both std::max arguments have the same type
+    return s;
+}
+    
+    
+template<typename _Tp, typename _AccTp> static inline // returns the sum of squared differences (a[i] - b[i])^2 over i in [0, n), accumulated in _AccTp
+_AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
+{
+    _AccTp s = 0;
+    int i;
+    for( i = 0; i <= n - 4; i += 4 ) // main loop: four element pairs per iteration
+    {
+        _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3]; // subtraction happens in _Tp's arithmetic type; the result is then converted to _AccTp
+        s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
+    }
+    for( ; i < n; i++ ) // tail: the remaining 0..3 element pairs
+    {
+        _AccTp v = a[i] - b[i];
+        s += v*v;
+    }
+    return s;
+}
+
+CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
+CV_EXPORTS float normL1_(const float* a, const float* b, int n);
+CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
+    
+template<> inline float normL2Sqr(const float* a, const float* b, int n) // float specialization; an explicit specialization must not carry a storage-class specifier, so there is no 'static' here
+{
+    if( n >= 8 ) // longer inputs are handed to the exported normL2Sqr_ routine
+        return normL2Sqr_(a, b, n);
+    float s = 0;
+    for( int i = 0; i < n; i++ ) // short inputs (n < 8): simple inline loop
+    {
+        float v = a[i] - b[i];
+        s += v*v;
+    }
+    return s; // sum of squared differences (no square root is taken)
+}
+
+    
+template<typename _Tp, typename _AccTp> static inline // returns the sum of |a[i] - b[i]| over i in [0, n), accumulated in _AccTp
+_AccTp normL1(const _Tp* a, const _Tp* b, int n)
+{
+    _AccTp s = 0;
+    int i;
+    for( i = 0; i <= n - 4; i += 4 ) // main loop: four element pairs per iteration
+    {
+        _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3]; // differences are stored as _AccTp before std::abs is applied
+        s += std::abs(v0) + std::abs(v1) + std::abs(v2) + std::abs(v3);
+    }
+    for( ; i < n; i++ ) // tail: the remaining 0..3 element pairs
+    {
+        _AccTp v = a[i] - b[i];
+        s += std::abs(v);
+    }
+    return s;
+}
+
+template<> inline float normL1(const float* a, const float* b, int n) // float specialization; an explicit specialization must not carry a storage-class specifier, so there is no 'static' here
+{
+    if( n >= 8 ) // longer inputs are handed to the exported normL1_ routine
+        return normL1_(a, b, n);
+    float s = 0;
+    for( int i = 0; i < n; i++ ) // short inputs (n < 8): simple inline loop
+    {
+        float v = a[i] - b[i];
+        s += std::abs(v);
+    }
+    return s; // sum of absolute differences
+}
+
+template<> inline int normL1(const uchar* a, const uchar* b, int n) // uchar specialization; an explicit specialization must not carry a storage-class specifier, so there is no 'static' here
+{
+    return normL1_(a, b, n); // always forwards to the exported uchar normL1_ routine, whatever n is
+}    
+
+template<typename _Tp, typename _AccTp> static inline // returns the largest |a[i] - b[i]| over i in [0, n) as _AccTp
+_AccTp normInf(const _Tp* a, const _Tp* b, int n)
+{
+    _AccTp s = 0; // starting value; also the result when n <= 0
+    for( int i = 0; i < n; i++ )
+    {
+        _AccTp v0 = a[i] - b[i]; // difference is stored as _AccTp before std::abs is applied
+        s = std::max(s, std::abs(v0));
+    }
+    return s;
+}
+    

 template<typename _Tp, int m, int n> static inline
 double norm(const Matx<_Tp, m, n>& M)
 {
-    double s = 0;
-    for( int i = 0; i < m*n; i++ )
-        s += (double)M.val[i]*M.val[i];
-    return std::sqrt(s);
+    return std::sqrt(normL2Sqr<_Tp, double>(M.val, m*n)); // square root of the sum of squares of all m*n elements, accumulated in double
 }

    
 template<typename _Tp, int m, int n> static inline
 double norm(const Matx<_Tp, m, n>& M, int normType)
 {
-    if( normType == NORM_INF )
-    {
-        _Tp s = 0;
-        for( int i = 0; i < m*n; i++ )
-            s = std::max(s, std::abs(M.val[i]));
-        return s;
-    }
-    
-    if( normType == NORM_L1 )
-    {
-        _Tp s = 0;
-        for( int i = 0; i < m*n; i++ )
-            s += std::abs(M.val[i]);
-        return s;
-    }
-    
-    CV_DbgAssert( normType == NORM_L2 );
-    return norm(M);
+    return normType == NORM_INF ? (double)normInf<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n) : // NORM_INF: largest absolute element; 'typename' is required because work_type is a dependent type
+        normType == NORM_L1 ? (double)normL1<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n) : // NORM_L1: sum of absolute elements
+        std::sqrt((double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n)); // any other normType value is computed as the L2 norm (no assertion on normType here)
 }
    
    
@@ -1056,7 +1177,37 @@ template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::mul(const Vec<_
    for( int i = 0; i < cn; i++ ) w.val[i] = saturate_cast<_Tp>(this->val[i]*v.val[i]);
    return w;
 }
+
+template<typename _Tp> Vec<_Tp, 2> conjugate(const Vec<_Tp, 2>& v) // 2-element case: keeps v[0], negates v[1]
+{
+    return Vec<_Tp, 2>(v[0], -v[1]);
+}
+
+template<typename _Tp> Vec<_Tp, 4> conjugate(const Vec<_Tp, 4>& v) // 4-element case: keeps v[0], negates v[1], v[2] and v[3]
+{
+    return Vec<_Tp, 4>(v[0], -v[1], -v[2], -v[3]);
+}    
    
+template<> inline Vec<float, 2> Vec<float, 2>::conj() const // conj() is specialized here only for float/double vectors of 2 and 4 elements
+{
+    return conjugate(*this); // forwards to the 2-element conjugate() overload
+}
+
+template<> inline Vec<double, 2> Vec<double, 2>::conj() const
+{
+    return conjugate(*this); // forwards to the 2-element conjugate() overload
+}
+
+template<> inline Vec<float, 4> Vec<float, 4>::conj() const
+{
+    return conjugate(*this); // forwards to the 4-element conjugate() overload
+}
+
+template<> inline Vec<double, 4> Vec<double, 4>::conj() const
+{
+    return conjugate(*this); // forwards to the 4-element conjugate() overload
+}
+        
 template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::cross(const Vec<_Tp, cn>& v) const
 {
    CV_Error(CV_StsError, "for arbitrary-size vector there is no cross-product defined");
@@ -1155,7 +1306,33 @@ Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha)
        a[i] = saturate_cast<_Tp>(a[i]*alpha);
    return a;
 }
+
+template<typename _Tp, int cn> static inline // in-place division of every element of a by an int; returns a
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha)
+{
+    double ialpha = 1./alpha; // reciprocal is computed once, in double; the loop multiplies instead of dividing
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha); // product is converted back to _Tp via saturate_cast
+    return a;
+}
    
+template<typename _Tp, int cn> static inline // in-place division of every element of a by a float; returns a
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha)
+{
+    float ialpha = 1.f/alpha; // reciprocal is computed once, in float; the loop multiplies instead of dividing
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha); // product is converted back to _Tp via saturate_cast
+    return a;
+}
+
+template<typename _Tp, int cn> static inline // in-place division of every element of a by a double; returns a
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha)
+{
+    double ialpha = 1./alpha; // reciprocal is computed once; the loop multiplies instead of dividing
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha); // product is converted back to _Tp via saturate_cast
+    return a;
+}    
    
 template<typename _Tp, int cn> static inline Vec<_Tp, cn>
 operator * (const Vec<_Tp, cn>& a, int alpha)
@@ -1193,6 +1370,24 @@ operator * (double alpha, const Vec<_Tp, cn>& a)
    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
 }    

+template<typename _Tp, int cn> static inline Vec<_Tp, cn> // vector divided by an int, returned as a new Vec
+operator / (const Vec<_Tp, cn>& a, int alpha)
+{
+    return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp()); // same Matx_ScaleOp constructor as operator *, with the reciprocal (computed in double) as the factor
+}
+
+template<typename _Tp, int cn> static inline Vec<_Tp, cn> // vector divided by a float, returned as a new Vec
+operator / (const Vec<_Tp, cn>& a, float alpha)
+{
+    return Vec<_Tp, cn>(a, 1.f/alpha, Matx_ScaleOp()); // same Matx_ScaleOp constructor as operator *, with the reciprocal (computed in float) as the factor
+}    
+
+template<typename _Tp, int cn> static inline Vec<_Tp, cn> // vector divided by a double, returned as a new Vec
+operator / (const Vec<_Tp, cn>& a, double alpha)
+{
+    return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp()); // same Matx_ScaleOp constructor as operator *, with the reciprocal as the factor
+}        
+    
 template<typename _Tp, int cn> static inline Vec<_Tp, cn>
 operator - (const Vec<_Tp, cn>& a)
 {
@@ -1200,6 +1395,20 @@ operator - (const Vec<_Tp, cn>& a)
    for( int i = 0; i < cn; i++ ) t.val[i] = saturate_cast<_Tp>(-a.val[i]);
    return t;
 }
+
+template<typename _Tp> inline Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2) // product of two 4-element vectors; the term pattern below is the quaternion (Hamilton) product with element 0 as the scalar part
+{
+    return Vec<_Tp, 4>(saturate_cast<_Tp>(v1[0]*v2[0] - v1[1]*v2[1] - v1[2]*v2[2] - v1[3]*v2[3]), // element 0
+                       saturate_cast<_Tp>(v1[0]*v2[1] + v1[1]*v2[0] + v1[2]*v2[3] - v1[3]*v2[2]), // element 1
+                       saturate_cast<_Tp>(v1[0]*v2[2] - v1[1]*v2[3] + v1[2]*v2[0] + v1[3]*v2[1]), // element 2
+                       saturate_cast<_Tp>(v1[0]*v2[3] + v1[1]*v2[2] - v1[2]*v2[1] + v1[3]*v2[0])); // element 3
+}
+    
+template<typename _Tp> inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2) // in-place form of the 4-element operator *; returns v1
+{
+    v1 = v1 * v2; // the product is fully evaluated into a temporary before v1 is overwritten
+    return v1;
+}
    
 template<> inline Vec<float, 3> Vec<float, 3>::cross(const Vec<float, 3>& v) const
 {
@@ -1215,35 +1424,12 @@ template<> inline Vec<double, 3> Vec<double, 3>::cross(const Vec<double, 3>& v)
                     val[0]*v.val[1] - val[1]*v.val[0]);
 }

-template<typename T1, typename T2> static inline
-Vec<T1, 2>& operator += (Vec<T1, 2>& a, const Vec<T2, 2>& b)
+template<typename _Tp, int cn> inline Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v) // returns v scaled by 1/norm(v)
 {
-    a[0] = saturate_cast<T1>(a[0] + b[0]);
-    a[1] = saturate_cast<T1>(a[1] + b[1]);
-    return a;
+    double nv = norm(v);
+    return v * (nv ? 1./nv : 0.); // when the norm is zero the scale factor is 0, which avoids dividing by zero
 }
-
-template<typename T1, typename T2> static inline
-Vec<T1, 3>& operator += (Vec<T1, 3>& a, const Vec<T2, 3>& b)
-{
-    a[0] = saturate_cast<T1>(a[0] + b[0]);
-    a[1] = saturate_cast<T1>(a[1] + b[1]);
-    a[2] = saturate_cast<T1>(a[2] + b[2]);
-    return a;
-}
-
    
-template<typename T1, typename T2> static inline
-Vec<T1, 4>& operator += (Vec<T1, 4>& a, const Vec<T2, 4>& b)
-{
-    a[0] = saturate_cast<T1>(a[0] + b[0]);
-    a[1] = saturate_cast<T1>(a[1] + b[1]);
-    a[2] = saturate_cast<T1>(a[2] + b[2]);
-    a[3] = saturate_cast<T1>(a[3] + b[3]);
-    return a;
-}
-
-        
 template<typename _Tp, typename _T2, int cn> static inline
 VecCommaInitializer<_Tp, cn> operator << (const Vec<_Tp, cn>& vec, _T2 val)
 {
@@ -1898,8 +2084,8 @@ operator * (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
 {
    return Scalar_<_Tp>(saturate_cast<_Tp>(a[0]*b[0] - a[1]*b[1] - a[2]*b[2] - a[3]*b[3]),
                        saturate_cast<_Tp>(a[0]*b[1] + a[1]*b[0] + a[2]*b[3] - a[3]*b[2]),
-                        saturate_cast<_Tp>(a[0]*b[2] - a[1]*b[3] + a[2]*b[0] - a[3]*b[1]),
-                        saturate_cast<_Tp>(a[0]*b[3] + a[1]*b[2] - a[2]*b[1] - a[3]*b[0]));
+                        saturate_cast<_Tp>(a[0]*b[2] - a[1]*b[3] + a[2]*b[0] + a[3]*b[1]),
+                        saturate_cast<_Tp>(a[0]*b[3] + a[1]*b[2] - a[2]*b[1] + a[3]*b[0]));
 }
    
 template<typename _Tp> static inline Scalar_<_Tp>&
--- a/modules/core/src/cmdparser.cpp
+++ b/modules/core/src/cmdparser.cpp
@@ -282,7 +282,7 @@ template<typename _Tp>

 		cout << setw(col_p-2) << left << buf;

-		if (buf.length() > col_p-2) 
+		if ((int)buf.length() > col_p-2) 
 		{
 			cout << endl << "  ";
 			cout << setw(col_p-2) << left << " ";
@@ -293,7 +293,7 @@ template<typename _Tp>

 		while (true)
 		{
-			bool tr = (buf.length() > col_d-2) ? true: false;
+			bool tr = ((int)buf.length() > col_d-2) ? true: false;
 			int pos;

 			if (tr)
@@ -301,7 +301,8 @@ template<typename _Tp>
 				pos = buf.find_first_of(' ');
 				while (true)
 				{
-					if (buf.find_first_of(' ', pos + 1 ) < col_d-2 && buf.find_first_of(' ', pos + 1 ) != std::string::npos)
+					if ((int)buf.find_first_of(' ', pos + 1 ) < col_d-2 &&
+                        (int)buf.find_first_of(' ', pos + 1 ) != (int)std::string::npos)
 						pos = buf.find_first_of(' ', pos + 1);
 					else
 						break;
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2161,43 +2161,6 @@ static void generateRandomCenter(const vector<Vec2f>& box, float* center, RNG& r
 }


-static inline float distance(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        float CV_DECL_ALIGNED(16) buf[4];
-        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-
-        for( ; j <= n - 8; j += 8 )
-        {
-            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-        }
-        _mm_store_ps(buf, _mm_add_ps(d0, d1));
-        d = buf[0] + buf[1] + buf[2] + buf[3];
-    }
-    else
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
-
-    for( ; j < n; j++ )
-    {
-        float t = a[j] - b[j];
-        d += t*t;
-    }
-    return d;
-}
-
 /*
 k-means center initialization using the following algorithm:
 Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
@@ -2218,7 +2181,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,

    for( i = 0; i < N; i++ )
    {
-        dist[i] = distance(data + step*i, data + step*centers[0], dims);
+        dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
        sum0 += dist[i];
    }
    
@@ -2236,7 +2199,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
            int ci = i;
            for( i = 0; i < N; i++ )
            {
-                tdist2[i] = std::min(distance(data + step*i, data + step*ci, dims), dist[i]);
+                tdist2[i] = std::min(normL2Sqr_(data + step*i, data + step*ci, dims), dist[i]);
                s += tdist2[i];
            }
            
@@ -2434,7 +2397,7 @@ double cv::kmeans( InputArray _data, int K,
                for( k = 0; k < K; k++ )
                {
                    const float* center = centers.ptr<float>(k);
-                    double dist = distance(sample, center, dims);
+                    double dist = normL2Sqr_(sample, center, dims);

                    if( min_dist > dist )
                    {
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -810,15 +810,218 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
 namespace cv
 {

+float normL2Sqr_(const float* a, const float* b, int n) // returns the sum of squared differences (a[j] - b[j])^2 over j in [0, n); no square root is taken
+{
+    int j = 0; float d = 0.f;
+#if CV_SSE
+    if( USE_SSE2 )
+    {
+        float CV_DECL_ALIGNED(16) buf[4]; // aligned scratch used to read the four partial sums back out of the vector register
+        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); // two independent vector accumulators
+        
+        for( ; j <= n - 8; j += 8 ) // eight floats per iteration, loaded with unaligned loads
+        {
+            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
+            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
+        }
+        _mm_store_ps(buf, _mm_add_ps(d0, d1)); // merge both accumulators, then add up the four lanes
+        d = buf[0] + buf[1] + buf[2] + buf[3];
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 ) // scalar path: four elements per iteration
+        {
+            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
+            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
+        }
+    }
+    
+    for( ; j < n; j++ ) // shared tail: whatever either path left unprocessed
+    {
+        float t = a[j] - b[j];
+        d += t*t;
+    }
+    return d;
+}
+
+    
+float normL1_(const float* a, const float* b, int n) // returns the sum of |a[j] - b[j]| over j in [0, n)
+{
+    int j = 0; float d = 0.f;
+#if CV_SSE
+    if( USE_SSE2 )
+    {
+        float CV_DECL_ALIGNED(16) buf[4]; // aligned scratch used to read the four partial sums back out of the vector register
+        static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; // must be an int array: as a float initializer 0x7fffffff would be converted to the value 2147483648.0f instead of giving the bit mask
+        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); // two independent vector accumulators
+        __m128 absmask = _mm_load_ps((const float*)absbuf); // every bit set except the sign bit; AND-ing a float with it yields its absolute value
+        
+        for( ; j <= n - 8; j += 8 ) // eight floats per iteration, loaded with unaligned loads
+        {
+            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+            d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask)); // clear the sign bits, then accumulate
+            d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
+        }
+        _mm_store_ps(buf, _mm_add_ps(d0, d1)); // merge both accumulators, then add up the four lanes
+        d = buf[0] + buf[1] + buf[2] + buf[3];
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 ) // scalar path: four elements per iteration
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+                std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+    
+    for( ; j < n; j++ ) // shared tail: whatever either path left unprocessed
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+int normL1_(const uchar* a, const uchar* b, int n) // returns the sum of |a[j] - b[j]| over j in [0, n) as an int
+{
+    int j = 0, d = 0;
+#if CV_SSE
+    if( USE_SSE2 )
+    {
+        __m128i d0 = _mm_setzero_si128(); // vector accumulator for the _mm_sad_epu8 results
+        
+        for( ; j <= n - 16; j += 16 ) // sixteen bytes per iteration, loaded with unaligned loads
+        {
+            __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
+            __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
+            
+            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); // _mm_sad_epu8 sums absolute byte differences (one partial sum per 64-bit half)
+        }
+
+        for( ; j <= n - 4; j += 4 ) // leftover groups of four bytes, each read through an int and placed in the low 32 bits of a vector
+        {
+            __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
+            __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
+            
+            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
+        }
+        d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); // add the upper 64-bit half onto the lower one and take the low 32 bits
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 ) // scalar path: four elements per iteration
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+                std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+    
+    for( ; j < n; j++ ) // shared tail: whatever either path left unprocessed
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+static const uchar popCountTable[] = 
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+};
+    
+static const uchar popCountTable2[] =
+{
+    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+};
+    
+static const uchar popCountTable4[] =
+{
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+    
+int normHamming(const uchar* a, const uchar* b, int n) // returns the sum over i in [0, n) of popCountTable[a[i] ^ b[i]] (the NEON path computes per-byte bit counts of the XOR directly)
+{
+    int i = 0, result = 0;
+#if defined __GNUC__ && CV_NEON
+    if (CPU_HAS_NEON_FEATURE)
+    {
+        result = 0;  
+        for( ; i <= n - 16; i += 16 ) // sixteen bytes per iteration
+        {
+            uint8x16_t A_vec = vld1q_u8 (a + i);
+            uint8x16_t B_vec = vld1q_u8 (b + i);
+            //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
+            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
+            
+            uint8x16_t bitsSet = vcntq_u8 (AxorB); // per-byte count of set bits in the XOR
+            //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
+            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); // pairwise widening adds: 16 x u8 -> 8 x u16 -> 4 x u32 -> 2 x u64
+            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+            
+            uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
+            result += vgetq_lane_u64 (bitSet2,0); // add both 64-bit lanes to the running total
+            result += vgetq_lane_u64 (bitSet2,1);
+        }
+    }
+    else
+#endif
+        for( ; i <= n - 4; i += 4 ) // table path: four bytes per iteration
+            result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
+                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
+    for( ; i < n; i++ ) // shared tail: whatever either path left unprocessed
+        result += popCountTable[a[i] ^ b[i]];
+    return result;
+}
+    
+int normHamming(const uchar* a, const uchar* b, int n, int cellSize) // table-driven distance over n bytes; cellSize (1, 2 or 4) selects which lookup table is applied to a[i] ^ b[i]
+{
+    if( cellSize == 1 ) // cellSize 1 is handled by the three-argument overload
+        return normHamming(a, b, n);
+    const uchar* tab = 0;
+    if( cellSize == 2 )
+        tab = popCountTable2; // going by the table contents, entries count the non-zero 2-bit groups of a byte
+    else if( cellSize == 4 )
+        tab = popCountTable4; // going by the table contents, entries count the non-zero 4-bit groups of a byte
+    else
+        CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" ); // any other cellSize is reported as an error
+    int i = 0, result = 0;
+    for( ; i <= n - 4; i += 4 ) // four bytes per iteration
+        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
+            tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
+    for( ; i < n; i++ ) // tail: the remaining 0..3 bytes
+        result += tab[a[i] ^ b[i]];
+    return result;
+}
+    
+    
 template<typename T, typename ST> int
 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 {
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result = std::max(result, ST(std::abs(src[i])));
+        result = std::max(result, normInf<T, ST>(src, len*cn));
    }
    else
    {
@@ -826,7 +1029,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result = std::max(result, ST(std::abs(src[k])));
+                    result = std::max(result, ST(fast_abs(src[k])));
            }
    }
    *_result = result;
@@ -839,9 +1042,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result += std::abs(src[i]);
+        result += normL1<T, ST>(src, len*cn);
    }
    else
    {
@@ -849,7 +1050,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result += std::abs(src[k]);
+                    result += fast_abs(src[k]);
            }
    }
    *_result = result;
@@ -862,12 +1063,7 @@ normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-        {
-            T v = src[i];
-            result += (ST)v*v;
-        }
+        result += normL2Sqr<T, ST>(src, len*cn);
    }
    else
    {
@@ -891,9 +1087,7 @@ normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int l
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result = std::max(result, (ST)std::abs(src1[i] - src2[i]));
+        result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
    }
    else
    {
@@ -914,9 +1108,7 @@ normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result += std::abs(src1[i] - src2[i]);
+        result += normL1<T, ST>(src1, src2, len*cn);
    }
    else
    {
@@ -937,12 +1129,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-        {
-            ST v = src1[i] - src2[i];
-            result += v*v;
-        }
+        result += normL2Sqr<T, ST>(src1, src2, len*cn);
    }
    else
    {