fully implemented SSE and NEON cases of intrin.hpp; extended the HAL with some basic math functions

Vadim Pisarevsky
2015-04-16 23:00:26 +03:00
parent a2bba1b9e6
commit ee11a2d266
18 changed files with 2460 additions and 2003 deletions

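The hunks below are the caller-side half of the change: the LU/Cholesky and norm kernels are deleted from lapack.cpp and stat.cpp, and their callers switch to cv::hal:: entry points (normL2Sqr_ also loses its trailing underscore, and normHamming now returns its result instead of filling an out-parameter). One file's diff is suppressed for size further down; only the call-site updates are reproduced here. For orientation, the HAL signatures as they can be read off those call sites — an inferred summary, not a copy of the new header:

    // Inferred from the call sites in this commit; the header location and exact
    // declarations at this revision may differ.
    #include <cstddef>              // size_t
    typedef unsigned char uchar;    // OpenCV's usual typedef

    namespace cv { namespace hal {
        // in-place LU with partial pivoting; returns the pivot sign, 0 if singular
        int  LU(float*  A, size_t astep, int m, float*  b, size_t bstep, int n);
        int  LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
        // in-place Cholesky solve; returns false if A is not positive definite
        bool Cholesky(float*  A, size_t astep, int m, float*  b, size_t bstep, int n);
        bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
        // Hamming distances now come back as the return value
        int  normHamming(const uchar* a, int n);
        int  normHamming(const uchar* a, const uchar* b, int n);
        int  normHamming(const uchar* a, int n, int cellSize);
        int  normHamming(const uchar* a, const uchar* b, int n, int cellSize);
    }}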

@@ -79,7 +79,7 @@ public:
for ( int i = begin; i<end; i++ )
{
tdist2[i] = std::min(normL2Sqr_(data + step*i, data + stepci, dims), dist[i]);
tdist2[i] = std::min(normL2Sqr(data + step*i, data + stepci, dims), dist[i]);
}
}
@@ -114,7 +114,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
for( i = 0; i < N; i++ )
{
dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
dist[i] = normL2Sqr(data + step*i, data + step*centers[0], dims);
sum0 += dist[i];
}
@@ -189,7 +189,7 @@ public:
for( int k = 0; k < K; k++ )
{
const float* center = centers.ptr<float>(k);
const double dist = normL2Sqr_(sample, center, dims);
const double dist = normL2Sqr(sample, center, dims);
if( min_dist > dist )
{
@@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K,
if( labels[i] != max_k )
continue;
sample = data.ptr<float>(i);
double dist = normL2Sqr_(sample, _old_center, dims);
double dist = normL2Sqr(sample, _old_center, dims);
if( max_dist <= dist )
{

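The kmeans hunks above only rename the distance kernel (normL2Sqr_ -> normL2Sqr); the public clustering API they implement is untouched. A minimal usage sketch of that public entry point, for context:

    #include <opencv2/core.hpp>

    // cv::kmeans with kmeans++ seeding, the code path the hunks above belong to.
    void clusterPoints(const cv::Mat& points /* CV_32F, one sample per row */)
    {
        cv::Mat labels, centers;
        cv::kmeans(points, /*K=*/5, labels,
                   cv::TermCriteria(cv::TermCriteria::EPS + cv::TermCriteria::COUNT, 10, 1.0),
                   /*attempts=*/3, cv::KMEANS_PP_CENTERS, centers);
    }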

@@ -50,168 +50,6 @@
namespace cv
{
/****************************************************************************************\
* LU & Cholesky implementation for small matrices *
\****************************************************************************************/
template<typename _Tp> static inline int
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
int i, j, k, p = 1;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
k = i;
for( j = i+1; j < m; j++ )
if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
k = j;
if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
return 0;
if( k != i )
{
for( j = i; j < m; j++ )
std::swap(A[i*astep + j], A[k*astep + j]);
if( b )
for( j = 0; j < n; j++ )
std::swap(b[i*bstep + j], b[k*bstep + j]);
p = -p;
}
_Tp d = -1/A[i*astep + i];
for( j = i+1; j < m; j++ )
{
_Tp alpha = A[j*astep + i]*d;
for( k = i+1; k < m; k++ )
A[j*astep + k] += alpha*A[i*astep + k];
if( b )
for( k = 0; k < n; k++ )
b[j*bstep + k] += alpha*b[i*bstep + k];
}
A[i*astep + i] = -d;
}
if( b )
{
for( i = m-1; i >= 0; i-- )
for( j = 0; j < n; j++ )
{
_Tp s = b[i*bstep + j];
for( k = i+1; k < m; k++ )
s -= A[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = s*A[i*astep + i];
}
}
return p;
}
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
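The LU wrappers above are part of the block this commit deletes from lapack.cpp; the hunks further down call the same routine as cv::hal::LU instead. A minimal sketch of driving that in-place solver directly, assuming the byte-step signature used by those call sites (the exact header that declares it at this revision is not shown here):

    #include <opencv2/core.hpp>   // plus the HAL header declaring cv::hal::LU
    #include <cstdio>

    int main()
    {
        // Solve A*x = b in place; A is 3x3 row-major, b holds one column.
        float A[3*3] = { 4, 1, 2,
                         1, 3, 0,
                         2, 0, 5 };
        float b[3]   = { 1, 2, 3 };   // overwritten with the solution x

        // steps are in bytes, exactly as Mat::step is passed in the hunks below
        int ret = cv::hal::LU(A, 3*sizeof(float), 3, b, sizeof(float), 1);
        if( ret == 0 )
            std::printf("matrix is singular\n");
        else
            std::printf("x = (%g, %g, %g), pivot sign %d\n", b[0], b[1], b[2], ret);
        return 0;
    }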
template<typename _Tp> static inline bool
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
_Tp* L = A;
int i, j, k;
double s;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
for( j = 0; j < i; j++ )
{
s = A[i*astep + j];
for( k = 0; k < j; k++ )
s -= L[i*astep + k]*L[j*astep + k];
L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
}
s = A[i*astep + i];
for( k = 0; k < j; k++ )
{
double t = L[i*astep + k];
s -= t*t;
}
if( s < std::numeric_limits<_Tp>::epsilon() )
return false;
L[i*astep + i] = (_Tp)(1./std::sqrt(s));
}
if( !b )
return true;
// LLt x = b
// 1: L y = b
// 2. Lt x = y
/*
 [ L00                ]  y0   b0
 [ L10 L11            ]  y1 = b1
 [ L20 L21 L22        ]  y2   b2
 [ L30 L31 L32 L33    ]  y3   b3

 [ L00 L10 L20 L30    ]  x0   y0
 [     L11 L21 L31    ]  x1 = y1
 [         L22 L32    ]  x2   y2
 [             L33    ]  x3   y3
*/
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = 0; k < i; k++ )
s -= L[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
for( i = m-1; i >= 0; i-- )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = m-1; k > i; k-- )
s -= L[k*astep + i]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
return true;
}
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
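The Cholesky wrappers above are removed here as well; invert() and solve() below reach them through cv::hal::Cholesky. A short, hedged sketch of the same in-place use for a symmetric positive-definite system (A is overwritten by the factorization, b by the solution):

    #include <opencv2/core.hpp>   // plus the HAL header declaring cv::hal::Cholesky

    // Returns false when A is not positive definite, mirroring the bool result above.
    bool solveSPD(double* A /* m*m, row-major */, double* b /* m values */, int m)
    {
        return cv::hal::Cholesky(A, m*sizeof(double), m, b, sizeof(double), 1);
    }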
template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
{
a = std::abs(a);
@@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat )
Mat a(rows, rows, CV_32F, (uchar*)buffer);
mat.copyTo(a);
result = LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
if( result )
{
for( int i = 0; i < rows; i++ )
@@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat )
Mat a(rows, rows, CV_64F, (uchar*)buffer);
mat.copyTo(a);
result = LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
if( result )
{
for( int i = 0; i < rows; i++ )
@@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
setIdentity(dst);
if( method == DECOMP_LU && type == CV_32F )
result = LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
else if( method == DECOMP_LU && type == CV_64F )
result = LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
else if( method == DECOMP_CHOLESKY && type == CV_32F )
result = Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
else
result = Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
if( !result )
dst = Scalar(0);
@@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
if( method == DECOMP_LU )
{
if( type == CV_32F )
result = LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
else
result = LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
}
else if( method == DECOMP_CHOLESKY )
{
if( type == CV_32F )
result = Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
else
result = Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
}
else
{

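The invert()/solve() hunks above are pure call-site rewiring; the public API on top of them does not change. The usual way to exercise both decompositions stays:

    #include <opencv2/core.hpp>

    // DECOMP_LU routes to hal::LU, DECOMP_CHOLESKY to hal::Cholesky (SPD matrices only).
    void solveBoth(const cv::Mat& A, const cv::Mat& b, cv::Mat& x1, cv::Mat& x2)
    {
        cv::solve(A, b, x1, cv::DECOMP_LU);
        cv::solve(A, b, x2, cv::DECOMP_CHOLESKY);
    }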
File diff suppressed because it is too large


@@ -2416,140 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
namespace cv
{
float normL2Sqr_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
if( USE_SSE2 )
{
float CV_DECL_ALIGNED(16) buf[4];
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
}
else
#endif
{
for( ; j <= n - 4; j += 4 )
{
float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
}
}
for( ; j < n; j++ )
{
float t = a[j] - b[j];
d += t*t;
}
return d;
}
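The normL2Sqr_ kernel above (removed from stat.cpp by this commit) only had an SSE fast path. Since the commit's headline is the SSE/NEON work in intrin.hpp, here is what the same reduction looks like spelled with NEON intrinsics — a hedged illustration, not the code this commit actually adds elsewhere:

    #ifdef __ARM_NEON             // OpenCV itself would use the CV_NEON guard
    #include <arm_neon.h>

    static float normL2Sqr_neon(const float* a, const float* b, int n)
    {
        int j = 0;
        float32x4_t v_sum = vdupq_n_f32(0.0f);
        for( ; j <= n - 4; j += 4 )
        {
            float32x4_t t = vsubq_f32(vld1q_f32(a + j), vld1q_f32(b + j));
            v_sum = vmlaq_f32(v_sum, t, t);           // v_sum += t*t, lane-wise
        }
        float buf[4];                                 // vst1q_f32 needs no special alignment
        vst1q_f32(buf, v_sum);
        float d = buf[0] + buf[1] + buf[2] + buf[3];  // horizontal sum
        for( ; j < n; j++ )
        {
            float t = a[j] - b[j];
            d += t*t;                                 // scalar tail
        }
        return d;
    }
    #endif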
float normL1_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
if( USE_SSE2 )
{
float CV_DECL_ALIGNED(16) buf[4];
static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
__m128 absmask = _mm_load_ps((const float*)absbuf);
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
}
else
#elif CV_NEON
float32x4_t v_sum = vdupq_n_f32(0.0f);
for ( ; j <= n - 4; j += 4)
v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
float CV_DECL_ALIGNED(16) buf[4];
vst1q_f32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
int normL1_(const uchar* a, const uchar* b, int n)
{
int j = 0, d = 0;
#if CV_SSE
if( USE_SSE2 )
{
__m128i d0 = _mm_setzero_si128();
for( ; j <= n - 16; j += 16 )
{
__m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
__m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
for( ; j <= n - 4; j += 4 )
{
__m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
__m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
}
else
#elif CV_NEON
uint32x4_t v_sum = vdupq_n_u32(0.0f);
for ( ; j <= n - 16; j += 16)
{
uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
}
uint CV_DECL_ALIGNED(16) buf[4];
vst1q_u32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
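A note on the two normL1_ kernels above, since the removed layout is easy to misread: under CV_SSE the braced block after #endif is the else-branch scalar fallback of if(USE_SSE2); under CV_NEON it is an anonymous scope whose loop only processes what the wider NEON loop left (nothing at all in the float kernel, up to three groups of four in the uchar kernel); with neither flag set it is the whole implementation. Callers never see any of this — equivalent sums of absolute differences are reachable through the public API, e.g.:

    #include <opencv2/core.hpp>

    // Sum of absolute differences between two arrays of the same size and type.
    double l1Distance(const cv::Mat& a, const cv::Mat& b)
    {
        return cv::norm(a, b, cv::NORM_L1);
    }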
template<typename T, typename ST> int
normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
{
@@ -2564,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
result = std::max(result, ST(std::abs(src[k])));
result = std::max(result, ST(cv_abs(src[k])));
}
}
*_result = result;
@@ -2585,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
result += std::abs(src[k]);
result += cv_abs(src[k]);
}
}
*_result = result;
@@ -2684,9 +2550,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
{
int result = 0;
cv::hal::normHamming(a, b, size, result);
return result;
return cv::hal::normHamming(a, b, size);
}
#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
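The Hamming functor hunk above shows the shape of the normHamming change that repeats through the rest of this file: the HAL routine now returns the distance instead of writing to an out-parameter. A before/after sketch of the calling convention (the exact header declaring cv::hal::normHamming at this revision is not shown here):

    #include <opencv2/core.hpp>

    int hammingDistance(const uchar* a, const uchar* b, int nbytes)
    {
        // before this commit:
        //     int result = 0;
        //     cv::hal::normHamming(a, b, nbytes, result);
        //     return result;
        // after this commit:
        return cv::hal::normHamming(a, b, nbytes);   // popcount of a XOR b over nbytes bytes
    }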
@@ -3037,16 +2901,12 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
if( normType == NORM_HAMMING )
{
int result = 0;
cv::hal::normHamming(data, (int)len, result);
return result;
return hal::normHamming(data, (int)len);
}
if( normType == NORM_HAMMING2 )
{
int result = 0;
hal::normHamming(data, (int)len, 2, result);
return result;
return hal::normHamming(data, (int)len, 2);
}
}
}
@@ -3072,9 +2932,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
int one = 0;
cv::hal::normHamming(ptrs[0], total, cellSize, one);
result += one;
result += hal::normHamming(ptrs[0], total, cellSize);
}
return result;
@@ -3558,9 +3416,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
for( size_t i = 0; i < it.nplanes; i++, ++it )
{
int one = 0;
hal::normHamming(ptrs[0], ptrs[1], total, cellSize, one);
result += one;
result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
}
return result;
@@ -3698,7 +3554,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
if( !mask )
{
for( int i = 0; i < nvecs; i++ )
hal::normHamming(src1, src2 + step2*i, len, dist[i]);
dist[i] = hal::normHamming(src1, src2 + step2*i, len);
}
else
{
@@ -3706,7 +3562,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
for( int i = 0; i < nvecs; i++ )
{
if (mask[i])
hal::normHamming(src1, src2 + step2*i, len, dist[i]);
dist[i] = hal::normHamming(src1, src2 + step2*i, len);
else
dist[i] = val0;
}
@@ -3720,7 +3576,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
if( !mask )
{
for( int i = 0; i < nvecs; i++ )
hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]);
dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
}
else
{
@@ -3728,7 +3584,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
for( int i = 0; i < nvecs; i++ )
{
if (mask[i])
hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]);
dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
else
dist[i] = val0;
}
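The batchDistHamming/batchDistHamming2 helpers patched above sit underneath cv::batchDistance; a short usage sketch for Hamming matching of binary descriptors (one CV_8U row per descriptor, as produced by ORB/BRIEF):

    #include <opencv2/core.hpp>

    void hammingTable(const cv::Mat& query, const cv::Mat& train, cv::Mat& dist)
    {
        // dist(i,j) = Hamming distance between query row i and train row j
        cv::batchDistance(query, train, dist, CV_32S, cv::noArray(), cv::NORM_HAMMING);
    }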