fully implemented SSE and NEON cases of intrin.hpp; extended the HAL with some basic math functions
This commit is contained in:
@@ -81,28 +81,17 @@ float normL1_(const float* a, const float* b, int n);
|
||||
float normL2Sqr_(const float* a, const float* b, int n);
|
||||
|
||||
void exp(const float* src, float* dst, int n);
|
||||
void exp(const double* src, double* dst, int n);
|
||||
void log(const float* src, float* dst, int n);
|
||||
void log(const double* src, double* dst, int n);
|
||||
|
||||
void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
|
||||
void magnitude(const float* x, const float* y, float* dst, int n);
|
||||
|
||||
/** @brief Computes the cube root of an argument.
|
||||
|
||||
The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
|
||||
NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
|
||||
single-precision data.
|
||||
@param val A function argument.
|
||||
*/
|
||||
float cubeRoot(float val);
|
||||
|
||||
/** @brief Calculates the angle of a 2D vector in degrees.
|
||||
|
||||
The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
|
||||
in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
|
||||
@param x x-coordinate of the vector.
|
||||
@param y y-coordinate of the vector.
|
||||
*/
|
||||
float fastAtan2(float y, float x);
|
||||
void magnitude(const double* x, const double* y, double* dst, int n);
|
||||
void sqrt(const float* src, float* dst, int len);
|
||||
void sqrt(const double* src, double* dst, int len);
|
||||
void invSqrt(const float* src, float* dst, int len);
|
||||
void invSqrt(const double* src, double* dst, int len);
|
||||
|
||||
}} //cv::hal
|
||||
|
||||
|
||||
@@ -380,7 +380,7 @@ cvRound( double value )
|
||||
TEGRA_ROUND_DBL(value);
|
||||
#elif defined CV_ICC || defined __GNUC__
|
||||
# if CV_VFP
|
||||
ARM_ROUND_DBL(value)
|
||||
ARM_ROUND_DBL(value);
|
||||
# else
|
||||
return (int)lrint(value);
|
||||
# endif
|
||||
@@ -488,7 +488,7 @@ CV_INLINE int cvRound(float value)
|
||||
TEGRA_ROUND_FLT(value);
|
||||
#elif defined CV_ICC || defined __GNUC__
|
||||
# if CV_VFP
|
||||
ARM_ROUND_FLT(value)
|
||||
ARM_ROUND_FLT(value);
|
||||
# else
|
||||
return (int)lrintf(value);
|
||||
# endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -44,4 +44,165 @@
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
/****************************************************************************************\
|
||||
* LU & Cholesky implementation for small matrices *
|
||||
\****************************************************************************************/
|
||||
|
||||
template<typename _Tp> static inline int
|
||||
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
|
||||
{
|
||||
int i, j, k, p = 1;
|
||||
astep /= sizeof(A[0]);
|
||||
bstep /= sizeof(b[0]);
|
||||
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
k = i;
|
||||
|
||||
for( j = i+1; j < m; j++ )
|
||||
if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
|
||||
k = j;
|
||||
|
||||
if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
|
||||
return 0;
|
||||
|
||||
if( k != i )
|
||||
{
|
||||
for( j = i; j < m; j++ )
|
||||
std::swap(A[i*astep + j], A[k*astep + j]);
|
||||
if( b )
|
||||
for( j = 0; j < n; j++ )
|
||||
std::swap(b[i*bstep + j], b[k*bstep + j]);
|
||||
p = -p;
|
||||
}
|
||||
|
||||
_Tp d = -1/A[i*astep + i];
|
||||
|
||||
for( j = i+1; j < m; j++ )
|
||||
{
|
||||
_Tp alpha = A[j*astep + i]*d;
|
||||
|
||||
for( k = i+1; k < m; k++ )
|
||||
A[j*astep + k] += alpha*A[i*astep + k];
|
||||
|
||||
if( b )
|
||||
for( k = 0; k < n; k++ )
|
||||
b[j*bstep + k] += alpha*b[i*bstep + k];
|
||||
}
|
||||
|
||||
A[i*astep + i] = -d;
|
||||
}
|
||||
|
||||
if( b )
|
||||
{
|
||||
for( i = m-1; i >= 0; i-- )
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
_Tp s = b[i*bstep + j];
|
||||
for( k = i+1; k < m; k++ )
|
||||
s -= A[i*astep + k]*b[k*bstep + j];
|
||||
b[i*bstep + j] = s*A[i*astep + i];
|
||||
}
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
|
||||
{
|
||||
return LUImpl(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
|
||||
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
|
||||
{
|
||||
return LUImpl(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
|
||||
template<typename _Tp> static inline bool
|
||||
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
|
||||
{
|
||||
_Tp* L = A;
|
||||
int i, j, k;
|
||||
double s;
|
||||
astep /= sizeof(A[0]);
|
||||
bstep /= sizeof(b[0]);
|
||||
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
for( j = 0; j < i; j++ )
|
||||
{
|
||||
s = A[i*astep + j];
|
||||
for( k = 0; k < j; k++ )
|
||||
s -= L[i*astep + k]*L[j*astep + k];
|
||||
L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
|
||||
}
|
||||
s = A[i*astep + i];
|
||||
for( k = 0; k < j; k++ )
|
||||
{
|
||||
double t = L[i*astep + k];
|
||||
s -= t*t;
|
||||
}
|
||||
if( s < std::numeric_limits<_Tp>::epsilon() )
|
||||
return false;
|
||||
L[i*astep + i] = (_Tp)(1./std::sqrt(s));
|
||||
}
|
||||
|
||||
if( !b )
|
||||
return true;
|
||||
|
||||
// LLt x = b
|
||||
// 1: L y = b
|
||||
// 2. Lt x = y
|
||||
|
||||
/*
|
||||
[ L00 ] y0 b0
|
||||
[ L10 L11 ] y1 = b1
|
||||
[ L20 L21 L22 ] y2 b2
|
||||
[ L30 L31 L32 L33 ] y3 b3
|
||||
|
||||
[ L00 L10 L20 L30 ] x0 y0
|
||||
[ L11 L21 L31 ] x1 = y1
|
||||
[ L22 L32 ] x2 y2
|
||||
[ L33 ] x3 y3
|
||||
*/
|
||||
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
s = b[i*bstep + j];
|
||||
for( k = 0; k < i; k++ )
|
||||
s -= L[i*astep + k]*b[k*bstep + j];
|
||||
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
|
||||
}
|
||||
}
|
||||
|
||||
for( i = m-1; i >= 0; i-- )
|
||||
{
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
s = b[i*bstep + j];
|
||||
for( k = m-1; k > i; k-- )
|
||||
s -= L[k*astep + i]*b[k*bstep + j];
|
||||
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
|
||||
{
|
||||
return CholImpl(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
|
||||
{
|
||||
return CholImpl(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
@@ -42,3 +42,7 @@
|
||||
|
||||
#include "opencv2/hal.hpp"
|
||||
#include "opencv2/hal/intrin.hpp"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <float.h>
|
||||
|
||||
@@ -80,10 +80,10 @@ static const uchar popCountTable4[] =
|
||||
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
|
||||
};
|
||||
|
||||
Error::Code normHamming(const uchar* a, int n, int & result)
|
||||
int normHamming(const uchar* a, int n)
|
||||
{
|
||||
int i = 0;
|
||||
result = 0;
|
||||
int result = 0;
|
||||
#if CV_NEON
|
||||
{
|
||||
uint32x4_t bits = vmovq_n_u32(0);
|
||||
@@ -104,13 +104,13 @@ Error::Code normHamming(const uchar* a, int n, int & result)
|
||||
popCountTable[a[i+2]] + popCountTable[a[i+3]];
|
||||
for( ; i < n; i++ )
|
||||
result += popCountTable[a[i]];
|
||||
return Error::Ok;
|
||||
return result;
|
||||
}
|
||||
|
||||
Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result)
|
||||
int normHamming(const uchar* a, const uchar* b, int n)
|
||||
{
|
||||
int i = 0;
|
||||
result = 0;
|
||||
int result = 0;
|
||||
#if CV_NEON
|
||||
{
|
||||
uint32x4_t bits = vmovq_n_u32(0);
|
||||
@@ -133,44 +133,44 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result)
|
||||
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
|
||||
for( ; i < n; i++ )
|
||||
result += popCountTable[a[i] ^ b[i]];
|
||||
return Error::Ok;
|
||||
return result;
|
||||
}
|
||||
|
||||
Error::Code normHamming(const uchar* a, int n, int cellSize, int & result)
|
||||
int normHamming(const uchar* a, int n, int cellSize)
|
||||
{
|
||||
if( cellSize == 1 )
|
||||
return normHamming(a, n, result);
|
||||
return normHamming(a, n);
|
||||
const uchar* tab = 0;
|
||||
if( cellSize == 2 )
|
||||
tab = popCountTable2;
|
||||
else if( cellSize == 4 )
|
||||
tab = popCountTable4;
|
||||
else
|
||||
return Error::Unknown;
|
||||
return -1;
|
||||
int i = 0;
|
||||
result = 0;
|
||||
int result = 0;
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
|
||||
#endif
|
||||
for( ; i < n; i++ )
|
||||
result += tab[a[i]];
|
||||
return Error::Ok;
|
||||
return result;
|
||||
}
|
||||
|
||||
Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int & result)
|
||||
int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
|
||||
{
|
||||
if( cellSize == 1 )
|
||||
return normHamming(a, b, n, result);
|
||||
return normHamming(a, b, n);
|
||||
const uchar* tab = 0;
|
||||
if( cellSize == 2 )
|
||||
tab = popCountTable2;
|
||||
else if( cellSize == 4 )
|
||||
tab = popCountTable4;
|
||||
else
|
||||
return Error::Unknown;
|
||||
return -1;
|
||||
int i = 0;
|
||||
result = 0;
|
||||
int result = 0;
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
|
||||
@@ -178,7 +178,129 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int
|
||||
#endif
|
||||
for( ; i < n; i++ )
|
||||
result += tab[a[i] ^ b[i]];
|
||||
return Error::Ok;
|
||||
return result;
|
||||
}
|
||||
|
||||
float normL2Sqr_(const float* a, const float* b, int n)
|
||||
{
|
||||
int j = 0; float d = 0.f;
|
||||
#if CV_SSE
|
||||
float CV_DECL_ALIGNED(16) buf[4];
|
||||
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
|
||||
|
||||
for( ; j <= n - 8; j += 8 )
|
||||
{
|
||||
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
|
||||
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
|
||||
d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
|
||||
d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
|
||||
}
|
||||
_mm_store_ps(buf, _mm_add_ps(d0, d1));
|
||||
d = buf[0] + buf[1] + buf[2] + buf[3];
|
||||
#endif
|
||||
{
|
||||
for( ; j <= n - 4; j += 4 )
|
||||
{
|
||||
float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
|
||||
d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
|
||||
}
|
||||
}
|
||||
|
||||
for( ; j < n; j++ )
|
||||
{
|
||||
float t = a[j] - b[j];
|
||||
d += t*t;
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
|
||||
float normL1_(const float* a, const float* b, int n)
|
||||
{
|
||||
int j = 0; float d = 0.f;
|
||||
#if CV_SSE
|
||||
float CV_DECL_ALIGNED(16) buf[4];
|
||||
static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
|
||||
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
|
||||
__m128 absmask = _mm_load_ps((const float*)absbuf);
|
||||
|
||||
for( ; j <= n - 8; j += 8 )
|
||||
{
|
||||
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
|
||||
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
|
||||
d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
|
||||
d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
|
||||
}
|
||||
_mm_store_ps(buf, _mm_add_ps(d0, d1));
|
||||
d = buf[0] + buf[1] + buf[2] + buf[3];
|
||||
#elif CV_NEON
|
||||
float32x4_t v_sum = vdupq_n_f32(0.0f);
|
||||
for ( ; j <= n - 4; j += 4)
|
||||
v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
|
||||
|
||||
float CV_DECL_ALIGNED(16) buf[4];
|
||||
vst1q_f32(buf, v_sum);
|
||||
d = buf[0] + buf[1] + buf[2] + buf[3];
|
||||
#endif
|
||||
{
|
||||
for( ; j <= n - 4; j += 4 )
|
||||
{
|
||||
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
|
||||
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
|
||||
}
|
||||
}
|
||||
|
||||
for( ; j < n; j++ )
|
||||
d += std::abs(a[j] - b[j]);
|
||||
return d;
|
||||
}
|
||||
|
||||
int normL1_(const uchar* a, const uchar* b, int n)
|
||||
{
|
||||
int j = 0, d = 0;
|
||||
#if CV_SSE
|
||||
__m128i d0 = _mm_setzero_si128();
|
||||
|
||||
for( ; j <= n - 16; j += 16 )
|
||||
{
|
||||
__m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
|
||||
__m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
|
||||
|
||||
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
|
||||
}
|
||||
|
||||
for( ; j <= n - 4; j += 4 )
|
||||
{
|
||||
__m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
|
||||
__m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
|
||||
|
||||
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
|
||||
}
|
||||
d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
|
||||
#elif CV_NEON
|
||||
uint32x4_t v_sum = vdupq_n_u32(0.0f);
|
||||
for ( ; j <= n - 16; j += 16)
|
||||
{
|
||||
uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
|
||||
uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
|
||||
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
|
||||
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
|
||||
}
|
||||
|
||||
uint CV_DECL_ALIGNED(16) buf[4];
|
||||
vst1q_u32(buf, v_sum);
|
||||
d = buf[0] + buf[1] + buf[2] + buf[3];
|
||||
#endif
|
||||
{
|
||||
for( ; j <= n - 4; j += 4 )
|
||||
{
|
||||
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
|
||||
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
|
||||
}
|
||||
}
|
||||
for( ; j < n; j++ )
|
||||
d += std::abs(a[j] - b[j]);
|
||||
return d;
|
||||
}
|
||||
|
||||
}} //cv::hal
|
||||
|
||||
Reference in New Issue
Block a user