integrated patch with some SSE2/SSE4.2 optimizations from Grigory Frolov
This commit is contained in:
parent
54d68da8e7
commit
b782d8bb53
@ -120,11 +120,26 @@ CV_INLINE IppiSize ippiSize(int width, int height)
|
||||
# else
|
||||
# define CV_SSSE3 0
|
||||
# endif
|
||||
# if defined __SSE4_1__ || _MSC_VER >= 1600
|
||||
# include <smmintrin.h>
|
||||
# define CV_SSE4_1 1
|
||||
# endif
|
||||
# if defined __SSE4_2__ || _MSC_VER >= 1600
|
||||
# include <nmmintrin.h>
|
||||
# define CV_SSE4_2 1
|
||||
# endif
|
||||
# if defined __AVX__ || _MSC_VER >= 1600
|
||||
# include <immintrin.h>
|
||||
# define CV_AVX 1
|
||||
# endif
|
||||
# else
|
||||
# define CV_SSE 0
|
||||
# define CV_SSE2 0
|
||||
# define CV_SSE3 0
|
||||
# define CV_SSSE3 0
|
||||
# define CV_SSE4_1 0
|
||||
# define CV_SSE4_2 0
|
||||
# define CV_AVX 0
|
||||
# endif
|
||||
|
||||
#if defined ANDROID && defined __ARM_NEON__
|
||||
|
@ -657,6 +657,62 @@ cvtScale_<short, short, float>( const short* src, size_t sstep,
|
||||
}
|
||||
}
|
||||
|
||||
template<> void
|
||||
cvtScale_<short, int, float>( const short* src, size_t sstep,
|
||||
int* dst, size_t dstep, Size size,
|
||||
float scale, float shift )
|
||||
{
|
||||
sstep /= sizeof(src[0]);
|
||||
dstep /= sizeof(dst[0]);
|
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep )
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
if(USE_SSE2)//~5X
|
||||
{
|
||||
__m128 scale128 = _mm_set1_ps (scale);
|
||||
__m128 shift128 = _mm_set1_ps (shift);
|
||||
for(; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
__m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
|
||||
__m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
|
||||
__m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
|
||||
__m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
|
||||
rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
|
||||
rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
|
||||
r0 = _mm_cvtps_epi32(rf0);
|
||||
r1 = _mm_cvtps_epi32(rf1);
|
||||
|
||||
_mm_storeu_si128((__m128i*)(dst + x), r0);
|
||||
_mm_storeu_si128((__m128i*)(dst + x + 4), r1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
//We will wait Haswell
|
||||
/*
|
||||
#if CV_AVX
|
||||
if(USE_AVX)//2X - bad variant
|
||||
{
|
||||
////TODO:AVX implementation (optimization?) required
|
||||
__m256 scale256 = _mm256_set1_ps (scale);
|
||||
__m256 shift256 = _mm256_set1_ps (shift);
|
||||
for(; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
__m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
|
||||
__m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
|
||||
__m256i res = _mm256_cvtps_epi32(r0);
|
||||
_mm256_storeu_si256 ((__m256i*)(dst+x), res);
|
||||
}
|
||||
}
|
||||
#endif*/
|
||||
|
||||
for(; x < size.width; x++ )
|
||||
dst[x] = saturate_cast<int>(src[x]*scale + shift);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename DT> static void
|
||||
cvt_( const T* src, size_t sstep,
|
||||
|
@ -78,6 +78,66 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
|
||||
}
|
||||
}
|
||||
|
||||
template<> static void
|
||||
copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
|
||||
{
|
||||
for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
|
||||
{
|
||||
const uchar* src = (const uchar*)_src;
|
||||
uchar* dst = (uchar*)_dst;
|
||||
int x = 0;
|
||||
#if CV_SSE4_2
|
||||
if(USE_SSE4_2)//
|
||||
{
|
||||
__m128i zero = _mm_setzero_si128 ();
|
||||
|
||||
for( ; x <= size.width - 16; x += 16 )
|
||||
{
|
||||
const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
|
||||
__m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
|
||||
__m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
|
||||
__m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
|
||||
rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
|
||||
_mm_storeu_si128((__m128i*)(dst + x), rDst);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
if( mask[x] )
|
||||
dst[x] = src[x];
|
||||
}
|
||||
}
|
||||
|
||||
template<> static void
|
||||
copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
|
||||
{
|
||||
for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
|
||||
{
|
||||
const ushort* src = (const ushort*)_src;
|
||||
ushort* dst = (ushort*)_dst;
|
||||
int x = 0;
|
||||
#if CV_SSE4_2
|
||||
if(USE_SSE4_2)//
|
||||
{
|
||||
__m128i zero = _mm_setzero_si128 ();
|
||||
for( ; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x));
|
||||
__m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
|
||||
_mask = _mm_unpacklo_epi8(_mask, _mask);
|
||||
__m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
|
||||
__m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
|
||||
rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
|
||||
_mm_storeu_si128((__m128i*)(dst + x), rDst);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
if( mask[x] )
|
||||
dst[x] = src[x];
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
copyMaskGeneric(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size, void* _esz)
|
||||
{
|
||||
|
@ -1010,6 +1010,25 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
|
||||
if( type == CV_32FC1 )
|
||||
{
|
||||
double d = det2(Sf);
|
||||
#if CV_SSE4_2
|
||||
if(USE_SSE4_2)
|
||||
{
|
||||
__m128 zero = _mm_setzero_ps();
|
||||
__m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
|
||||
__m128 t1 = _mm_loadh_pi(zero,(const __m64*)((const float*)(srcdata+srcstep))); //t1 = sf(1,0) sf(1,1)
|
||||
__m128 s0 = _mm_blend_ps(t0,t1,12);
|
||||
d = 1./d;
|
||||
result = true;
|
||||
__m128 det =_mm_set1_ps((float)d);
|
||||
s0 = _mm_mul_ps(s0, det);
|
||||
const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
|
||||
__m128 pattern = _mm_load_ps((const float*)inv);
|
||||
s0 = _mm_xor_ps(s0, pattern);//==-1*s0
|
||||
s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
|
||||
_mm_storel_pi((__m64*)dstdata, s0);
|
||||
_mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
|
||||
}
|
||||
#else
|
||||
if( d != 0. )
|
||||
{
|
||||
double t0, t1;
|
||||
@ -1024,10 +1043,34 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
|
||||
Df(0,1) = (float)t0;
|
||||
Df(1,0) = (float)t1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
double d = det2(Sd);
|
||||
#if CV_SSE2
|
||||
if(USE_SSE2)
|
||||
{
|
||||
__m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
|
||||
__m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
|
||||
__m128d sm = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(1,0)); //sm = sf(0,0) sf(1,1) - main diagonal
|
||||
__m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //sm = sf(0,1) sf(1,0) - secondary diagonal
|
||||
result = true;
|
||||
d = 1./d;
|
||||
__m128d det = _mm_load1_pd((const double*)&d);
|
||||
sm = _mm_mul_pd(sm, det);
|
||||
//__m128d pattern = _mm_set1_pd(-1.);
|
||||
static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
|
||||
__m128d pattern = _mm_load1_pd((double*)inv);
|
||||
ss = _mm_mul_pd(ss, det);
|
||||
ss = _mm_xor_pd(ss, pattern);//==-1*ss
|
||||
//ss = _mm_mul_pd(ss,pattern);
|
||||
s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
|
||||
s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
|
||||
_mm_store_pd((double*)dstdata, s0);
|
||||
_mm_store_pd((double*)(dstdata+dststep), s1);
|
||||
}
|
||||
#else
|
||||
if( d != 0. )
|
||||
{
|
||||
double t0, t1;
|
||||
@ -1042,6 +1085,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
|
||||
Dd(0,1) = t0;
|
||||
Dd(1,0) = t1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if( n == 3 )
|
||||
@ -1148,6 +1192,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************************\
|
||||
* Solving a linear system *
|
||||
\****************************************************************************************/
|
||||
|
@ -170,6 +170,8 @@ struct NoVec
|
||||
};
|
||||
|
||||
extern volatile bool USE_SSE2;
|
||||
extern volatile bool USE_SSE4_2;
|
||||
extern volatile bool USE_AVX;
|
||||
|
||||
enum { BLOCK_SIZE = 1024 };
|
||||
|
||||
|
@ -221,6 +221,36 @@ static int countNonZero_(const T* src, int len )
|
||||
return nz;
|
||||
}
|
||||
|
||||
template <>
|
||||
int countNonZero_ <uchar> (const uchar* src, int len)
|
||||
{
|
||||
int i=0, nz = 0;
|
||||
#if CV_SSE4_2
|
||||
if(USE_SSE4_2)//5x-6x
|
||||
{
|
||||
__m128i pattern = _mm_setzero_si128 ();
|
||||
__m128i inv = _mm_set1_epi8((char)1);
|
||||
__int64 CV_DECL_ALIGNED(16) buf[2];
|
||||
for (; i<=len-16; i+=16)
|
||||
{
|
||||
__m128i r0 = _mm_lddqu_si128((const __m128i*)(src+i));
|
||||
__m128i res = _mm_cmpeq_epi8(r0, pattern);
|
||||
res = _mm_add_epi8(res, inv);//11111111+1=00000000, 00000000+1=00000001
|
||||
_mm_store_si128 ((__m128i*)buf, res);
|
||||
|
||||
__int64 countLow = _mm_popcnt_u64(buf[0]);
|
||||
nz += countLow;
|
||||
|
||||
__int64 countHigh = _mm_popcnt_u64(buf[1]);
|
||||
nz +=countHigh;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for( ; i < len; i++ )
|
||||
nz += src[i] != 0;
|
||||
return nz;
|
||||
}
|
||||
|
||||
static int countNonZero8u( const uchar* src, int len )
|
||||
{ return countNonZero_(src, len); }
|
||||
|
||||
|
@ -205,6 +205,8 @@ IPPInitializer ippInitializer;
|
||||
#endif
|
||||
|
||||
volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
|
||||
volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
|
||||
volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
|
||||
|
||||
void setUseOptimized( bool flag )
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user