integrated patch with some SSE2/SSE4.2 optimizations from Grigory Frolov

This commit is contained in:
Vadim Pisarevsky 2012-07-24 17:24:31 +04:00
parent 54d68da8e7
commit b782d8bb53
7 changed files with 220 additions and 10 deletions

View File

@ -120,11 +120,26 @@ CV_INLINE IppiSize ippiSize(int width, int height)
# else
# define CV_SSSE3 0
# endif
# if defined __SSE4_1__ || _MSC_VER >= 1600
# include <smmintrin.h>
# define CV_SSE4_1 1
# endif
# if defined __SSE4_2__ || _MSC_VER >= 1600
# include <nmmintrin.h>
# define CV_SSE4_2 1
# endif
# if defined __AVX__ || _MSC_VER >= 1600
# include <immintrin.h>
# define CV_AVX 1
# endif
# else
# define CV_SSE 0
# define CV_SSE2 0
# define CV_SSE3 0
# define CV_SSSE3 0
# define CV_SSE4_1 0
# define CV_SSE4_2 0
# define CV_AVX 0
# endif
#if defined ANDROID && defined __ARM_NEON__

View File

@ -657,6 +657,62 @@ cvtScale_<short, short, float>( const short* src, size_t sstep,
}
}
template<> void
cvtScale_<short, int, float>( const short* src, size_t sstep,
int* dst, size_t dstep, Size size,
float scale, float shift )
{
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
#if CV_SSE2
if(USE_SSE2)//~5X
{
__m128 scale128 = _mm_set1_ps (scale);
__m128 shift128 = _mm_set1_ps (shift);
for(; x <= size.width - 8; x += 8 )
{
__m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
__m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
__m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
__m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
r0 = _mm_cvtps_epi32(rf0);
r1 = _mm_cvtps_epi32(rf1);
_mm_storeu_si128((__m128i*)(dst + x), r0);
_mm_storeu_si128((__m128i*)(dst + x + 4), r1);
}
}
#endif
//We will wait Haswell
/*
#if CV_AVX
if(USE_AVX)//2X - bad variant
{
////TODO:AVX implementation (optimization?) required
__m256 scale256 = _mm256_set1_ps (scale);
__m256 shift256 = _mm256_set1_ps (shift);
for(; x <= size.width - 8; x += 8 )
{
__m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
__m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
__m256i res = _mm256_cvtps_epi32(r0);
_mm256_storeu_si256 ((__m256i*)(dst+x), res);
}
}
#endif*/
for(; x < size.width; x++ )
dst[x] = saturate_cast<int>(src[x]*scale + shift);
}
}
template<typename T, typename DT> static void
cvt_( const T* src, size_t sstep,

View File

@ -78,6 +78,66 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
}
}
template<> static void
copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
{
const uchar* src = (const uchar*)_src;
uchar* dst = (uchar*)_dst;
int x = 0;
#if CV_SSE4_2
if(USE_SSE4_2)//
{
__m128i zero = _mm_setzero_si128 ();
for( ; x <= size.width - 16; x += 16 )
{
const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
__m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
__m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
__m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
_mm_storeu_si128((__m128i*)(dst + x), rDst);
}
}
#endif
for( ; x < size.width; x++ )
if( mask[x] )
dst[x] = src[x];
}
}
template<> static void
copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
{
const ushort* src = (const ushort*)_src;
ushort* dst = (ushort*)_dst;
int x = 0;
#if CV_SSE4_2
if(USE_SSE4_2)//
{
__m128i zero = _mm_setzero_si128 ();
for( ; x <= size.width - 8; x += 8 )
{
const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x));
__m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
_mask = _mm_unpacklo_epi8(_mask, _mask);
__m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
__m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
_mm_storeu_si128((__m128i*)(dst + x), rDst);
}
}
#endif
for( ; x < size.width; x++ )
if( mask[x] )
dst[x] = src[x];
}
}
static void
copyMaskGeneric(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size, void* _esz)
{

View File

@ -1010,6 +1010,25 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
if( type == CV_32FC1 )
{
double d = det2(Sf);
#if CV_SSE4_2
if(USE_SSE4_2)
{
__m128 zero = _mm_setzero_ps();
__m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
__m128 t1 = _mm_loadh_pi(zero,(const __m64*)((const float*)(srcdata+srcstep))); //t1 = sf(1,0) sf(1,1)
__m128 s0 = _mm_blend_ps(t0,t1,12);
d = 1./d;
result = true;
__m128 det =_mm_set1_ps((float)d);
s0 = _mm_mul_ps(s0, det);
const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
__m128 pattern = _mm_load_ps((const float*)inv);
s0 = _mm_xor_ps(s0, pattern);//==-1*s0
s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
_mm_storel_pi((__m64*)dstdata, s0);
_mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
}
#else
if( d != 0. )
{
double t0, t1;
@ -1024,10 +1043,34 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
Df(0,1) = (float)t0;
Df(1,0) = (float)t1;
}
#endif
}
else
{
double d = det2(Sd);
#if CV_SSE2
if(USE_SSE2)
{
__m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
__m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
__m128d sm = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(1,0)); //sm = sf(0,0) sf(1,1) - main diagonal
__m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //sm = sf(0,1) sf(1,0) - secondary diagonal
result = true;
d = 1./d;
__m128d det = _mm_load1_pd((const double*)&d);
sm = _mm_mul_pd(sm, det);
//__m128d pattern = _mm_set1_pd(-1.);
static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
__m128d pattern = _mm_load1_pd((double*)inv);
ss = _mm_mul_pd(ss, det);
ss = _mm_xor_pd(ss, pattern);//==-1*ss
//ss = _mm_mul_pd(ss,pattern);
s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
_mm_store_pd((double*)dstdata, s0);
_mm_store_pd((double*)(dstdata+dststep), s1);
}
#else
if( d != 0. )
{
double t0, t1;
@ -1042,6 +1085,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
Dd(0,1) = t0;
Dd(1,0) = t1;
}
#endif
}
}
else if( n == 3 )
@ -1148,6 +1192,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
return result;
}
/****************************************************************************************\
* Solving a linear system *
\****************************************************************************************/

View File

@ -170,6 +170,8 @@ struct NoVec
};
extern volatile bool USE_SSE2;
extern volatile bool USE_SSE4_2;
extern volatile bool USE_AVX;
enum { BLOCK_SIZE = 1024 };

View File

@ -221,6 +221,36 @@ static int countNonZero_(const T* src, int len )
return nz;
}
template <>
int countNonZero_ <uchar> (const uchar* src, int len)
{
int i=0, nz = 0;
#if CV_SSE4_2
if(USE_SSE4_2)//5x-6x
{
__m128i pattern = _mm_setzero_si128 ();
__m128i inv = _mm_set1_epi8((char)1);
__int64 CV_DECL_ALIGNED(16) buf[2];
for (; i<=len-16; i+=16)
{
__m128i r0 = _mm_lddqu_si128((const __m128i*)(src+i));
__m128i res = _mm_cmpeq_epi8(r0, pattern);
res = _mm_add_epi8(res, inv);//11111111+1=00000000, 00000000+1=00000001
_mm_store_si128 ((__m128i*)buf, res);
__int64 countLow = _mm_popcnt_u64(buf[0]);
nz += countLow;
__int64 countHigh = _mm_popcnt_u64(buf[1]);
nz +=countHigh;
}
}
#endif
for( ; i < len; i++ )
nz += src[i] != 0;
return nz;
}
static int countNonZero8u( const uchar* src, int len )
{ return countNonZero_(src, len); }

View File

@ -205,6 +205,8 @@ IPPInitializer ippInitializer;
#endif
volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
void setUseOptimized( bool flag )
{