This commit is contained in:
Ilya Lavrenov
2015-01-12 10:59:28 +03:00
parent 28833421ae
commit 8d48632ebe
7 changed files with 105 additions and 39 deletions

View File

@@ -2294,26 +2294,44 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
{
int x = 0;
#if CV_SSE2
if(USE_SSE2)//~5X
#if CV_AVX2
if (USE_AVX2)
{
__m256 scale256 = _mm256_set1_ps (scale);
__m256 shift256 = _mm256_set1_ps (shift);
__m256i zero = _mm256_setzero_si256();
for ( ; x <= size.width - 16; x += 16)
{
__m128 scale128 = _mm_set1_ps (scale);
__m128 shift128 = _mm_set1_ps (shift);
for(; x <= size.width - 8; x += 8 )
{
__m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
__m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
__m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
__m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
r0 = _mm_cvtps_epi32(rf0);
r1 = _mm_cvtps_epi32(rf1);
_mm_storeu_si128((__m128i*)(dst + x), r0);
_mm_storeu_si128((__m128i*)(dst + x + 4), r1);
}
__m256i v_src = _mm256_loadu_si256((__m256i const *)(src + x));
__m256i v_src_lo = _mm256_unpacklo_epi16(v_src, zero);
__m256i v_src_hi = _mm256_unpackhi_epi16(v_src, zero);
__m256 v_dst0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_lo), scale256), shift256);
__m256 v_dst1 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_hi), scale256), shift256);
_mm256_storeu_si256 ((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
_mm256_storeu_si256 ((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
}
}
#endif
#if CV_SSE2
if (USE_SSE2)//~5X
{
__m128 scale128 = _mm_set1_ps (scale);
__m128 shift128 = _mm_set1_ps (shift);
for(; x <= size.width - 8; x += 8 )
{
__m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
__m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
__m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
__m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
r0 = _mm_cvtps_epi32(rf0);
r1 = _mm_cvtps_epi32(rf1);
_mm_storeu_si128((__m128i*)(dst + x), r0);
_mm_storeu_si128((__m128i*)(dst + x + 4), r1);
}
}
#elif CV_NEON
float32x4_t v_shift = vdupq_n_f32(shift);
for(; x <= size.width - 8; x += 8 )
@@ -2330,24 +2348,6 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
}
#endif
//We will wait Haswell
/*
#if CV_AVX
if(USE_AVX)//2X - bad variant
{
////TODO:AVX implementation (optimization?) required
__m256 scale256 = _mm256_set1_ps (scale);
__m256 shift256 = _mm256_set1_ps (shift);
for(; x <= size.width - 8; x += 8 )
{
__m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
__m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
__m256i res = _mm256_cvtps_epi32(r0);
_mm256_storeu_si256 ((__m256i*)(dst+x), res);
}
}
#endif*/
for(; x < size.width; x++ )
dst[x] = saturate_cast<int>(src[x]*scale + shift);
}

View File

@@ -192,6 +192,7 @@ struct NoVec
extern volatile bool USE_SSE2;
extern volatile bool USE_SSE4_2;
extern volatile bool USE_AVX;
extern volatile bool USE_AVX2;
enum { BLOCK_SIZE = 1024 };

View File

@@ -82,6 +82,22 @@
pop ebx
}
}
static void __cpuidex(int* cpuid_data, int, int)
{
__asm
{
push edi
mov edi, cpuid_data
mov eax, 7
mov ecx, 0
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
pop edi
}
}
#endif
#endif
@@ -203,7 +219,7 @@ struct HWFeatures
enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
HWFeatures(void)
{
{
memset( have, 0, sizeof(have) );
x86_family = 0;
}
@@ -251,6 +267,40 @@ struct HWFeatures
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
// make the second call to the cpuid command in order to get
// information about extended features like AVX2
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuidex(cpuid_data, 7, 0);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm __volatile__
(
"movl $7, %%eax\n\t"
"movl $0, %%ecx\n\t"
"cpuid\n\t"
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
:
: "cc"
);
#else
asm volatile
(
"pushl %%eax\n\t"
"pushl %%edx\n\t"
"movl $7,%%eax\n\t"
"movl $0,%%ecx\n\t"
"cpuid\n\t"
"popl %%edx\n\t"
"popl %%eax\n\t"
: "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
:
: "cc"
);
#endif
#endif
f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
}
return f;
@@ -290,6 +340,7 @@ IPPInitializer ippInitializer;
volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
void setUseOptimized( bool flag )
{