avx2

2015-01-12 10:59:28 +03:00
parent 28833421ae
commit 8d48632ebe
7 changed files with 105 additions and 39 deletions
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -2294,26 +2294,44 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
    {
        int x = 0;

-         #if CV_SSE2
-            if(USE_SSE2)//~5X
+        #if CV_AVX2
+        if (USE_AVX2)
+        {
+            __m256 scale256 = _mm256_set1_ps (scale);
+            __m256 shift256 = _mm256_set1_ps (shift);
+            // widen with _mm256_cvtepi16_epi32: unpacking against zero would drop the sign of negative shorts and, being per-128-bit-lane, reorder elements
+            for ( ; x <= size.width - 16; x += 16)
            {
-                __m128 scale128 = _mm_set1_ps (scale);
-                __m128 shift128 = _mm_set1_ps (shift);
-                for(; x <= size.width - 8; x += 8 )
-                {
-                    __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
-                    __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
-                    __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
-                    __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
-                    rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
-                    rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
-                    r0 = _mm_cvtps_epi32(rf0);
-                    r1 = _mm_cvtps_epi32(rf1);
-
-                    _mm_storeu_si128((__m128i*)(dst + x), r0);
-                    _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
-                }
+                __m256i v_src_lo = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)(src + x)));
+                __m256i v_src_hi = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)(src + x + 8)));
+                // dst = src * scale + shift, evaluated in float and converted back to 32-bit integers
+                __m256 v_dst0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_lo), scale256), shift256);
+                __m256 v_dst1 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (v_src_hi), scale256), shift256);
+                _mm256_storeu_si256 ((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
+                _mm256_storeu_si256 ((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
            }
+        }
+        #endif
+        #if CV_SSE2
+        if (USE_SSE2)//~5X
+        {
+            __m128 scale128 = _mm_set1_ps (scale);
+            __m128 shift128 = _mm_set1_ps (shift);
+            for(; x <= size.width - 8; x += 8 )
+            {
+                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
+                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
+                __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
+                __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
+                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
+                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
+                r0 = _mm_cvtps_epi32(rf0);
+                r1 = _mm_cvtps_epi32(rf1);
+
+                _mm_storeu_si128((__m128i*)(dst + x), r0);
+                _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
+            }
+        }
        #elif CV_NEON
        float32x4_t v_shift = vdupq_n_f32(shift);
        for(; x <= size.width - 8; x += 8 )
@@ -2330,24 +2348,6 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
        }
        #endif

-        //We will wait Haswell
-        /*
-        #if CV_AVX
-            if(USE_AVX)//2X - bad variant
-            {
-                ////TODO:AVX implementation (optimization?) required
-                __m256 scale256 = _mm256_set1_ps (scale);
-                __m256 shift256 = _mm256_set1_ps (shift);
-                for(; x <= size.width - 8; x += 8 )
-                {
-                    __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
-                    __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
-                    __m256i res = _mm256_cvtps_epi32(r0);
-                    _mm256_storeu_si256 ((__m256i*)(dst+x), res);
-                }
-            }
-        #endif*/
-
        for(; x < size.width; x++ )
            dst[x] = saturate_cast<int>(src[x]*scale + shift);
    }
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -192,6 +192,7 @@ struct NoVec
 extern volatile bool USE_SSE2;
 extern volatile bool USE_SSE4_2;
 extern volatile bool USE_AVX;
+extern volatile bool USE_AVX2;

 enum { BLOCK_SIZE = 1024 };

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -82,6 +82,22 @@
            pop ebx
        }
    }
+    static void __cpuidex(int* cpuid_data, int function_id, int subfunction_id) // CPUID for leaf function_id / sub-leaf subfunction_id
+    {
+        __asm
+        {
+            push edi
+            mov edi, cpuid_data
+            mov eax, function_id        // requested leaf goes in EAX
+            mov ecx, subfunction_id     // requested sub-leaf goes in ECX
+            cpuid
+            mov [edi], eax              // cpuid_data[0] = EAX
+            mov [edi + 4], ebx          // cpuid_data[1] = EBX
+            mov [edi + 8], ecx          // cpuid_data[2] = ECX
+            mov [edi + 12], edx         // cpuid_data[3] = EDX
+            pop edi
+        }
+    }
  #endif
 #endif

@@ -203,7 +219,7 @@ struct HWFeatures
    enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };

    HWFeatures(void)
-     {
+    {
        memset( have, 0, sizeof(have) );
        x86_family = 0;
    }
@@ -251,6 +267,40 @@ struct HWFeatures
            f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
            f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
            f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
+
+            // Second CPUID query: leaf 7 / sub-leaf 0, where EBX bit 5 reports AVX2. AVX2 is accepted only together
+            // with AVX (checked above, incl. OSXSAVE). NOTE(review): the max basic leaf (CPUID.0:EAX) is not verified.
+        #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+            __cpuidex(cpuid_data, 7, 0);
+        #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+            #ifdef __x86_64__
+            asm __volatile__
+            (
+             "movl $7, %%eax\n\t"
+             "movl $0, %%ecx\n\t"
+             "cpuid\n\t"
+             :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+             :
+             : "cc"
+            );
+            #else
+            asm volatile
+            (
+             "pushl %%eax\n\t"
+             "pushl %%edx\n\t"
+             "movl $7,%%eax\n\t"
+             "movl $0,%%ecx\n\t"
+             "cpuid\n\t"
+             "popl %%edx\n\t"
+             "popl %%eax\n\t"
+             : "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
+             :
+             : "cc"
+            );
+            #endif
+        #endif
+            f.have[CV_CPU_AVX2]   = f.have[CV_CPU_AVX] && (cpuid_data[1] & (1<<5)) != 0;
+
        }

        return f;
@@ -290,6 +340,7 @@ IPPInitializer ippInitializer;
 volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
 volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
 volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
+volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];

 void setUseOptimized( bool flag )
 {