@@ -54,6 +54,10 @@
static IppStatus sts = ippInit();
#endif

#ifdef _MSC_VER
# pragma warning(disable:4752) // Disable warning for mixing SSE and AVX
#endif

namespace cv
{
@@ -451,13 +455,8 @@ struct HResizeNoVec

#if CV_SSE2

struct VResizeLinearVec_32s8u
static int VResizeLinearVec_32s8u_sse2(const uchar** _src, uchar* dst, const uchar* _beta, int width)
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

    const int** src = (const int**)_src;
    const short* beta = (const short*)_beta;
    const int *S0 = src[0], *S1 = src[1];
@@ -530,17 +529,111 @@ struct VResizeLinearVec_32s8u
    }

    return x;
}

#if CV_AVX2
int VResizeLinearVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width )
{
    const int** src = (const int**)_src;
    const short* beta = (const short*)_beta;
    const int *S0 = src[0], *S1 = src[1];
    int x = 0;
    __m256i b0 = _mm256_set1_epi16(beta[0]), b1 = _mm256_set1_epi16(beta[1]);
    __m256i delta = _mm256_set1_epi16(2);
    const int index[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
    // unaligned load: a local int[8] is not guaranteed to be 32-byte aligned
    __m256i shuffle = _mm256_loadu_si256((const __m256i*)index);

    if( (((size_t)S0|(size_t)S1)&31) == 0 )
        for( ; x <= width - 32; x += 32 )
        {
            __m256i x0, x1, x2, y0, y1, y2;
            x0 = _mm256_load_si256((const __m256i*)(S0 + x));
            x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8));
            y0 = _mm256_load_si256((const __m256i*)(S1 + x));
            y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8));
            x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4));
            y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4));

            x1 = _mm256_load_si256((const __m256i*)(S0 + x + 16));
            x2 = _mm256_load_si256((const __m256i*)(S0 + x + 24));
            y1 = _mm256_load_si256((const __m256i*)(S1 + x + 16));
            y2 = _mm256_load_si256((const __m256i*)(S1 + x + 24));
            x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4));
            y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4));

            x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1));
            x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1));

            x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2);
            x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2);
            x0 = _mm256_packus_epi16(x0, x1);
            x0 = _mm256_permutevar8x32_epi32(x0, shuffle);
            _mm256_storeu_si256( (__m256i*)(dst + x), x0);
        }
    else
        for( ; x <= width - 32; x += 32 )
        {
            __m256i x0, x1, x2, y0, y1, y2;
            x0 = _mm256_loadu_si256((const __m256i*)(S0 + x));
            x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8));
            y0 = _mm256_loadu_si256((const __m256i*)(S1 + x));
            y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8));
            x0 = _mm256_packs_epi32(_mm256_srai_epi32(x0, 4), _mm256_srai_epi32(x1, 4));
            y0 = _mm256_packs_epi32(_mm256_srai_epi32(y0, 4), _mm256_srai_epi32(y1, 4));

            x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 16));
            x2 = _mm256_loadu_si256((const __m256i*)(S0 + x + 24));
            y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 16));
            y2 = _mm256_loadu_si256((const __m256i*)(S1 + x + 24));
            x1 = _mm256_packs_epi32(_mm256_srai_epi32(x1, 4), _mm256_srai_epi32(x2, 4));
            y1 = _mm256_packs_epi32(_mm256_srai_epi32(y1, 4), _mm256_srai_epi32(y2, 4));

            x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1));
            x1 = _mm256_adds_epi16(_mm256_mulhi_epi16(x1, b0), _mm256_mulhi_epi16(y1, b1));

            x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2);
            x1 = _mm256_srai_epi16(_mm256_adds_epi16(x1, delta), 2);
            x0 = _mm256_packus_epi16(x0, x1);
            x0 = _mm256_permutevar8x32_epi32(x0, shuffle);
            _mm256_storeu_si256( (__m256i*)(dst + x), x0);
        }

    for( ; x < width - 8; x += 8 )
    {
        __m256i x0, y0;
        x0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S0 + x)), 4);
        y0 = _mm256_srai_epi32(_mm256_loadu_si256((const __m256i*)(S1 + x)), 4);
        x0 = _mm256_packs_epi32(x0, x0);
        y0 = _mm256_packs_epi32(y0, y0);
        x0 = _mm256_adds_epi16(_mm256_mulhi_epi16(x0, b0), _mm256_mulhi_epi16(y0, b1));
        x0 = _mm256_srai_epi16(_mm256_adds_epi16(x0, delta), 2);
        x0 = _mm256_packus_epi16(x0, x0);
        *(int*)(dst + x) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 0));
        *(int*)(dst + x + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(x0, 1));
    }

    return x;
}
#endif

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
#if CV_AVX2
        if( checkHardwareSupport(CV_CPU_AVX2) )
            return VResizeLinearVec_32s8u_avx2(_src, dst, _beta, width);
#endif
        if( checkHardwareSupport(CV_CPU_SSE2) )
            return VResizeLinearVec_32s8u_sse2(_src, dst, _beta, width);

        return 0;
    }
};
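
// Reference for the 8-bit linear kernels above (hypothetical helper, not part of the
// original patch): S0/S1 and beta each carry INTER_RESIZE_COEF_BITS (11) fractional
// bits, so the exact result is (S0*b0 + S1*b1) >> 22 with rounding. The SIMD code gets
// there as ((S >> 4) * b) >> 16 per term (_mm*_mulhi_epi16), then (sum + 2) >> 2.
static inline uchar vresize_linear_8u_ref(int s0, int s1, short b0, short b1)
{
    int t = (((s0 >> 4) * b0) >> 16) + (((s1 >> 4) * b1) >> 16);
    int v = (t + 2) >> 2;                              // delta = 2 rounds the final shift
    return (uchar)(v < 0 ? 0 : v > 255 ? 255 : v);     // what _mm*_packus_epi16 does
}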

template<int shiftval> struct VResizeLinearVec_32f16
template<int shiftval>
int VResizeLinearVec_32f16_sse2(const uchar** _src, uchar* _dst, const uchar* _beta, int width )
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1];
@@ -626,19 +719,121 @@ template<int shiftval> struct VResizeLinearVec_32f16
    }

    return x;
}

#if CV_AVX2
template<int shiftval>
int VResizeLinearVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width )
{
    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1];
    ushort* dst = (ushort*)_dst;
    int x = 0;

    __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]);
    __m256i preshift = _mm256_set1_epi32(shiftval);
    __m256i postshift = _mm256_set1_epi16((short)shiftval);

    if( (((size_t)S0|(size_t)S1)&31) == 0 )
        for( ; x <= width - 32; x += 32 )
        {
            __m256 x0, x1, y0, y1;
            __m256i t0, t1, t2;
            x0 = _mm256_load_ps(S0 + x);
            x1 = _mm256_load_ps(S0 + x + 8);
            y0 = _mm256_load_ps(S1 + x);
            y1 = _mm256_load_ps(S1 + x + 8);

            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
            t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
            t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift);
            t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift);

            x0 = _mm256_load_ps(S0 + x + 16);
            x1 = _mm256_load_ps(S0 + x + 24);
            y0 = _mm256_load_ps(S1 + x + 16);
            y1 = _mm256_load_ps(S1 + x + 24);

            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
            t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
            t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift);
            t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift);

            // _mm256_packs_epi32 packs within each 128-bit lane, so restore the
            // natural element order before storing 16 contiguous ushorts
            t0 = _mm256_permute4x64_epi64(t0, 0xd8);
            t1 = _mm256_permute4x64_epi64(t1, 0xd8);
            _mm256_storeu_si256( (__m256i*)(dst + x), t0);
            _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1);
        }
    else
        for( ; x <= width - 32; x += 32 )
        {
            __m256 x0, x1, y0, y1;
            __m256i t0, t1, t2;
            x0 = _mm256_loadu_ps(S0 + x);
            x1 = _mm256_loadu_ps(S0 + x + 8);
            y0 = _mm256_loadu_ps(S1 + x);
            y1 = _mm256_loadu_ps(S1 + x + 8);

            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
            t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
            t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift);
            t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t2), postshift);

            x0 = _mm256_loadu_ps(S0 + x + 16);
            x1 = _mm256_loadu_ps(S0 + x + 24);
            y0 = _mm256_loadu_ps(S1 + x + 16);
            y1 = _mm256_loadu_ps(S1 + x + 24);

            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));
            t1 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
            t2 = _mm256_add_epi32(_mm256_cvtps_epi32(x1), preshift);
            t1 = _mm256_add_epi16(_mm256_packs_epi32(t1, t2), postshift);

            t0 = _mm256_permute4x64_epi64(t0, 0xd8); // same per-lane pack fix-up as above
            t1 = _mm256_permute4x64_epi64(t1, 0xd8);
            _mm256_storeu_si256( (__m256i*)(dst + x), t0);
            _mm256_storeu_si256( (__m256i*)(dst + x + 16), t1);
        }

    for( ; x < width - 8; x += 8 )
    {
        __m256 x0, y0;
        __m256i t0;
        x0 = _mm256_loadu_ps(S0 + x);
        y0 = _mm256_loadu_ps(S1 + x);

        x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
        t0 = _mm256_add_epi32(_mm256_cvtps_epi32(x0), preshift);
        t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t0), postshift);
        _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(t0, 0));
        _mm_storel_epi64( (__m128i*)(dst + x + 4), _mm256_extracti128_si256(t0, 1));
    }

    return x;
}
#endif

template<int shiftval> struct VResizeLinearVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
#if CV_AVX2
        if( checkHardwareSupport(CV_CPU_AVX2) )
            return VResizeLinearVec_32f16_avx2<shiftval>(_src, _dst, _beta, width);
#endif
        if( checkHardwareSupport(CV_CPU_SSE2) )
            return VResizeLinearVec_32f16_sse2<shiftval>(_src, _dst, _beta, width);

        return 0;
    }
};

typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;
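
// The shiftval trick shared by the 32f->16 kernels above and the cubic ones further
// below (a scalar sketch, not part of the original patch): for ushort output shiftval
// is SHRT_MIN, which biases each rounded value into signed 16-bit range before the
// signed pack and then un-biases it with a wrapping 16-bit add, i.e. an unsigned
// saturating pack built from _mm*_packs_epi32. For short output shiftval is 0 and
// both adds are no-ops.
static inline ushort pack_u16_via_s16_ref(int v)
{
    int biased = v + SHRT_MIN;                                  // "preshift"
    short s = (short)(biased < SHRT_MIN ? SHRT_MIN :
                      biased > SHRT_MAX ? SHRT_MAX : biased);   // what packs_epi32 does
    return (ushort)(s + SHRT_MIN);                              // "postshift" wraps back to 0..65535
}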

struct VResizeLinearVec_32f
static int VResizeLinearVec_32f_sse(const uchar** _src, uchar* _dst, const uchar* _beta, int width )
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1];
@@ -679,17 +874,72 @@ struct VResizeLinearVec_32f
    }

    return x;
}

#if CV_AVX
int VResizeLinearVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width )
{
    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1];
    float* dst = (float*)_dst;
    int x = 0;

    __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]);

    if( (((size_t)S0|(size_t)S1)&31) == 0 )
        for( ; x <= width - 16; x += 16 )
        {
            __m256 x0, x1, y0, y1;
            x0 = _mm256_load_ps(S0 + x);
            x1 = _mm256_load_ps(S0 + x + 8);
            y0 = _mm256_load_ps(S1 + x);
            y1 = _mm256_load_ps(S1 + x + 8);

            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));

            _mm256_storeu_ps( dst + x, x0);
            _mm256_storeu_ps( dst + x + 8, x1);
        }
    else
        for( ; x <= width - 16; x += 16 )
        {
            __m256 x0, x1, y0, y1;
            x0 = _mm256_loadu_ps(S0 + x);
            x1 = _mm256_loadu_ps(S0 + x + 8);
            y0 = _mm256_loadu_ps(S1 + x);
            y1 = _mm256_loadu_ps(S1 + x + 8);

            x0 = _mm256_add_ps(_mm256_mul_ps(x0, b0), _mm256_mul_ps(y0, b1));
            x1 = _mm256_add_ps(_mm256_mul_ps(x1, b0), _mm256_mul_ps(y1, b1));

            _mm256_storeu_ps( dst + x, x0);
            _mm256_storeu_ps( dst + x + 8, x1);
        }

    return x;
}
#endif

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
#if CV_AVX
        if( checkHardwareSupport(CV_CPU_AVX) )
            return VResizeLinearVec_32f_avx(_src, _dst, _beta, width);
#endif
        if( checkHardwareSupport(CV_CPU_SSE) )
            return VResizeLinearVec_32f_sse(_src, _dst, _beta, width);

        return 0;
    }
};

struct VResizeCubicVec_32s8u
static int VResizeCubicVec_32s8u_sse2(const uchar** _src, uchar* dst, const uchar* _beta, int width )
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

    const int** src = (const int**)_src;
    const short* beta = (const short*)_beta;
    const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
@@ -774,17 +1024,124 @@ struct VResizeCubicVec_32s8u
    }

    return x;
}

#if CV_AVX2
int VResizeCubicVec_32s8u_avx2(const uchar** _src, uchar* dst, const uchar* _beta, int width )
{
    const int** src = (const int**)_src;
    const short* beta = (const short*)_beta;
    const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
    int x = 0;
    float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
    __m256 b0 = _mm256_set1_ps(beta[0]*scale), b1 = _mm256_set1_ps(beta[1]*scale),
        b2 = _mm256_set1_ps(beta[2]*scale), b3 = _mm256_set1_ps(beta[3]*scale);
    const int shuffle = 0xd8; // 11 | 01 | 10 | 00

    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 )
        for( ; x <= width - 16; x += 16 )
        {
            __m256i x0, x1, y0, y1;
            __m256 s0, s1, f0, f1;
            x0 = _mm256_load_si256((const __m256i*)(S0 + x));
            x1 = _mm256_load_si256((const __m256i*)(S0 + x + 8));
            y0 = _mm256_load_si256((const __m256i*)(S1 + x));
            y1 = _mm256_load_si256((const __m256i*)(S1 + x + 8));

            s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0);
            s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0);
            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1);
            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1);
            s0 = _mm256_add_ps(s0, f0);
            s1 = _mm256_add_ps(s1, f1);

            x0 = _mm256_load_si256((const __m256i*)(S2 + x));
            x1 = _mm256_load_si256((const __m256i*)(S2 + x + 8));
            y0 = _mm256_load_si256((const __m256i*)(S3 + x));
            y1 = _mm256_load_si256((const __m256i*)(S3 + x + 8));

            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2);
            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2);
            s0 = _mm256_add_ps(s0, f0);
            s1 = _mm256_add_ps(s1, f1);
            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3);
            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3);
            s0 = _mm256_add_ps(s0, f0);
            s1 = _mm256_add_ps(s1, f1);

            x0 = _mm256_cvtps_epi32(s0);
            x1 = _mm256_cvtps_epi32(s1);

            x0 = _mm256_packs_epi32(x0, x1);
            x0 = _mm256_permute4x64_epi64(x0, shuffle);
            x0 = _mm256_packus_epi16(x0, x0);
            _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0));
            _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1));
        }
    else
        for( ; x <= width - 16; x += 16 )
        {
            __m256i x0, x1, y0, y1;
            __m256 s0, s1, f0, f1;
            x0 = _mm256_loadu_si256((const __m256i*)(S0 + x));
            x1 = _mm256_loadu_si256((const __m256i*)(S0 + x + 8));
            y0 = _mm256_loadu_si256((const __m256i*)(S1 + x));
            y1 = _mm256_loadu_si256((const __m256i*)(S1 + x + 8));

            s0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b0);
            s1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b0);
            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b1);
            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b1);
            s0 = _mm256_add_ps(s0, f0);
            s1 = _mm256_add_ps(s1, f1);

            x0 = _mm256_loadu_si256((const __m256i*)(S2 + x));
            x1 = _mm256_loadu_si256((const __m256i*)(S2 + x + 8));
            y0 = _mm256_loadu_si256((const __m256i*)(S3 + x));
            y1 = _mm256_loadu_si256((const __m256i*)(S3 + x + 8));

            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(x0), b2);
            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(x1), b2);
            s0 = _mm256_add_ps(s0, f0);
            s1 = _mm256_add_ps(s1, f1);
            f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(y0), b3);
            f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(y1), b3);
            s0 = _mm256_add_ps(s0, f0);
            s1 = _mm256_add_ps(s1, f1);

            x0 = _mm256_cvtps_epi32(s0);
            x1 = _mm256_cvtps_epi32(s1);

            x0 = _mm256_packs_epi32(x0, x1);
            x0 = _mm256_permute4x64_epi64(x0, shuffle);
            x0 = _mm256_packus_epi16(x0, x0);
            _mm_storel_epi64( (__m128i*)(dst + x), _mm256_extracti128_si256(x0, 0));
            _mm_storel_epi64( (__m128i*)(dst + x + 8), _mm256_extracti128_si256(x0, 1));
        }

    return x;
}
#endif

struct VResizeCubicVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
#if CV_AVX2
        if( checkHardwareSupport(CV_CPU_AVX2) )
            return VResizeCubicVec_32s8u_avx2(_src, dst, _beta, width);
#endif
        if( checkHardwareSupport(CV_CPU_SSE2) )
            return VResizeCubicVec_32s8u_sse2(_src, dst, _beta, width);

        return 0;
    }
};
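
// Reference for the 8-bit cubic kernels above (hypothetical helper, not part of the
// original patch): the int rows S0..S3 and the short weights beta[0..3] are both
// scaled by INTER_RESIZE_COEF_SCALE, so each weight is folded back to float with
// scale = 1/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE) before accumulating.
static inline uchar vresize_cubic_8u_ref(int s0, int s1, int s2, int s3,
                                         const short* beta, float scale)
{
    float v = s0*(beta[0]*scale) + s1*(beta[1]*scale)
            + s2*(beta[2]*scale) + s3*(beta[3]*scale);
    int iv = cvRound(v);                               // _mm*_cvtps_epi32 rounds to nearest
    return (uchar)(iv < 0 ? 0 : iv > 255 ? 255 : iv);  // packs/packus saturation
}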

template<int shiftval> struct VResizeCubicVec_32f16
template<int shiftval>
int VResizeCubicVec_32f16_sse2(const uchar** _src, uchar* _dst, const uchar* _beta, int width )
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
@@ -795,6 +1152,44 @@ template<int shiftval> struct VResizeCubicVec_32f16
    __m128i preshift = _mm_set1_epi32(shiftval);
    __m128i postshift = _mm_set1_epi16((short)shiftval);

    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            __m128i t0, t1;
            x0 = _mm_load_ps(S0 + x);
            x1 = _mm_load_ps(S0 + x + 4);
            y0 = _mm_load_ps(S1 + x);
            y1 = _mm_load_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_load_ps(S2 + x);
            x1 = _mm_load_ps(S2 + x + 4);
            y0 = _mm_load_ps(S3 + x);
            y1 = _mm_load_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);

            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
            _mm_storeu_si128( (__m128i*)(dst + x), t0);
        }
    else
        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
@@ -833,19 +1228,124 @@ template<int shiftval> struct VResizeCubicVec_32f16
        }

    return x;
}

#if CV_AVX2
template<int shiftval>
int VResizeCubicVec_32f16_avx2(const uchar** _src, uchar* _dst, const uchar* _beta, int width )
{
    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
    ushort* dst = (ushort*)_dst;
    int x = 0;
    __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]),
        b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]);
    __m256i preshift = _mm256_set1_epi32(shiftval);
    __m256i postshift = _mm256_set1_epi16((short)shiftval);
    const int shuffle = 0xd8; // 11 | 01 | 10 | 00

    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 )
        for( ; x <= width - 16; x += 16 )
        {
            __m256 x0, x1, y0, y1, s0, s1;
            __m256i t0, t1;
            x0 = _mm256_load_ps(S0 + x);
            x1 = _mm256_load_ps(S0 + x + 8);
            y0 = _mm256_load_ps(S1 + x);
            y1 = _mm256_load_ps(S1 + x + 8);

            s0 = _mm256_mul_ps(x0, b0);
            s1 = _mm256_mul_ps(x1, b0);
            y0 = _mm256_mul_ps(y0, b1);
            y1 = _mm256_mul_ps(y1, b1);
            s0 = _mm256_add_ps(s0, y0);
            s1 = _mm256_add_ps(s1, y1);

            x0 = _mm256_load_ps(S2 + x);
            x1 = _mm256_load_ps(S2 + x + 8);
            y0 = _mm256_load_ps(S3 + x);
            y1 = _mm256_load_ps(S3 + x + 8);

            x0 = _mm256_mul_ps(x0, b2);
            x1 = _mm256_mul_ps(x1, b2);
            y0 = _mm256_mul_ps(y0, b3);
            y1 = _mm256_mul_ps(y1, b3);
            s0 = _mm256_add_ps(s0, x0);
            s1 = _mm256_add_ps(s1, x1);
            s0 = _mm256_add_ps(s0, y0);
            s1 = _mm256_add_ps(s1, y1);

            t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift);
            t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift);

            t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift);
            t0 = _mm256_permute4x64_epi64(t0, shuffle);
            _mm256_storeu_si256( (__m256i*)(dst + x), t0);
        }
    else
        for( ; x <= width - 16; x += 16 )
        {
            __m256 x0, x1, y0, y1, s0, s1;
            __m256i t0, t1;
            x0 = _mm256_loadu_ps(S0 + x);
            x1 = _mm256_loadu_ps(S0 + x + 8);
            y0 = _mm256_loadu_ps(S1 + x);
            y1 = _mm256_loadu_ps(S1 + x + 8);

            s0 = _mm256_mul_ps(x0, b0);
            s1 = _mm256_mul_ps(x1, b0);
            y0 = _mm256_mul_ps(y0, b1);
            y1 = _mm256_mul_ps(y1, b1);
            s0 = _mm256_add_ps(s0, y0);
            s1 = _mm256_add_ps(s1, y1);

            x0 = _mm256_loadu_ps(S2 + x);
            x1 = _mm256_loadu_ps(S2 + x + 8);
            y0 = _mm256_loadu_ps(S3 + x);
            y1 = _mm256_loadu_ps(S3 + x + 8);

            x0 = _mm256_mul_ps(x0, b2);
            x1 = _mm256_mul_ps(x1, b2);
            y0 = _mm256_mul_ps(y0, b3);
            y1 = _mm256_mul_ps(y1, b3);
            s0 = _mm256_add_ps(s0, x0);
            s1 = _mm256_add_ps(s1, x1);
            s0 = _mm256_add_ps(s0, y0);
            s1 = _mm256_add_ps(s1, y1);

            t0 = _mm256_add_epi32(_mm256_cvtps_epi32(s0), preshift);
            t1 = _mm256_add_epi32(_mm256_cvtps_epi32(s1), preshift);

            t0 = _mm256_add_epi16(_mm256_packs_epi32(t0, t1), postshift);
            t0 = _mm256_permute4x64_epi64(t0, shuffle);
            _mm256_storeu_si256( (__m256i*)(dst + x), t0);
        }

    return x;
}
#endif

template<int shiftval> struct VResizeCubicVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
#if CV_AVX2
        if( checkHardwareSupport(CV_CPU_AVX2) )
            return VResizeCubicVec_32f16_avx2<shiftval>(_src, _dst, _beta, width);
#endif
        if( checkHardwareSupport(CV_CPU_SSE2) )
            return VResizeCubicVec_32f16_sse2<shiftval>(_src, _dst, _beta, width);

        return 0;
    }
};

typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;

struct VResizeCubicVec_32f
static int VResizeCubicVec_32f_sse(const uchar** _src, uchar* _dst, const uchar* _beta, int width )
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
@@ -854,6 +1354,40 @@ struct VResizeCubicVec_32f
    __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
        b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);

    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            x0 = _mm_load_ps(S0 + x);
            x1 = _mm_load_ps(S0 + x + 4);
            y0 = _mm_load_ps(S1 + x);
            y1 = _mm_load_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_load_ps(S2 + x);
            x1 = _mm_load_ps(S2 + x + 4);
            y0 = _mm_load_ps(S3 + x);
            y1 = _mm_load_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            _mm_storeu_ps( dst + x, s0);
            _mm_storeu_ps( dst + x + 4, s1);
        }
    else
        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
@@ -888,6 +1422,102 @@ struct VResizeCubicVec_32f
        }

    return x;
}

#if CV_AVX
int VResizeCubicVec_32f_avx(const uchar** _src, uchar* _dst, const uchar* _beta, int width )
{
    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
    float* dst = (float*)_dst;
    int x = 0;
    __m256 b0 = _mm256_set1_ps(beta[0]), b1 = _mm256_set1_ps(beta[1]),
        b2 = _mm256_set1_ps(beta[2]), b3 = _mm256_set1_ps(beta[3]);

    if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&31) == 0 )
        for( ; x <= width - 16; x += 16 )
        {
            __m256 x0, x1, y0, y1, s0, s1;
            x0 = _mm256_load_ps(S0 + x);
            x1 = _mm256_load_ps(S0 + x + 8);
            y0 = _mm256_load_ps(S1 + x);
            y1 = _mm256_load_ps(S1 + x + 8);

            s0 = _mm256_mul_ps(x0, b0);
            s1 = _mm256_mul_ps(x1, b0);
            y0 = _mm256_mul_ps(y0, b1);
            y1 = _mm256_mul_ps(y1, b1);
            s0 = _mm256_add_ps(s0, y0);
            s1 = _mm256_add_ps(s1, y1);

            x0 = _mm256_load_ps(S2 + x);
            x1 = _mm256_load_ps(S2 + x + 8);
            y0 = _mm256_load_ps(S3 + x);
            y1 = _mm256_load_ps(S3 + x + 8);

            x0 = _mm256_mul_ps(x0, b2);
            x1 = _mm256_mul_ps(x1, b2);
            y0 = _mm256_mul_ps(y0, b3);
            y1 = _mm256_mul_ps(y1, b3);
            s0 = _mm256_add_ps(s0, x0);
            s1 = _mm256_add_ps(s1, x1);
            s0 = _mm256_add_ps(s0, y0);
            s1 = _mm256_add_ps(s1, y1);

            _mm256_storeu_ps( dst + x, s0);
            _mm256_storeu_ps( dst + x + 8, s1);
        }
    else
        for( ; x <= width - 16; x += 16 )
        {
            __m256 x0, x1, y0, y1, s0, s1;
            x0 = _mm256_loadu_ps(S0 + x);
            x1 = _mm256_loadu_ps(S0 + x + 8);
            y0 = _mm256_loadu_ps(S1 + x);
            y1 = _mm256_loadu_ps(S1 + x + 8);

            s0 = _mm256_mul_ps(x0, b0);
            s1 = _mm256_mul_ps(x1, b0);
            y0 = _mm256_mul_ps(y0, b1);
            y1 = _mm256_mul_ps(y1, b1);
            s0 = _mm256_add_ps(s0, y0);
            s1 = _mm256_add_ps(s1, y1);

            x0 = _mm256_loadu_ps(S2 + x);
            x1 = _mm256_loadu_ps(S2 + x + 8);
            y0 = _mm256_loadu_ps(S3 + x);
            y1 = _mm256_loadu_ps(S3 + x + 8);

            x0 = _mm256_mul_ps(x0, b2);
            x1 = _mm256_mul_ps(x1, b2);
            y0 = _mm256_mul_ps(y0, b3);
            y1 = _mm256_mul_ps(y1, b3);
            s0 = _mm256_add_ps(s0, x0);
            s1 = _mm256_add_ps(s1, x1);
            s0 = _mm256_add_ps(s0, y0);
            s1 = _mm256_add_ps(s1, y1);

            _mm256_storeu_ps( dst + x, s0);
            _mm256_storeu_ps( dst + x + 8, s1);
        }

    return x;
}
#endif

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
#if CV_AVX
        if( checkHardwareSupport(CV_CPU_AVX) )
            return VResizeCubicVec_32f_avx(_src, _dst, _beta, width);
#endif
        if( checkHardwareSupport(CV_CPU_SSE) )
            return VResizeCubicVec_32f_sse(_src, _dst, _beta, width);

        return 0;
    }
};
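
// All of the functors above share one contract with the scalar VResize* code that
// invokes them: operator() vectorizes a prefix of the row and returns how many
// elements it produced, and the caller finishes the tail. A hypothetical caller for
// the float->float linear case (names are illustrative, not the actual call site):
static void vresize_linear_32f_row_ref(const float* S0, const float* S1,
                                       float* D, const float* beta, int width)
{
    const uchar* srows[2] = { (const uchar*)S0, (const uchar*)S1 };
    int x = VResizeLinearVec_32f()(srows, (uchar*)D, (const uchar*)beta, width);
    for( ; x < width; x++ )
        D[x] = S0[x]*beta[0] + S1[x]*beta[1];
}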