pyrUp and pyrDown
parent 63fc6ef316, commit 56f3c92737

@@ -183,13 +183,329 @@ struct PyrDownVec_32f
    }
};

typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
typedef PyrUpNoVec<float, float> PyrUpVec_32f;

#if CV_SSE4_1
struct PyrDownVec_32s16u
{
    PyrDownVec_32s16u()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
    }

    int operator()(int** src, ushort* dst, int, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i v_delta = _mm_set1_epi32(128);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));

            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);

            v_r10 = _mm_slli_epi32(v_r10, 2);
            __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);

            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
            v_r11 = _mm_slli_epi32(v_r11, 2);
            __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1));
        }

        return x;
    }

    bool haveSSE;
};

#endif
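For orientation: both PyrDown functors vectorize the vertical pass of the 1-4-6-4-1 Gaussian kernel with a +128 rounding term and a right shift by 8 (division by 256). A minimal scalar sketch of what each lane computes is below; the helper name is made up for illustration and is not part of the commit.

// Scalar sketch (illustration only) of the vertical 1-4-6-4-1 pyrDown pass
// that PyrDownVec_32s16u / PyrDownVec_32s16s vectorize.
static void pyrDownRowScalar(const int* const* src, ushort* dst, int width)
{
    const int *r0 = src[0], *r1 = src[1], *r2 = src[2], *r3 = src[3], *r4 = src[4];
    for (int x = 0; x < width; ++x)
        dst[x] = (ushort)((r0[x] + 4*r1[x] + 6*r2[x] + 4*r3[x] + r4[x] + 128) >> 8);
}

The intrinsics compute the same sum factored as (r0 + r4 + 2*r2) + 4*(r1 + r2 + r3): v_r00 collects the first group, v_r10 the second, and the shift left by 2 supplies the factor of 4.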
struct PyrDownVec_32s16s
{
    PyrDownVec_32s16s()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(int** src, short* dst, int, int width) const
    {
        int x = 0;

        if (!haveSSE)
            return x;

        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
        __m128i v_delta = _mm_set1_epi32(128);

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));

            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);

            v_r10 = _mm_slli_epi32(v_r10, 2);
            __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);

            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
            v_r11 = _mm_slli_epi32(v_r11, 2);
            __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1));
        }

        return x;
    }

    bool haveSSE;
};
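These functors return the number of columns they processed, and the caller finishes the row with a scalar tail loop. A hedged sketch of that calling pattern follows; the helper name and variables are illustrative, not the exact pyrDown_ code, which also deals with borders and channel counts.

// Hypothetical caller showing the vector-head / scalar-tail pattern (illustration only).
static void downsampleRow16s(int** rows, short* dstRow, int dstWidth)
{
    PyrDownVec_32s16s vecOp;
    int x = vecOp(rows, dstRow, 0, dstWidth);   // vectorized part, returns columns done
    for (; x < dstWidth; ++x)                   // scalar tail for the leftover columns
        dstRow[x] = (short)((rows[0][x] + 4*rows[1][x] + 6*rows[2][x]
                             + 4*rows[3][x] + rows[4][x] + 128) >> 8);
}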
struct PyrUpVec_32s8u
{
    int operator()(int** src, uchar** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        uchar *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 16; x += 16 )
        {
            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));

            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row0 + x + 12)));
            v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row1 + x + 12)));
            v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
                                   _mm_loadu_si128((__m128i const *)(row2 + x + 12)));

            v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));

            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
            __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};
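Unlike the PyrDown functors, each PyrUp functor fills two destination rows from three intermediate rows: dst[0] receives the 1-6-1 combination and dst[1] the 4-4 combination, both with a +32 rounding term and a right shift by 6 (division by 64). A scalar sketch of what the 8u variant computes per column, with a made-up helper name:

// Scalar sketch (illustration only) of one pyrUp vertical step:
// dst0 = even output row, dst1 = odd output row.
static void pyrUpRowsScalar(const int* const* src, uchar** dst, int width)
{
    const int *r0 = src[0], *r1 = src[1], *r2 = src[2];
    uchar *dst0 = dst[0], *dst1 = dst[1];
    for (int x = 0; x < width; ++x)
    {
        dst0[x] = (uchar)((r0[x] + 6*r1[x] + r2[x] + 32) >> 6);
        dst1[x] = (uchar)((4*(r1[x] + r2[x]) + 32) >> 6);
    }
}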
struct PyrUpVec_32s16s
{
    int operator()(int** src, short** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        short *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            v_2r1 = _mm_slli_epi32(v_r1, 1);
            v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
                                _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
                                _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);

            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x),
                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};
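Worth noting why PyrUpVec_32s8u can drop to 16-bit lanes (packs_epi32 followed by saturating epu16 adds, 16 pixels per iteration) while the 16s/16u variants stay in 32-bit lanes: assuming 8-bit sources and the 1-6-1 / 4-4 horizontal weights, every intermediate row value is at most 255*8 = 2040, so the vertical sum never exceeds 2040*8 = 16320 and fits a 16-bit lane; with 16-bit sources the same sums can reach roughly 65535*64, which needs 32-bit arithmetic. A compile-time restatement of that bound (my own check, not from the commit):

// Assumption check (not part of the commit): 8-bit pyrUp intermediates fit in 16-bit lanes.
static_assert(255 * 8 * 8 <= 32767, "8u path: vertical sum fits a signed 16-bit lane");
static_assert(65535LL * 8 * 8 > 32767, "16-bit paths: sums overflow 16-bit lanes, keep 32-bit math");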
#if CV_SSE4_1

struct PyrUpVec_32s16u
{
    int operator()(int** src, ushort** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE4_1))
            return x;

        ushort *dst0 = dst[0], *dst1 = dst[1];
        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();

        for( ; x <= width - 8; x += 8 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
            v_2r1 = _mm_slli_epi32(v_r1, 1);
            v_4r1 = _mm_slli_epi32(v_r1, 2);
            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storeu_si128((__m128i *)(dst0 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
                                 _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
            _mm_storeu_si128((__m128i *)(dst1 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
                                 _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
        }

        for( ; x <= width - 4; x += 4 )
        {
            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);

            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);

            _mm_storel_epi64((__m128i *)(dst0 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
            _mm_storel_epi64((__m128i *)(dst1 + x),
                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
        }

        return x;
    }
};

#endif
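The two ushort variants are the only ones gated on CV_SSE4_1; the unsigned saturating 32-to-16 pack (_mm_packus_epi32) they rely on does not exist before SSE4.1. On targets without it, the no-op functor typedefs listed at the top of the hunk remain in effect, so the caller falls back to its scalar loop. The guard pattern, roughly (a sketch, not a verbatim quote of the file):

#if CV_SSE4_1
// vectorized PyrUpVec_32s16u as defined above
#else
typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;   // scalar fallback: operator() returns 0
#endif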
struct PyrUpVec_32f
{
    int operator()(float** src, float** dst, int, int width) const
    {
        int x = 0;

        if (!checkHardwareSupport(CV_CPU_SSE2))
            return x;

        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
        float *dst0 = dst[0], *dst1 = dst[1];
        __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
               v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_r0 = _mm_loadu_ps(row0 + x);
            __m128 v_r1 = _mm_loadu_ps(row1 + x);
            __m128 v_r2 = _mm_loadu_ps(row2 + x);

            _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
            _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));

            v_r0 = _mm_loadu_ps(row0 + x + 4);
            v_r1 = _mm_loadu_ps(row1 + x + 4);
            v_r2 = _mm_loadu_ps(row2 + x + 4);

            _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
            _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
        }

        return x;
    }
};
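The float path needs no rounding term: the normalization is an exact multiply by 1/64 for the even row and 4/64 for the odd row. A scalar equivalent of one stored element pair, as I read the intrinsics (illustrative helper name):

// Scalar equivalent (illustration only) of one PyrUpVec_32f output pair.
static inline void pyrUpPairScalar32f(const float* r0, const float* r1, const float* r2,
                                      float* dst0, float* dst1, int x)
{
    dst0[x] = (r0[x] + 6.f * r1[x] + r2[x]) * (1.f / 64.f);
    dst1[x] = (r1[x] + r2[x]) * (4.f / 64.f);
}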
#elif CV_NEON