diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index abc410798..b9f3d3d5a 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -522,6 +522,13 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
+#if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_CASE_P(
+    AVX2_C_COMPARE_SINGLE, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8)));
+#endif
+
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@@ -584,4 +591,5 @@ INSTANTIATE_TEST_CASE_P(
                    &vp9_lpf_vertical_8_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
+
 }  // namespace
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
index 439c028f2..0cb0912ad 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -9,6 +9,7 @@
  */
 
 #include <immintrin.h>  /* AVX2 */
+#include "vpx_ports/mem.h"
 
 static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
@@ -392,6 +393,11 @@ static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
     }
 }
 
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+  0, 128,  1, 128,  2, 128,  3, 128,  4, 128,  5, 128,  6, 128,  7, 128,
+  8, 128,  9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
 static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh) {
@@ -401,6 +407,9 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
     __m128i p7, p6, p5;
     __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
     __m128i q5, q6, q7;
+    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+            p256_0, q256_0;
 
     const __m128i thresh = _mm_broadcastb_epi8(
             _mm_cvtsi32_si128((int) _thresh[0]));
@@ -409,16 +418,37 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
     const __m128i blimit = _mm_broadcastb_epi8(
             _mm_cvtsi32_si128((int) _blimit[0]));
 
-    p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
-    p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
-    p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
-    p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
-    p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
-    q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
-    q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
-    q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
-    q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
-    q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 5 * p)));
+    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 4 * p)));
+    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 3 * p)));
+    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 2 * p)));
+    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 1 * p)));
+    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s - 0 * p)));
+    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 1 * p)));
+    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 2 * p)));
+    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 3 * p)));
+    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                (__m128d const *)(s + 4 * p)));
+
+    p4 = _mm256_castsi256_si128(p256_4);
+    p3 = _mm256_castsi256_si128(p256_3);
+    p2 = _mm256_castsi256_si128(p256_2);
+    p1 = _mm256_castsi256_si128(p256_1);
+    p0 = _mm256_castsi256_si128(p256_0);
+    q0 = _mm256_castsi256_si128(q256_0);
+    q1 = _mm256_castsi256_si128(q256_1);
+    q2 = _mm256_castsi256_si128(q256_2);
+    q3 = _mm256_castsi256_si128(q256_3);
+    q4 = _mm256_castsi256_si128(q256_4);
 
     {
         const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
@@ -534,23 +564,35 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         flat = _mm_cmpeq_epi8(flat, zero);
         flat = _mm_and_si128(flat, mask);
 
-        p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
-        q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+        p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                    (__m128d const *)(s - 6 * p)));
+        q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                    (__m128d const *)(s + 5 * p)));
+        p5 = _mm256_castsi256_si128(p256_5);
+        q5 = _mm256_castsi256_si128(q256_5);
         flat2 = _mm_max_epu8(
                 _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
                 _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
 
         flat2 = _mm_max_epu8(work, flat2);
-        p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
-        q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+        p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                    (__m128d const *)(s - 7 * p)));
+        q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                    (__m128d const *)(s + 6 * p)));
+        p6 = _mm256_castsi256_si128(p256_6);
+        q6 = _mm256_castsi256_si128(q256_6);
         work = _mm_max_epu8(
                 _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
                 _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
 
         flat2 = _mm_max_epu8(work, flat2);
-        p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
-        q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+        p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                    (__m128d const *)(s - 8 * p)));
+        q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+                                    (__m128d const *)(s + 7 * p)));
+        p7 = _mm256_castsi256_si128(p256_7);
+        q7 = _mm256_castsi256_si128(q256_7);
         work = _mm_max_epu8(
                 _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
                 _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
 
@@ -566,29 +608,28 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
     {
         const __m256i eight = _mm256_set1_epi16(8);
         const __m256i four = _mm256_set1_epi16(4);
-        __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
-                q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
-                p256_0, q256_0;
         __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
                 pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3,
                 res_p, res_q;
 
-        p256_7 = _mm256_cvtepu8_epi16(p7);
-        p256_6 = _mm256_cvtepu8_epi16(p6);
-        p256_5 = _mm256_cvtepu8_epi16(p5);
-        p256_4 = _mm256_cvtepu8_epi16(p4);
-        p256_3 = _mm256_cvtepu8_epi16(p3);
-        p256_2 = _mm256_cvtepu8_epi16(p2);
-        p256_1 = _mm256_cvtepu8_epi16(p1);
-        p256_0 = _mm256_cvtepu8_epi16(p0);
-        q256_0 = _mm256_cvtepu8_epi16(q0);
-        q256_1 = _mm256_cvtepu8_epi16(q1);
-        q256_2 = _mm256_cvtepu8_epi16(q2);
-        q256_3 = _mm256_cvtepu8_epi16(q3);
-        q256_4 = _mm256_cvtepu8_epi16(q4);
-        q256_5 = _mm256_cvtepu8_epi16(q5);
-        q256_6 = _mm256_cvtepu8_epi16(q6);
-        q256_7 = _mm256_cvtepu8_epi16(q7);
+        const __m256i filter = _mm256_load_si256(
+                              (__m256i const *)filt_loopfilter_avx2);
+        p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+        p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+        p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+        p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+        p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+        p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+        p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+        p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+        q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+        q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+        q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+        q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+        q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+        q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+        q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+        q256_7 = _mm256_shuffle_epi8(q256_7, filter);
 
         pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                 _mm256_add_epi16(p256_4, p256_3));
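A note on the byte-widening trick above (reviewer commentary, not part of the patch): instead of one `_mm_loadu_si128` per row followed later by `_mm256_cvtepu8_epi16`, the patch loads each 16-byte row once with `_mm256_broadcast_pd`, which duplicates it into both 128-bit lanes of a 256-bit register, and widens it with a single in-lane `_mm256_shuffle_epi8`. Any shuffle index with its high bit set makes vpshufb write a zero byte, so the `k, 128` pairs in `filt_loopfilter_avx2` interleave each lane-local byte with zero, i.e. zero-extend bytes to 16-bit words. The sketch below is a minimal standalone harness checking that the two paths agree; the names `row`, `filt`, and `dup` are illustrative, not from libvpx. Build with something like `gcc -mavx2 check.c` on an AVX2-capable machine.

/* Hypothetical standalone check; not part of the patch above. */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  /* Same interleave mask as filt_loopfilter_avx2: index k selects byte k
   * within the 128-bit lane; 128 (high bit set) makes vpshufb write 0. */
  static const uint8_t filt[32] = {
    0, 128,  1, 128,  2, 128,  3, 128,  4, 128,  5, 128,  6, 128,  7, 128,
    8, 128,  9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
  };
  uint8_t row[16];
  uint16_t a[16], b[16];
  int i;

  for (i = 0; i < 16; i++)
    row[i] = (uint8_t)(250 + 3 * i);  /* arbitrary data that wraps past 255 */

  /* Old path: 128-bit load, then zero-extend each byte to a 16-bit word. */
  const __m256i ref =
      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)row));

  /* New path: broadcast the row into both lanes, then shuffle. The low
   * lane widens bytes 0..7 and the high lane bytes 8..15, because
   * vpshufb only indexes within its own 128-bit lane. */
  const __m256i dup =
      _mm256_castpd_si256(_mm256_broadcast_pd((const __m128d *)row));
  const __m256i widened =
      _mm256_shuffle_epi8(dup, _mm256_loadu_si256((const __m256i *)filt));

  _mm256_storeu_si256((__m256i *)a, ref);
  _mm256_storeu_si256((__m256i *)b, widened);
  printf("%s\n", memcmp(a, b, sizeof(a)) == 0 ? "match" : "MISMATCH");
  return 0;
}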