SAD32xh and SAD64xh for AVX2

All SAD functions that process 32 or more consecutive elements are optimized
for AVX2:
vp9_sad64x64
vp9_sad64x32
vp9_sad32x64
vp9_sad32x32
vp9_sad32x16
vp9_sad64x64_avg
vp9_sad64x32_avg
vp9_sad32x64_avg
vp9_sad32x32_avg
vp9_sad32x16_avg
The functions that appeared as hotspots are vp9_sad32x32 and vp9_sad64x64.
vp9_sad32x32 was optimized by 68% and vp9_sad64x64 was optimized by 90%;
together they gave an overall ~2.3% user-level gain.

Change-Id: Iccf86b375a2b54c5fbbe685902ead0c9a561b9fd
This commit is contained in:
levytamar82
2014-10-01 23:47:31 -07:00
parent feee7d97b7
commit 7045aec00a
4 changed files with 205 additions and 10 deletions

View File

@@ -625,6 +625,20 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
#if HAVE_AVX2
#if CONFIG_VP9_ENCODER
// Named function pointers to the AVX2 implementations of the VP9 SAD
// functions, so each one can be listed in the parameter table below.
const SadMxNVp9Func sad_64x64_avx2_vp9 = vp9_sad64x64_avx2;
const SadMxNVp9Func sad_64x32_avx2_vp9 = vp9_sad64x32_avx2;
const SadMxNVp9Func sad_32x64_avx2_vp9 = vp9_sad32x64_avx2;
const SadMxNVp9Func sad_32x32_avx2_vp9 = vp9_sad32x32_avx2;
const SadMxNVp9Func sad_32x16_avx2_vp9 = vp9_sad32x16_avx2;
// Parameter table for the AVX2 instantiation of SADVP9Test: one tuple per
// function under test.
// NOTE(review): the two integers in each tuple match the WxH suffix of the
// function they accompany, so they are presumably block width and height --
// confirm against the SadMxNVp9Param definition.
const SadMxNVp9Param avx2_vp9_tests[] = {
make_tuple(64, 64, sad_64x64_avx2_vp9),
make_tuple(64, 32, sad_64x32_avx2_vp9),
make_tuple(32, 64, sad_32x64_avx2_vp9),
make_tuple(32, 32, sad_32x32_avx2_vp9),
make_tuple(32, 16, sad_32x16_avx2_vp9),
};
// Instantiate SADVP9Test under the "AVX2" prefix, once per table entry.
INSTANTIATE_TEST_CASE_P(AVX2, SADVP9Test, ::testing::ValuesIn(avx2_vp9_tests));
const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(