[Processing] Add AVX2 VAA routines
Process 8 lines at a time rather than 16 because this appears to give more reliable memory-subsystem performance on Haswell. The speedup is more than 2x compared to SSE2 when not memory-bound on Haswell. On my Haswell MBP, VAACalcSadSsdBgd is roughly 3x faster when uncached, which appears to be related to processing 8 lines at a time instead of 16. The other routines are also faster than the SSE2 routines in this case, but to a lesser extent.
This commit is contained in:
parent
eb9f56584f
commit
57fc3e9917
@ -64,6 +64,13 @@ void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {
|
||||
sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
|
||||
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
|
||||
}
|
||||
if (iCpuFlag & WELS_CPU_AVX2) {
|
||||
sVaaFuncs.pfVAACalcSad = VAACalcSad_avx2;
|
||||
sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_avx2;
|
||||
sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_avx2;
|
||||
sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_avx2;
|
||||
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_avx2;
|
||||
}
|
||||
#endif//X86_ASM
|
||||
#ifdef HAVE_NEON
|
||||
if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
|
||||
|
@ -104,6 +104,11 @@ VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_sse2;
|
||||
VAACalcSadFunc VAACalcSad_sse2;
|
||||
VAACalcSadVarFunc VAACalcSadVar_sse2;
|
||||
VAACalcSadSsdFunc VAACalcSadSsd_sse2;
|
||||
VAACalcSadBgdFunc VAACalcSadBgd_avx2;
|
||||
VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_avx2;
|
||||
VAACalcSadFunc VAACalcSad_avx2;
|
||||
VAACalcSadVarFunc VAACalcSadVar_avx2;
|
||||
VAACalcSadSsdFunc VAACalcSadSsd_avx2;
|
||||
WELSVP_EXTERN_C_END
|
||||
#endif
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -828,6 +828,12 @@ GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_sse2, 1, WELS_CPU_SSE2)
|
||||
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_sse2, 1, WELS_CPU_SSE2)
|
||||
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_sse2, 1, WELS_CPU_SSE2)
|
||||
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_sse2, 1, WELS_CPU_SSE2)
|
||||
|
||||
GENERATE_VAACalcSad_UT (VAACalcSad_avx2, 1, WELS_CPU_AVX2)
|
||||
GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_avx2, 1, WELS_CPU_AVX2)
|
||||
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_avx2, 1, WELS_CPU_AVX2)
|
||||
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_avx2, 1, WELS_CPU_AVX2)
|
||||
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_avx2, 1, WELS_CPU_AVX2)
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
|
Loading…
x
Reference in New Issue
Block a user