x86/mpegvideoencdsp: improve ff_pix_sum16_sse2

~15% faster.

Also add an mmxext version that takes advantage of the new code, and
build it alongside with the mmx version only on x86_32.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer
2014-09-30 22:21:40 -03:00
parent f2e53808e3
commit acebff8e5d
2 changed files with 42 additions and 18 deletions

View File

@@ -24,6 +24,7 @@
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
@@ -218,11 +219,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmxext;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2;