flac/x86: add ff_flac_lpc_32_sse4()
Benchmarked on Sandy Bridge x86_64: 1358232 decicycles in flac_lpc_32_c; 1244575 decicycles in flac_lpc_32_sse4 (James Almer's patch); 650045 decicycles in flac_lpc_32_sse4 (this patch). I haven't tested the edge cases such as odd block lengths. Odd block length tested-by: James Almer <jamrial@gmail.com>. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
4a37e2977c
commit
9c978f243a
@ -128,4 +128,6 @@ av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
|
||||
|
||||
if (ARCH_ARM)
|
||||
ff_flacdsp_init_arm(c, fmt, bps);
|
||||
if (ARCH_X86)
|
||||
ff_flacdsp_init_x86(c, fmt, bps);
|
||||
}
|
||||
|
@ -33,5 +33,6 @@ typedef struct FLACDSPContext {
|
||||
|
||||
void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
|
||||
void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
|
||||
void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
|
||||
|
||||
#endif /* AVCODEC_FLACDSP_H */
|
||||
|
@ -12,6 +12,7 @@ OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \
|
||||
x86/fdct.o \
|
||||
x86/motion_est.o
|
||||
OBJS-$(CONFIG_FFT) += x86/fft_init.o
|
||||
OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp_init.o
|
||||
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
|
||||
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
|
||||
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
|
||||
@ -70,6 +71,7 @@ YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
|
||||
x86/qpel.o
|
||||
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
|
||||
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
|
||||
YASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
|
||||
YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
|
||||
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
|
||||
x86/h264_chromamc_10bit.o
|
||||
|
71
libavcodec/x86/flacdsp.asm
Normal file
71
libavcodec/x86/flacdsp.asm
Normal file
@ -0,0 +1,71 @@
|
||||
;******************************************************************************
|
||||
;* FLAC DSP SIMD optimizations
|
||||
;*
|
||||
;* Copyright (C) 2014 Loren Merritt
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
INIT_XMM sse4
|
||||
;------------------------------------------------------------------------------
; void ff_flac_lpc_32(int32_t *decoded, const int coeffs[32], int pred_order,
;                     int qlevel, int len)
;
; In-place LPC reconstruction with 64-bit accumulation. For every index i in
; [pred_order, len) the value already stored in decoded[i] has
;     (sum over k of coeffs[k] * decoded[i - pred_order + k]) >> qlevel
; added to it (32-bit wrap-around add of the low dword of the shifted sum).
;
; Two output samples are produced per outer iteration. The inner loop
; interleaves both dot products so that every loaded history sample and
; every loaded coefficient is used twice:
;     m2 = accumulator for sample i
;     m3 = accumulator for sample i+1
;     m4 = qlevel (shift count)
;     m0 = current history sample, m1 = current coefficient (roles of the
;          two registers as multiply destination alternate)
;
; Register roles after setup:
;     decodedq   -> sample being produced (after the "add decodedq, 8")
;     coeffsq    -> one past the last coefficient
;     pred_orderq = -pred_order, jq = negative index counting up to 0
;     lend       = number of samples still to produce
;------------------------------------------------------------------------------
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub    lend, pred_orderd                        ; lend = samples to predict
    jle .ret                                        ; nothing to do
    ; pred_order is a 32-bit int but is used below as a full-width index.
    ; The upper half of the argument register is not guaranteed to be
    ; defined on x86-64, so widen it explicitly (no-op on x86-32).
    movsxdifnidn pred_orderq, pred_orderd
    lea    decodedq, [decodedq+pred_orderq*4-8]     ; biased by -8: the loop adds 8 first
    lea    coeffsq, [coeffsq+pred_orderq*4]         ; end of the coefficient array
    neg    pred_orderq                              ; index from the end with negative offsets
    movd   m4, qlevelm                              ; shift count for psrlq
ALIGN 16
.loop_sample:
    movd   m0, [decodedq+pred_orderq*4+8]           ; oldest history sample for output i
    add    decodedq, 8                              ; decodedq -> output sample i
    movd   m1, [coeffsq+pred_orderq*4]              ; coeffs[0]
    pxor   m2, m2                                   ; sum for sample i
    pxor   m3, m3                                   ; sum for sample i+1
    lea    jq, [pred_orderq+1]
    test   jq, jq
    jz .end_order                                   ; pred_order == 1: no inner iterations
.loop_order:
    pmuldq m0, m1                                   ; signed 32x32 -> 64 product
    paddq  m2, m0                                   ; sample i: += history * coeff
    movd   m0, [decodedq+jq*4]                      ; next history sample
    pmuldq m1, m0                                   ; same coeff, one sample later
    paddq  m3, m1                                   ; sample i+1: += history * coeff
    movd   m1, [coeffsq+jq*4]                       ; next coefficient
    inc    jq
    jl .loop_order                                  ; until jq reaches 0
.end_order:
    pmuldq m0, m1                                   ; last term of sample i
    paddq  m2, m0
    psrlq  m2, m4                                   ; >> qlevel; only the low dword is used
    movd   m0, [decodedq]                           ; value currently stored at i
    paddd  m0, m2
    movd   [decodedq], m0                           ; decoded[i] finished (kept in m0)
    sub  lend, 2
    jl .ret                                         ; odd count: no second sample
    pmuldq m1, m0                                   ; last term of sample i+1 uses decoded[i]
    paddq  m3, m1
    psrlq  m3, m4
    movd   m1, [decodedq+4]                         ; value currently stored at i+1
    paddd  m1, m3
    movd   [decodedq+4], m1
    jg .loop_sample                                 ; flags still from "sub lend, 2"
.ret:
    REP_RET
|
39
libavcodec/x86/flacdsp_init.c
Normal file
39
libavcodec/x86/flacdsp_init.c
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2014 James Almer
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavcodec/flacdsp.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "config.h"
|
||||
|
||||
void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
|
||||
int qlevel, int len);
|
||||
|
||||
/**
 * Install x86-specific implementations into a FLAC DSP context.
 *
 * When the build has external assembly enabled (HAVE_YASM) and the running
 * CPU reports SSE4 support, streams with more than 16 bits per sample get
 * the SSE4 32-bit LPC routine. In every other case the context is left
 * untouched.
 *
 * @param c   DSP context whose lpc function pointer may be replaced
 * @param fmt sample format (not used by this function)
 * @param bps bits per sample of the stream
 */
av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt,
                                 int bps)
{
#if HAVE_YASM
    const int cpu_flags = av_get_cpu_flags();

    /* Only the wide (> 16 bps) LPC path has an assembly version here. */
    if (bps > 16 && EXTERNAL_SSE4(cpu_flags))
        c->lpc = ff_flac_lpc_32_sse4;
#endif
}
|
Loading…
Reference in New Issue
Block a user