x86/mlpdec: add ff_mlp_rematrix_channel_{sse4,avx2}
2x to 2.5x faster than the C version. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
ddb813b0ef
commit
0de1d6287e
@ -105,7 +105,7 @@ typedef struct SubStream {
|
||||
/// Whether the LSBs of the matrix output are encoded in the bitstream.
|
||||
uint8_t lsb_bypass[MAX_MATRICES];
|
||||
/// Matrix coefficients, stored as 2.14 fixed point.
|
||||
int32_t matrix_coeff[MAX_MATRICES][MAX_CHANNELS];
|
||||
DECLARE_ALIGNED(32, int32_t, matrix_coeff)[MAX_MATRICES][MAX_CHANNELS];
|
||||
/// Left shift to apply to noise values in 0x31eb substreams.
|
||||
uint8_t matrix_noise_shift[MAX_MATRICES];
|
||||
//@}
|
||||
@ -159,7 +159,7 @@ typedef struct MLPDecodeContext {
|
||||
|
||||
int8_t noise_buffer[MAX_BLOCKSIZE_POW2];
|
||||
int8_t bypassed_lsbs[MAX_BLOCKSIZE][MAX_CHANNELS];
|
||||
int32_t sample_buffer[MAX_BLOCKSIZE][MAX_CHANNELS];
|
||||
DECLARE_ALIGNED(32, int32_t, sample_buffer)[MAX_BLOCKSIZE][MAX_CHANNELS];
|
||||
|
||||
MLPDSPContext dsp;
|
||||
} MLPDecodeContext;
|
||||
|
@ -41,7 +41,7 @@ OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
|
||||
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
|
||||
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
|
||||
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
|
||||
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
|
||||
OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_init.o
|
||||
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
|
||||
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
|
||||
@ -52,7 +52,7 @@ OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \
|
||||
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
|
||||
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
|
||||
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
|
||||
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
|
||||
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
|
||||
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
|
||||
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
|
||||
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
|
||||
@ -132,6 +132,7 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_mc.o \
|
||||
x86/hevc_deblock.o \
|
||||
x86/hevc_idct.o \
|
||||
x86/hevc_res_add.o
|
||||
YASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
|
||||
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
|
||||
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
|
||||
YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
|
||||
@ -139,6 +140,7 @@ YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o
|
||||
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
|
||||
x86/rv40dsp.o
|
||||
YASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
|
||||
YASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
|
||||
YASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
|
||||
YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
|
||||
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
|
||||
|
196
libavcodec/x86/mlpdsp.asm
Normal file
196
libavcodec/x86/mlpdsp.asm
Normal file
@ -0,0 +1,196 @@
|
||||
;******************************************************************************
|
||||
;* SIMD-optimized MLP DSP functions
|
||||
;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_TEXT
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
; SHLX dst, cnt
; Shift dst left, in place, by the count held in the register named cnt.
; cnt is passed as a bare x86inc register name; the macro appends the size
; suffix itself (q or b).
;   - bmi2 build: uses the three-operand shlx with the 64-bit form of cnt,
;     so cnt may live in any general-purpose register.
;   - other builds: uses plain shl with the byte form of cnt. x86 shl only
;     accepts a variable count in cl, so the caller must make sure cnt is the
;     name currently bound to rcx.
%macro SHLX 2
|
||||
%if cpuflag(bmi2)
|
||||
shlx %1, %1, %2q
|
||||
%else
|
||||
shl %1, %2b
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; REMATRIX
; Multiply one row of 32-bit samples by the matching 32-bit coefficients and
; reduce the signed 64-bit products to two partial sums in the two qwords of
; xm0. The caller still has to add those two qwords together.
;
; pmuldq only multiplies the even dword lanes (0 and 2) of each 128-bit lane,
; so each input is also shuffled with q2301 (swap adjacent dwords) to bring
; the odd lanes into even position for a second pmuldq.
;   - non-avx2 build: m0..m4 are 16 bytes wide, so the row is handled as two
;     16-byte halves (offsets 0 and 16) accumulated into m0.
;   - avx2 build: one 32-byte load covers the row; the upper 128 bits of m0
;     are then folded onto the lower 128 bits.
; Uses aligned loads (movdqa) from samplesq and coeffsq.
; Clobbers m0-m3, plus m4 in the non-avx2 build.
%macro REMATRIX 0
|
||||
movdqa m0, [samplesq]
|
||||
movdqa m1, [coeffsq ]
|
||||
pshufd m2, m0, q2301
|
||||
pshufd m3, m1, q2301
|
||||
pmuldq m0, m1
|
||||
pmuldq m3, m2
|
||||
paddq m0, m3
|
||||
%if notcpuflag(avx2)
|
||||
movdqa m1, [samplesq + 16]
|
||||
movdqa m2, [coeffsq + 16]
|
||||
pshufd m3, m1, q2301
|
||||
pshufd m4, m2, q2301
|
||||
pmuldq m1, m2
|
||||
pmuldq m4, m3
|
||||
paddq m0, m1
|
||||
paddq m0, m4
|
||||
%else
|
||||
vextracti128 xm1, m0, 1
|
||||
paddq xm0, xm1
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; LOOP_END
; Tail of one loop iteration for the paths that add no noise.
; On entry xm0 holds two 64-bit partial sums. They are folded into one
; (pshufd q0032 copies the high qword of xm0 into the low qword of xm1) and
; moved to accumq, then shifted, masked, combined with the bypassed-LSB byte
; and stored to the destination channel of the current sample row.
; dest_chq is used as a byte offset into the row.
; Both pointers are then advanced by one row (8 bytes of bypassed_lsbs,
; 32 bytes of samples).
; The final cmp leaves the flags for the conditional jump that the caller
; places right after this macro; nothing may be inserted between them that
; modifies flags.
; Clobbers xm1, accum, blsbs.
%macro LOOP_END 0
|
||||
pshufd xm1, xm0, q0032
|
||||
paddq xm0, xm1
|
||||
movq accumq, xm0
|
||||
movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs
|
||||
sar accumq, 14 ; accum >>= 14
|
||||
and accumd, maskd ; accum &= mask
|
||||
add accumd, blsbsd ; accum += *bypassed_lsbs
|
||||
mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
|
||||
add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
|
||||
add samplesq, 32 ; samples += MAX_CHANNELS;
|
||||
cmp blsbs_ptrq, cntq
|
||||
%endmacro
|
||||
|
||||
; LOOP_SHIFT_END
; Tail of one loop iteration for the paths that mix in noise.
; Same reduction and store as LOOP_END, but before the >> 14 a
; sign-extended byte from noise_buffer, shifted left by mns via SHLX, is
; added to the 64-bit accumulator.
; Register expectations (set up by the code that invokes this macro):
;   - ausp is ANDed with index as a wrap mask before each noise lookup.
;   - index2 is the per-iteration increment applied to index.
;   - mns holds the shift count; in non-bmi2 builds it must be bound to
;     rcx because SHLX falls back to "shl reg, cl".
; The noise register doubles as the temporary for the bypassed-LSB byte.
; The final cmp leaves the flags for the conditional jump that the caller
; places right after this macro.
; Clobbers xm1, accum, noise, index.
%macro LOOP_SHIFT_END 0
|
||||
pshufd xm1, xm0, q0032
|
||||
paddq xm0, xm1
|
||||
movq accumq, xm0
|
||||
and indexd, auspd ; index &= access_unit_size_pow2;
|
||||
movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
|
||||
add indexd, index2d ; index += index2
|
||||
SHLX noiseq, mns ; noise_buffer[index] <<= matrix_noise_shift
|
||||
add accumq, noiseq ; accum += noise_buffer[index]
|
||||
movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register)
|
||||
sar accumq, 14 ; accum >>= 14
|
||||
and accumd, maskd ; accum &= mask
|
||||
add accumd, noised ; accum += *bypassed_lsbs
|
||||
mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
|
||||
add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
|
||||
add samplesq, 32 ; samples += MAX_CHANNELS;
|
||||
cmp blsbs_ptrq, cntq
|
||||
%endmacro
|
||||
|
||||
;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
|
||||
; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
|
||||
; int index, unsigned int dest_ch, uint16_t blockpos,
|
||||
; unsigned int maxchan, int matrix_noise_shift,
|
||||
; int access_unit_size_pow2, int32_t mask)
|
||||
; MLP_REMATRIX_CHANNEL
; Emits one mlp_rematrix_channel function for the instruction set selected by
; the preceding INIT_XMM / INIT_YMM. The body consists of four loops:
;   .loop8        matrix_noise_shift == 0, maxchan >= 4 (full row via REMATRIX)
;   .loop4        matrix_noise_shift == 0, maxchan <  4 (first 16 bytes only)
;   .loop8_shift  matrix_noise_shift != 0, maxchan >= 4
;   .loop4_shift  matrix_noise_shift != 0, maxchan <  4
; Each loop handles one sample row per iteration and ends when blsbs_ptr
; reaches cnt = bypassed_lsbs + blockpos*8.
;
; cglobal is asked to auto-load no arguments; those that are needed and are
; not already in registers are loaded explicitly below through the <name>m
; operands.
;
; NOTE(review): all four loops test the end condition after the body, so at
; least one row is always processed. This assumes blockpos != 0 - confirm
; that callers guarantee it.
; NOTE(review): samples and coeffs are read with aligned loads (movdqa), so
; they are assumed to be 16-byte aligned (32-byte for the avx2 build) -
; confirm against the buffer declarations.
%macro MLP_REMATRIX_CHANNEL 0
|
||||
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
|
||||
index, dest_ch, blockpos, maxchan, mns, \
|
||||
accum, mask, cnt
|
||||
mov mnsd, mnsm ; load matrix_noise_shift
|
||||
movzx blockposq, word blockposm ; load and zero extend blockpos (16bit)
|
||||
mov maxchand, maxchanm ; load maxchan
|
||||
mov maskd, maskm ; load mask
|
||||
%if WIN64
|
||||
mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64)
|
||||
%endif
|
||||
; Turn the channel index into a byte offset (4 bytes per int32 sample).
shl dest_chd, 2
|
||||
; cnt = end pointer for bypassed_lsbs: one 8-byte row per sample.
lea cntq, [blsbs_ptrq + blockposq*8]
|
||||
test mnsd, mnsd ; is matrix_noise_shift != 0?
|
||||
jne .shift ; jump if true
|
||||
cmp maxchand, 4 ; is maxchan < 4?
|
||||
jl .loop4 ; jump if true
|
||||
|
||||
align 16
|
||||
.loop8:
|
||||
; Process 5 or more channels
|
||||
REMATRIX
|
||||
LOOP_END
|
||||
jne .loop8
|
||||
RET
|
||||
|
||||
align 16
|
||||
.loop4:
|
||||
; Process up to 4 channels
|
||||
movdqa xm0, [samplesq]
|
||||
movdqa xm1, [coeffsq ]
|
||||
pshufd xm2, xm0, q2301
|
||||
pshufd xm3, xm1, q2301
|
||||
pmuldq xm0, xm1
|
||||
pmuldq xm3, xm2
|
||||
paddq xm0, xm3
|
||||
LOOP_END
|
||||
jne .loop4
|
||||
RET
|
||||
|
||||
.shift:
|
||||
%if WIN64
|
||||
mov indexd, indexm ; load index (not needed on UNIX64)
|
||||
%endif
|
||||
; r9 was named accum above and is not live yet on this path; every
; DEFINE_ARGS variant below renames it to ausp.
mov r9d, r9m ; load access_unit_size_pow2
|
||||
%if cpuflag(bmi2)
|
||||
; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
|
||||
DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
|
||||
index, dest_ch, accum, index2, mns, \
|
||||
ausp, mask, cnt, noise
|
||||
add mnsd, 7 ; matrix_noise_shift += 7
|
||||
%else ; sse4
|
||||
; r6 held blockpos, which is no longer needed once cnt has been computed.
mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift
|
||||
%if WIN64
|
||||
; r0 = rcx
|
||||
DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
|
||||
index2, accum, ausp, mask, cnt, noise
|
||||
%else ; UNIX64
|
||||
; r3 = rcx
|
||||
DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
|
||||
index2, accum, ausp, mask, cnt, noise
|
||||
%endif
|
||||
; r8 still holds the matrix_noise_shift loaded at function entry; after the
; renaming above, r8 is accum and mns is the register aliased to rcx.
lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7
|
||||
%endif ; cpuflag
|
||||
sub auspd, 1 ; access_unit_size_pow2 -= 1
|
||||
; r7 still holds maxchan here. The lea below reuses r7 as index2; lea does
; not modify flags, so the jl still acts on the result of this cmp.
cmp r7d, 4 ; is maxchan < 4?
|
||||
lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
|
||||
jl .loop4_shift ; jump if maxchan < 4
|
||||
|
||||
align 16
|
||||
.loop8_shift:
|
||||
; Process 5 or more channels
|
||||
REMATRIX
|
||||
LOOP_SHIFT_END
|
||||
jne .loop8_shift
|
||||
RET
|
||||
|
||||
align 16
|
||||
.loop4_shift:
|
||||
; Process up to 4 channels
|
||||
movdqa xm0, [samplesq]
|
||||
movdqa xm1, [coeffsq ]
|
||||
pshufd xm2, xm0, q2301
|
||||
pshufd xm3, xm1, q2301
|
||||
pmuldq xm0, xm1
|
||||
pmuldq xm3, xm2
|
||||
paddq xm0, xm3
|
||||
LOOP_SHIFT_END
|
||||
jne .loop4_shift
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate the function once per supported instruction set.
; First the SSE4 build, using 128-bit registers.
INIT_XMM sse4
|
||||
MLP_REMATRIX_CHANNEL
|
||||
; Then the AVX2 build with the bmi2 cpuflag set (256-bit registers, shlx),
; assembled only when HAVE_AVX2_EXTERNAL is enabled.
%if HAVE_AVX2_EXTERNAL
|
||||
INIT_YMM avx2, bmi2
|
||||
MLP_REMATRIX_CHANNEL
|
||||
%endif
|
||||
|
||||
%endif ; ARCH_X86_64
|
@ -26,6 +26,22 @@
|
||||
#include "libavcodec/mlpdsp.h"
|
||||
#include "libavcodec/mlp.h"
|
||||
|
||||
/* Declare the prototype of the rematrix routine ff_mlp_rematrix_channel_<opt>.
 * The macro is expanded below once for each variant: sse4 and avx2_bmi2. */
#define REMATRIX_CHANNEL_FUNC(opt) \
|
||||
void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
|
||||
const int32_t *coeffs, \
|
||||
const uint8_t *bypassed_lsbs, \
|
||||
const int8_t *noise_buffer, \
|
||||
int index, \
|
||||
unsigned int dest_ch, \
|
||||
uint16_t blockpos, \
|
||||
unsigned int maxchan, \
|
||||
int matrix_noise_shift, \
|
||||
int access_unit_size_pow2, \
|
||||
int32_t mask);
|
||||
|
||||
REMATRIX_CHANNEL_FUNC(sse4)
|
||||
REMATRIX_CHANNEL_FUNC(avx2_bmi2)
|
||||
|
||||
#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
|
||||
|
||||
extern char ff_mlp_firorder_8;
|
||||
@ -178,9 +194,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
|
||||
|
||||
/* Install the x86-specific implementations into the MLP DSP context.
 * The CPU flags are queried once. Each function pointer is overridden only
 * when its condition holds; the avx2_bmi2 assignment comes after the sse4
 * one, so it takes precedence when both conditions are true.
 *
 * NOTE(review): the HAVE_7REGS && HAVE_INLINE_ASM && ... guard appears twice
 * below (before and after the cpu_flags declaration) with a single #endif.
 * This looks like the old and new diff lines shown together - confirm
 * against the actual file. */
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
|
||||
{
|
||||
#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
|
||||
if (INLINE_MMX(cpu_flags))
|
||||
c->mlp_filter_channel = mlp_filter_channel_x86;
|
||||
#endif
|
||||
/* The rematrix routines are only selected on x86-64 (ARCH_X86_64). */
if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
|
||||
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
|
||||
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
|
||||
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user