Add apply_window_int16() to DSPContext with x86-optimized versions and use it
in the ac3_fixed encoder.
This commit is contained in:
parent
e971d81364
commit
e6e9823488
@ -167,7 +167,7 @@ static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
|
||||
static void mdct512(AC3MDCTContext *mdct, CoefType *out, SampleType *in);
|
||||
|
||||
static void apply_window(DSPContext *dsp, SampleType *output, const SampleType *input,
|
||||
const SampleType *window, int n);
|
||||
const SampleType *window, unsigned int len);
|
||||
|
||||
static int normalize_samples(AC3EncodeContext *s);
|
||||
|
||||
|
@ -252,15 +252,9 @@ static void mdct512(AC3MDCTContext *mdct, int32_t *out, int16_t *in)
|
||||
* Apply KBD window to input samples prior to MDCT.
|
||||
*/
|
||||
static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
|
||||
const int16_t *window, int n)
|
||||
const int16_t *window, unsigned int len)
|
||||
{
|
||||
int i;
|
||||
int n2 = n >> 1;
|
||||
|
||||
for (i = 0; i < n2; i++) {
|
||||
output[i] = MUL16(input[i], window[i]) >> 15;
|
||||
output[n-i-1] = MUL16(input[n-i-1], window[i]) >> 15;
|
||||
}
|
||||
dsp->apply_window_int16(output, input, window, len);
|
||||
}
|
||||
|
||||
|
||||
|
@ -83,9 +83,9 @@ static void mdct512(AC3MDCTContext *mdct, float *out, float *in)
|
||||
* Apply KBD window to input samples prior to MDCT.
|
||||
*/
|
||||
static void apply_window(DSPContext *dsp, float *output, const float *input,
|
||||
const float *window, int n)
|
||||
const float *window, unsigned int len)
|
||||
{
|
||||
dsp->vector_fmul(output, input, window, n);
|
||||
dsp->vector_fmul(output, input, window, len);
|
||||
}
|
||||
|
||||
|
||||
|
@ -141,7 +141,7 @@ const uint8_t ff_ac3_rematrix_band_tab[5] = { 13, 25, 37, 61, 253 };
|
||||
/* AC-3 MDCT window */
|
||||
|
||||
/* MDCT window */
|
||||
const int16_t ff_ac3_window[AC3_WINDOW_SIZE/2] = {
|
||||
DECLARE_ALIGNED(16, const int16_t, ff_ac3_window)[AC3_WINDOW_SIZE/2] = {
|
||||
4, 7, 12, 16, 21, 28, 34, 42,
|
||||
51, 61, 72, 84, 97, 111, 127, 145,
|
||||
164, 184, 207, 231, 257, 285, 315, 347,
|
||||
|
@ -3890,6 +3890,19 @@ static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, co
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
 * Apply a symmetric window to 16-bit samples (plain C version).
 *
 * Only the first len/2 window coefficients are read: coefficient k scales
 * both input[k] and its mirror input[len-1-k]. Each product has 1 << 14
 * added before the right shift by 15, so results are rounded rather than
 * truncated.
 *
 * @param output destination array, len elements
 * @param input  source array, len elements
 * @param window window coefficients, at least len/2 elements
 * @param len    full window length
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int half = len >> 1;
    int lo;

    for (lo = 0; lo < half; lo++) {
        unsigned int hi = len - lo - 1; /* mirrored index in the second half */
        int16_t coef    = window[lo];

        output[lo] = (MUL16(input[lo], coef) + (1 << 14)) >> 15;
        output[hi] = (MUL16(input[hi], coef) + (1 << 14)) >> 15;
    }
}
|
||||
|
||||
#define W0 2048
|
||||
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
|
||||
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
|
||||
@ -4364,6 +4377,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
||||
c->vector_clipf = vector_clipf_c;
|
||||
c->scalarproduct_int16 = scalarproduct_int16_c;
|
||||
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
|
||||
c->apply_window_int16 = apply_window_int16_c;
|
||||
c->scalarproduct_float = scalarproduct_float_c;
|
||||
c->butterflies_float = butterflies_float_c;
|
||||
c->vector_fmul_scalar = vector_fmul_scalar_c;
|
||||
|
@ -524,6 +524,20 @@ typedef struct DSPContext {
|
||||
*/
|
||||
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
|
||||
|
||||
/**
|
||||
* Apply symmetric window in 16-bit fixed-point.
|
||||
* @param output destination array
|
||||
* constraints: 16-byte aligned
|
||||
* @param input source array
|
||||
* constraints: 16-byte aligned
|
||||
* @param window window array
|
||||
* constraints: 16-byte aligned, at least len/2 elements
|
||||
* @param len full window length
|
||||
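* constraints: greater than zero; presumably a multiple of 16, since the SIMD versions consume one full vector from each half per iteration (TODO confirm)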
|
||||
*/
|
||||
void (*apply_window_int16)(int16_t *output, const int16_t *input,
|
||||
const int16_t *window, unsigned int len);
|
||||
|
||||
/* rv30 functions */
|
||||
qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
|
||||
qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
|
||||
|
@ -2388,6 +2388,20 @@ int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int or
|
||||
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
|
||||
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
|
||||
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
|
||||
|
||||
/*
 * Assembly implementations of DSPContext.apply_window_int16.
 * All variants share the signature of the C version. The "_ba" variants are
 * the ones dsputil_init_mmx() selects when CODEC_FLAG_BITEXACT is set; the
 * "_atom" variant is selected when AV_CPU_FLAG_ATOM is reported together
 * with SSSE3.
 */
void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
|
||||
const int16_t *window, unsigned int len);
|
||||
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
|
||||
const int16_t *window, unsigned int len);
|
||||
void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
|
||||
const int16_t *window, unsigned int len);
|
||||
void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
|
||||
const int16_t *window, unsigned int len);
|
||||
void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
|
||||
const int16_t *window, unsigned int len);
|
||||
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
|
||||
const int16_t *window, unsigned int len);
|
||||
|
||||
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
|
||||
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
|
||||
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
|
||||
@ -2749,6 +2763,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
#if HAVE_YASM
|
||||
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
|
||||
if (avctx->flags & CODEC_FLAG_BITEXACT) {
|
||||
c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
|
||||
} else {
|
||||
c->apply_window_int16 = ff_apply_window_int16_mmxext;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if(mm_flags & AV_CPU_FLAG_SSE){
|
||||
@ -2771,13 +2790,30 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
#if HAVE_YASM
|
||||
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
|
||||
if (avctx->flags & CODEC_FLAG_BITEXACT) {
|
||||
c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
|
||||
} else {
|
||||
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
|
||||
c->apply_window_int16 = ff_apply_window_int16_sse2;
|
||||
}
|
||||
}
|
||||
|
||||
c->emulated_edge_mc = emulated_edge_mc_sse;
|
||||
c->gmc= gmc_sse;
|
||||
#endif
|
||||
}
|
||||
if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
|
||||
if (mm_flags & AV_CPU_FLAG_SSSE3) {
|
||||
#if HAVE_YASM
|
||||
if (mm_flags & AV_CPU_FLAG_ATOM) {
|
||||
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
|
||||
} else {
|
||||
c->apply_window_int16 = ff_apply_window_int16_ssse3;
|
||||
}
|
||||
if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (CONFIG_ENCODERS)
|
||||
|
@ -27,6 +27,8 @@ pb_zzzzzzzz77777777: times 8 db -1
|
||||
pb_7: times 8 db 7
|
||||
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
|
||||
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
|
||||
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
|
||||
pd_16384: times 4 dd 16384
|
||||
|
||||
section .text align=16
|
||||
|
||||
@ -202,6 +204,130 @@ SCALARPRODUCT_LOOP 0
|
||||
RET
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
|
||||
; const int16_t *window, unsigned int len)
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
; Reverse the order of the four 16-bit words held in register %1 (in place).
; The macro accepts an optional second argument so that it can be invoked the
; same way as the pshufb-based variant; that argument is not used here.
%macro REVERSE_WORDS_MMXEXT 1-2
|
||||
; 0x1B = 00 01 10 11b: destination words 0,1,2,3 take source words 3,2,1,0
pshufw %1, %1, 0x1B
|
||||
%endmacro
|
||||
|
||||
; Reverse the order of the eight 16-bit words held in register %1 (in place)
; using three shuffles. The optional second argument is accepted but unused.
%macro REVERSE_WORDS_SSE2 1-2
|
||||
; reverse the four words of the low 64 bits
pshuflw %1, %1, 0x1B
|
||||
; reverse the four words of the high 64 bits
pshufhw %1, %1, 0x1B
|
||||
; 0x4E = 01 00 11 10b: swap the low and high 64-bit halves
pshufd %1, %1, 0x4E
|
||||
%endmacro
|
||||
|
||||
; Reverse the 16-bit words of register %1 (in place) with a single byte
; shuffle. %2 must hold the shuffle control mask; APPLY_WINDOW_INT16 passes
; m5, which it loads from pb_revwords for the ssse3 instantiation.
%macro REVERSE_WORDS_SSSE3 2
|
||||
pshufb %1, %2
|
||||
%endmacro
|
||||
|
||||
; dst = (dst * src) >> 15
|
||||
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
|
||||
; in from the pmullw result.
|
||||
; No rounding term is added, so results are truncated.
; %1 = dst (overwritten with the result), %2 = src (left unchanged),
; %3 = temp (clobbered).
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
|
||||
; keep a copy of dst for the low-half multiply
mova %3, %1
|
||||
; %1 = bits 16..31 of each signed 16x16 product
pmulhw %1, %2
|
||||
; %3 = bits 0..15 of each product
pmullw %3, %2
|
||||
; isolate bit 15 of the product in bit 0 of %3
psrlw %3, 15
|
||||
; move bits 16..30 up by one position, freeing bit 0
psllw %1, 1
|
||||
; merge: %1 now holds bits 15..30 of the product
por %1, %3
|
||||
%endmacro
|
||||
|
||||
; dst = ((dst * src) + (1<<14)) >> 15
|
||||
; Rounding counterpart of MUL16FIXED_MMXEXT built on a single pmulhrsw.
; The third argument exists only to keep the argument list identical and is
; ignored.
; NOTE(review): the has_ssse3 branch of APPLY_WINDOW_INT16 issues pmulhrsw
; directly, so this macro looks unreferenced in the visible code - confirm.
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
|
||||
pmulhrsw %1, %2
|
||||
%endmacro
|
||||
|
||||
; Emit one apply_window_int16_<%1> function.
;
; Each loop iteration processes one vector of samples from the first half of
; the buffer (addressed by offset2) and one vector from the second half
; (addressed by offset). The same window vector is used for both, word-reversed
; for the second half, because only the first half of the symmetric window is
; stored.
;
; Relies on REVERSE_WORDS (and, in the non-rounding branch, MUL16FIXED) being
; %define'd by the caller before the macro is instantiated.
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
|
||||
; The 4th argument (len in the C prototype) is named "offset" and is used
; directly as a byte offset: with 2-byte samples it addresses element len/2,
; i.e. the start of the second half. offset2 is an extra scratch register.
; NOTE(review): "4,5,6" presumably follows the x86inc convention of
; args, general-purpose registers, vector registers - confirm.
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
|
||||
; offset2 = byte offset of the last vector of the first half
lea offset2q, [offsetq-mmsize]
|
||||
%if %2
|
||||
; bit-exact variant: m5 = rounding constant 16384 in every dword
mova m5, [pd_16384]
|
||||
%elifidn %1, ssse3
|
||||
; ssse3 variant: m5 = pshufb mask consumed by REVERSE_WORDS
mova m5, [pb_revwords]
|
||||
ALIGN 16
|
||||
%endif
|
||||
.loop:
|
||||
%if %2
|
||||
; This version expands 16-bit to 32-bit, multiplies by the window,
|
||||
; adds 16384 for rounding, right shifts 15, then repacks back to words to
|
||||
; save to the output. The window is reversed for the second half.
|
||||
; --- first half ---
mova m3, [windowq+offset2q]
|
||||
mova m4, [ inputq+offset2q]
|
||||
; Unpacking onto a zeroed register puts each window word in the upper half
; of a dword with a zero lower half.
pxor m0, m0
|
||||
punpcklwd m0, m3
|
||||
; m1 is not cleared first: whatever ends up in the lower word of each dword
; is multiplied by the zero lower word of m0 in pmaddwd and contributes
; nothing to the sum.
punpcklwd m1, m4
|
||||
pmaddwd m0, m1
|
||||
paddd m0, m5
|
||||
psrad m0, 15
|
||||
; same again for the upper words of the vectors
pxor m2, m2
|
||||
punpckhwd m2, m3
|
||||
punpckhwd m1, m4
|
||||
pmaddwd m2, m1
|
||||
paddd m2, m5
|
||||
psrad m2, 15
|
||||
; repack the two dword results to words with signed saturation
packssdw m0, m2
|
||||
mova [outputq+offset2q], m0
|
||||
; --- second half: same window vector, word order reversed ---
REVERSE_WORDS m3
|
||||
mova m4, [ inputq+offsetq]
|
||||
pxor m0, m0
|
||||
punpcklwd m0, m3
|
||||
punpcklwd m1, m4
|
||||
pmaddwd m0, m1
|
||||
paddd m0, m5
|
||||
psrad m0, 15
|
||||
pxor m2, m2
|
||||
punpckhwd m2, m3
|
||||
punpckhwd m1, m4
|
||||
pmaddwd m2, m1
|
||||
paddd m2, m5
|
||||
psrad m2, 15
|
||||
packssdw m0, m2
|
||||
mova [outputq+offsetq], m0
|
||||
%elif %3
|
||||
; This version does the 16x16->16 multiplication in-place without expanding
|
||||
; to 32-bit. The ssse3 version is bit-identical.
|
||||
mova m0, [windowq+offset2q]
|
||||
mova m1, [ inputq+offset2q]
|
||||
; first half: rounded multiply by the window as loaded
pmulhrsw m1, m0
|
||||
; m5 is only consumed when REVERSE_WORDS is the pshufb-based variant; the
; shuffle-based variants accept and ignore the second argument.
REVERSE_WORDS m0, m5
|
||||
; second half: rounded multiply by the reversed window, input read from memory
pmulhrsw m0, [ inputq+offsetq ]
|
||||
mova [outputq+offset2q], m1
|
||||
mova [outputq+offsetq ], m0
|
||||
%else
|
||||
; This version does the 16x16->16 multiplication in-place without expanding
|
||||
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
|
||||
; therefore are not bit-identical to the C version.
|
||||
mova m0, [windowq+offset2q]
|
||||
mova m1, [ inputq+offset2q]
|
||||
mova m2, [ inputq+offsetq ]
|
||||
; m3 is the scratch register required by MUL16FIXED
MUL16FIXED m1, m0, m3
|
||||
REVERSE_WORDS m0
|
||||
MUL16FIXED m2, m0, m3
|
||||
mova [outputq+offset2q], m1
|
||||
mova [outputq+offsetq ], m2
|
||||
%endif
|
||||
; advance the second-half offset, step the first-half offset backwards
add offsetd, mmsize
|
||||
; sub sets the carry flag once offset2 would drop below zero; jae (CF=0)
; therefore keeps looping until the first half has been consumed
sub offset2d, mmsize
|
||||
jae .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate every apply_window_int16 variant. REVERSE_WORDS and MUL16FIXED
; are re-%define'd between instantiations, so the order of these lines matters.

; MMX-register variants: truncating (mmxext) and rounding/bit-exact (mmxext_ba)
INIT_MMX
|
||||
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
|
||||
%define MUL16FIXED MUL16FIXED_MMXEXT
|
||||
APPLY_WINDOW_INT16 mmxext, 0, 0
|
||||
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
|
||||
; XMM-register variants. MUL16FIXED is left pointing at MUL16FIXED_MMXEXT,
; which is written with the register-size-neutral mova.
INIT_XMM
|
||||
%define REVERSE_WORDS REVERSE_WORDS_SSE2
|
||||
APPLY_WINDOW_INT16 sse2, 0, 0
|
||||
APPLY_WINDOW_INT16 sse2_ba, 1, 0
|
||||
; ssse3_atom: pmulhrsw multiply, but still the shuffle-based word reversal
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
|
||||
; ssse3: pmulhrsw multiply and pshufb word reversal
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
|
||||
APPLY_WINDOW_INT16 ssse3, 0, 1
|
||||
|
||||
|
||||
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
|
||||
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
|
||||
|
@ -1,2 +1,2 @@
|
||||
b3a8f0a8809a58b2ece90744f06fff96 *./tests/data/acodec/ac3.rm
|
||||
346073c97eada69330f61e103a170ca1 *./tests/data/acodec/ac3.rm
|
||||
98751 ./tests/data/acodec/ac3.rm
|
||||
|
@ -1,2 +1,2 @@
|
||||
7da378131db880bcf2e58305d54418ec *./tests/data/lavf/lavf.rm
|
||||
7b7ede9548a09346675edad36acfbf19 *./tests/data/lavf/lavf.rm
|
||||
346706 ./tests/data/lavf/lavf.rm
|
||||
|
@ -1,45 +1,35 @@
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret: 0 st:-1 flags:0 ts:-1.000000
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret: 0 st:-1 flags:1 ts: 1.894167
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret: 0 st: 0 flags:0 ts: 0.788000
|
||||
ret: 0 st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos: 5822 size: 916
|
||||
ret:-1 st:-1 flags:1 ts: 1.894167
|
||||
ret:-1 st: 0 flags:0 ts: 0.788000
|
||||
ret: 0 st: 0 flags:1 ts:-0.317000
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret: 0 st:-1 flags:0 ts: 2.576668
|
||||
ret: 0 st: 0 flags:1 dts:524.800000 pts:524.800000 pos: 6155 size: 244
|
||||
ret:-1 st:-1 flags:0 ts: 2.576668
|
||||
ret:-1 st:-1 flags:1 ts: 1.470835
|
||||
ret: 0 st: 0 flags:0 ts: 0.365000
|
||||
ret: 0 st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos: 5822 size: 916
|
||||
ret:-1 st: 0 flags:0 ts: 0.365000
|
||||
ret: 0 st: 0 flags:1 ts:-0.741000
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret:-1 st:-1 flags:0 ts: 2.153336
|
||||
ret: 0 st:-1 flags:1 ts: 1.047503
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret:-1 st:-1 flags:1 ts: 1.047503
|
||||
ret: 0 st: 0 flags:0 ts:-0.058000
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret: 0 st: 0 flags:1 ts: 2.836000
|
||||
ret: 0 st: 0 flags:1 dts: 2.681000 pts: 2.681000 pos: 44105 size: 558
|
||||
ret:-1 st: 0 flags:1 ts: 2.836000
|
||||
ret:-1 st:-1 flags:0 ts: 1.730004
|
||||
ret: 0 st:-1 flags:1 ts: 0.624171
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret:-1 st:-1 flags:1 ts: 0.624171
|
||||
ret: 0 st: 0 flags:0 ts:-0.482000
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret:-1 st: 0 flags:1 ts: 2.413000
|
||||
ret:-1 st:-1 flags:0 ts: 1.306672
|
||||
ret: 0 st:-1 flags:1 ts: 0.200839
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret:-1 st:-1 flags:1 ts: 0.200839
|
||||
ret: 0 st: 0 flags:0 ts:-0.905000
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret:-1 st: 0 flags:1 ts: 1.989000
|
||||
ret: 0 st:-1 flags:0 ts: 0.883340
|
||||
ret: 0 st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos: 5822 size: 916
|
||||
ret:-1 st:-1 flags:0 ts: 0.883340
|
||||
ret: 0 st:-1 flags:1 ts:-0.222493
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
ret:-1 st: 0 flags:0 ts: 2.672000
|
||||
ret:-1 st: 0 flags:1 ts: 1.566000
|
||||
ret: 0 st:-1 flags:0 ts: 0.460008
|
||||
ret: 0 st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos: 5822 size: 916
|
||||
ret:-1 st:-1 flags:0 ts: 0.460008
|
||||
ret: 0 st:-1 flags:1 ts:-0.645825
|
||||
ret: 0 st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos: 271 size: 556
|
||||
|
Loading…
Reference in New Issue
Block a user