From 165c7c420d611bfa16d999f2033619c542961926 Mon Sep 17 00:00:00 2001 From: Vitor Sessak Date: Sun, 22 May 2011 12:04:33 +0200 Subject: [PATCH 1/4] Fix dct32() compilation with --disable-yasm Signed-off-by: Ronald S. Bultje --- libavcodec/x86/fft.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c index 8eef4214a2..899f0f7ad5 100644 --- a/libavcodec/x86/fft.c +++ b/libavcodec/x86/fft.c @@ -56,11 +56,13 @@ av_cold void ff_fft_init_mmx(FFTContext *s) #if CONFIG_DCT av_cold void ff_dct_init_mmx(DCTContext *s) { +#if HAVE_YASM int has_vectors = av_get_cpu_flags(); if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX) s->dct32 = ff_dct32_float_avx; else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) s->dct32 = ff_dct32_float_sse; +#endif } #endif From 422b2362fc83ed3a75532ea68a6d167c52f447ec Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Sat, 21 May 2011 23:36:23 +0200 Subject: [PATCH 2/4] dct32_sse: eliminate some spills 125->104 cycles on penryn (x86_64 only) --- libavcodec/x86/dct32_sse.asm | 203 +++++++++++++++++++++++++--------- libavcodec/x86/fmtconvert.asm | 13 +-- libavcodec/x86/x86util.asm | 20 ++++ 3 files changed, 176 insertions(+), 60 deletions(-) diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm index 2e1176cd84..bafe00289d 100644 --- a/libavcodec/x86/dct32_sse.asm +++ b/libavcodec/x86/dct32_sse.asm @@ -20,7 +20,7 @@ ;****************************************************************************** %include "x86inc.asm" -%include "config.asm" +%include "x86util.asm" SECTION_RODATA 32 @@ -37,8 +37,9 @@ ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 dd 1.000000, 1.000000, 1.306563, 0.541196 dd 1.000000, 0.707107, 1.000000, -0.707107 dd 1.000000, 0.707107, 1.000000, -0.707107 + dd 0.707107, 0.707107, 0.707107, 0.707107 - +align 32 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 %macro BUTTERFLY_SSE 4 @@ -77,6 +78,18 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 
0, 0, 0x80000000, 0x80000000 BUTTERFLY0 %1, %2, %3, %4, 0xb1 %endmacro +%macro BUTTERFLY3V 5 + movaps m%5, m%1 + addps m%1, m%2 + subps m%5, m%2 + SWAP %2, %5 + mulps m%2, [ps_cos_vec+192] + movaps m%5, m%3 + addps m%3, m%4 + subps m%4, m%5 + mulps m%4, [ps_cos_vec+192] +%endmacro + %macro PASS6_AND_PERMUTE 0 mov tmpd, [outq+4] movss m7, [outq+72] @@ -269,9 +282,131 @@ INIT_XMM %define BUTTERFLY BUTTERFLY_SSE %define BUTTERFLY0 BUTTERFLY0_SSE +%ifdef ARCH_X86_64 +%define SPILL SWAP +%define UNSPILL SWAP + +%macro PASS5 0 + nop ; FIXME code alignment + SWAP 5, 8 + SWAP 4, 12 + SWAP 6, 14 + SWAP 7, 13 + SWAP 0, 15 + PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13 + TRANSPOSE4x4PS 8, 9, 10, 11, 0 + BUTTERFLY3V 8, 9, 10, 11, 0 + addps m10, m11 + TRANSPOSE4x4PS 12, 13, 14, 15, 0 + BUTTERFLY3V 12, 13, 14, 15, 0 + addps m14, m15 + addps m12, m14 + addps m14, m13 + addps m13, m15 +%endmacro + +%macro PASS6 0 + SWAP 9, 12 + SWAP 11, 14 + movss [outq+0x00], m8 + pshuflw m0, m8, 0xe + movss [outq+0x10], m9 + pshuflw m1, m9, 0xe + movss [outq+0x20], m10 + pshuflw m2, m10, 0xe + movss [outq+0x30], m11 + pshuflw m3, m11, 0xe + movss [outq+0x40], m12 + pshuflw m4, m12, 0xe + movss [outq+0x50], m13 + pshuflw m5, m13, 0xe + movss [outq+0x60], m14 + pshuflw m6, m14, 0xe + movaps [outq+0x70], m15 + pshuflw m7, m15, 0xe + addss m0, m1 + addss m1, m2 + movss [outq+0x08], m0 + addss m2, m3 + movss [outq+0x18], m1 + addss m3, m4 + movss [outq+0x28], m2 + addss m4, m5 + movss [outq+0x38], m3 + addss m5, m6 + movss [outq+0x48], m4 + addss m6, m7 + movss [outq+0x58], m5 + movss [outq+0x68], m6 + movss [outq+0x78], m7 + + PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7 + movhlps m0, m1 + pshufd m1, m1, 3 + SWAP 0, 2, 4, 6, 8, 10, 12, 14 + SWAP 1, 3, 5, 7, 9, 11, 13, 15 +%rep 7 + movhlps m0, m1 + pshufd m1, m1, 3 + addss m15, m1 + SWAP 0, 2, 4, 6, 8, 10, 12, 14 + SWAP 1, 3, 5, 7, 9, 11, 13, 15 +%endrep +%assign i 4 +%rep 15 + addss m0, m1 + movss [outq+i], m0 + SWAP 0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + %assign i i+8 +%endrep +%endmacro + +%else ; ARCH_X86_32 +%macro SPILL 2 ; xmm#, mempos + movaps [outq+(%2-8)*16], m%1 +%endmacro +%macro UNSPILL 2 + movaps m%1, [outq+(%2-8)*16] +%endmacro + +%define PASS6 PASS6_AND_PERMUTE +%macro PASS5 0 + movaps m2, [ps_cos_vec+160] + shufps m3, m3, 0xcc + + BUTTERFLY3 m5, m3, m2, m1 + SPILL 5, 8 + + UNSPILL 1, 9 + BUTTERFLY3 m1, m3, m2, m5 + SPILL 1, 14 + + BUTTERFLY3 m4, m3, m2, m5 + SPILL 4, 12 + + BUTTERFLY3 m7, m3, m2, m5 + SPILL 7, 13 + + UNSPILL 5, 10 + BUTTERFLY3 m5, m3, m2, m7 + SPILL 5, 10 + + UNSPILL 4, 11 + BUTTERFLY3 m4, m3, m2, m7 + SPILL 4, 11 + + BUTTERFLY3 m6, m3, m2, m7 + SPILL 6, 9 + + BUTTERFLY3 m0, m3, m2, m7 + SPILL 0, 15 +%endmacro +%endif + + INIT_XMM ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) -cglobal dct32_float_sse, 2,3,8, out, in, tmp +cglobal dct32_float_sse, 2,3,16, out, in, tmp ; pass 1 movaps m0, [inq+0] @@ -287,8 +422,8 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp ; pass 2 movaps m2, [ps_cos_vec+64] BUTTERFLY m1, m4, m2, m3 - movaps [outq+48], m1 - movaps [outq+ 0], m4 + SPILL 1, 11 + SPILL 4, 8 ; pass 1 movaps m1, [inq+16] @@ -313,17 +448,17 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp movaps m2, [ps_cos_vec+96] shufps m1, m1, 0x1b BUTTERFLY m0, m1, m2, m3 - movaps [outq+112], m0 - movaps [outq+ 96], m1 + SPILL 0, 15 + SPILL 1, 14 - movaps m0, [outq+0] + UNSPILL 0, 8 shufps m5, m5, 0x1b BUTTERFLY m0, m5, m2, m3 - movaps m1, [outq+48] + UNSPILL 1, 11 shufps m6, m6, 0x1b BUTTERFLY m1, m6, m2, m3 - movaps [outq+48], m1 + SPILL 1, 11 shufps m4, m4, 0x1b BUTTERFLY m7, m4, m2, m3 @@ -335,57 +470,25 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp BUTTERFLY2 m5, m3, m2, m1 BUTTERFLY2 m0, m3, m2, m1 - movaps [outq+16], m0 + SPILL 0, 9 BUTTERFLY2 m6, m3, m2, m1 - movaps [outq+32], m6 + SPILL 6, 10 - movaps m0, [outq+48] + UNSPILL 0, 11 BUTTERFLY2 m0, m3, m2, m1 - movaps [outq+48], m0 + SPILL 0, 11 BUTTERFLY2 m4, m3, m2, m1 
BUTTERFLY2 m7, m3, m2, m1 - movaps m6, [outq+96] + UNSPILL 6, 14 BUTTERFLY2 m6, m3, m2, m1 - movaps m0, [outq+112] + UNSPILL 0, 15 BUTTERFLY2 m0, m3, m2, m1 - ; pass 5 - movaps m2, [ps_cos_vec+160] - shufps m3, m3, 0xcc - - BUTTERFLY3 m5, m3, m2, m1 - movaps [outq+0], m5 - - movaps m1, [outq+16] - BUTTERFLY3 m1, m3, m2, m5 - movaps [outq+96], m1 - - BUTTERFLY3 m4, m3, m2, m5 - movaps [outq+64], m4 - - BUTTERFLY3 m7, m3, m2, m5 - movaps [outq+80], m7 - - movaps m5, [outq+32] - BUTTERFLY3 m5, m3, m2, m7 - movaps [outq+32], m5 - - movaps m4, [outq+48] - BUTTERFLY3 m4, m3, m2, m7 - movaps [outq+48], m4 - - BUTTERFLY3 m6, m3, m2, m7 - movaps [outq+16], m6 - - BUTTERFLY3 m0, m3, m2, m7 - movaps [outq+112], m0 - - - ; pass 6, no SIMD... - PASS6_AND_PERMUTE + PASS5 + PASS6 RET diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm index 13d6cc0130..efab87d570 100644 --- a/libavcodec/x86/fmtconvert.asm +++ b/libavcodec/x86/fmtconvert.asm @@ -95,13 +95,6 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2 ; void ff_float_interleave6(float *dst, const float **src, unsigned int len); ;----------------------------------------------------------------------------- -%macro BUTTERFLYPS 3 - movaps m%3, m%1 - unpcklps m%1, m%2 - unpckhps m%3, m%2 - SWAP %2, %3 -%endmacro - %macro FLOAT_INTERLEAVE6 2 cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 %ifdef ARCH_X86_64 @@ -130,9 +123,9 @@ cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 movaps m4, [srcq+src4q] movaps m5, [srcq+src5q] - BUTTERFLYPS 0, 1, 6 - BUTTERFLYPS 2, 3, 6 - BUTTERFLYPS 4, 5, 6 + SBUTTERFLYPS 0, 1, 6 + SBUTTERFLYPS 2, 3, 6 + SBUTTERFLYPS 4, 5, 6 movaps m6, m4 shufps m4, m0, 0xe4 diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm index 7bd985a33b..141e96000c 100644 --- a/libavcodec/x86/x86util.asm +++ b/libavcodec/x86/x86util.asm @@ -41,6 +41,13 @@ SWAP %2, %4, %3 %endmacro +%macro SBUTTERFLYPS 3 + movaps m%3, m%1 + unpcklps m%1, m%2 + 
unpckhps m%3, m%2 + SWAP %2, %3 +%endmacro + %macro TRANSPOSE4x4B 5 SBUTTERFLY bw, %1, %2, %5 SBUTTERFLY bw, %3, %4, %5 @@ -74,6 +81,19 @@ SWAP %2, %3 %endmacro +; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops +%macro TRANSPOSE4x4PS 5 + SBUTTERFLYPS %1, %2, %5 + SBUTTERFLYPS %3, %4, %5 + movaps m%5, m%1 + movlhps m%1, m%3 + movhlps m%3, m%5 + movaps m%5, m%2 + movlhps m%2, m%4 + movhlps m%4, m%5 + SWAP %2, %3 +%endmacro + %macro TRANSPOSE8x8W 9-11 %ifdef ARCH_X86_64 SBUTTERFLY wd, %1, %2, %9 From 8089b7fa8c5b5a48cc7101daa4be891d0ead5a5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 22 May 2011 21:34:49 +0300 Subject: [PATCH 3/4] avoptions: Check the return value from av_get_number MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This avoids doing a division by zero if the option wasn't found, or wasn't an option of an appropriate type. Signed-off-by: Martin Storsjö --- libavutil/opt.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libavutil/opt.c b/libavutil/opt.c index 57e3248a74..9e06b01c52 100644 --- a/libavutil/opt.c +++ b/libavutil/opt.c @@ -290,7 +290,8 @@ double av_get_double(void *obj, const char *name, const AVOption **o_out) double num=1; int den=1; - av_get_number(obj, name, o_out, &num, &den, &intnum); + if (av_get_number(obj, name, o_out, &num, &den, &intnum) < 0) + return -1; return num*intnum/den; } @@ -300,7 +301,8 @@ AVRational av_get_q(void *obj, const char *name, const AVOption **o_out) double num=1; int den=1; - av_get_number(obj, name, o_out, &num, &den, &intnum); + if (av_get_number(obj, name, o_out, &num, &den, &intnum) < 0) + return (AVRational){-1, 0}; if (num == 1.0 && (int)intnum == intnum) return (AVRational){intnum, den}; else @@ -313,7 +315,8 @@ int64_t av_get_int(void *obj, const char *name, const AVOption **o_out) double num=1; int den=1; - av_get_number(obj, name, o_out, &num, &den, &intnum); + if (av_get_number(obj, 
name, o_out, &num, &den, &intnum) < 0) + return -1; return num*intnum/den; } From a121754852a69b4879a39ba78863404c13c54f61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 22 May 2011 14:40:33 +0300 Subject: [PATCH 4/4] ffmpeg: Don't trigger url_interrupt_cb on the first signal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the url_interrupt_cb callback will abort all IO after the first received signal. This makes the output files from e.g. the mov muxer unreadable if the transcode is aborted with ctrl+c. After this patch, the first signal cleanly breaks out of the transcoding loop, but won't forcibly abort all IO. After the second signal is received, the url_interrupt_cb callback will abort all IO. Signed-off-by: Martin Storsjö --- ffmpeg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ffmpeg.c b/ffmpeg.c index 0c9545172f..86732535c8 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -426,11 +426,13 @@ static void term_exit(void) } static volatile int received_sigterm = 0; +static volatile int received_nb_signals = 0; static void sigterm_handler(int sig) { received_sigterm = sig; + received_nb_signals++; term_exit(); } @@ -445,7 +447,7 @@ static void term_init(void) static int decode_interrupt_cb(void) { - return received_sigterm; + return received_nb_signals > 1; } static int ffmpeg_exit(int ret)