From 45eaac02cbb8eb251945d8769a9c81254969528a Mon Sep 17 00:00:00 2001 From: Derek Buitenhuis Date: Wed, 1 Aug 2012 17:46:53 +0000 Subject: [PATCH 1/9] Canopus Lossless decoder At the moment it only does BGR24, but I plan to add the rest after. Signed-off-by: Derek Buitenhuis --- Changelog | 1 + doc/general.texi | 1 + libavcodec/Makefile | 1 + libavcodec/allcodecs.c | 1 + libavcodec/avcodec.h | 1 + libavcodec/cllc.c | 284 +++++++++++++++++++++++++++++++++++++++++ libavcodec/version.h | 4 +- libavformat/riff.c | 1 + 8 files changed, 292 insertions(+), 2 deletions(-) create mode 100644 libavcodec/cllc.c diff --git a/Changelog b/Changelog index ea2c353a15..64c23d3170 100644 --- a/Changelog +++ b/Changelog @@ -41,6 +41,7 @@ version : - G.723.1 demuxer and decoder - RTMPE protocol support - RTMPTE protocol support +- Canopus Lossless Codec decoder version 0.8: diff --git a/doc/general.texi b/doc/general.texi index 05855542bf..a14e888e33 100644 --- a/doc/general.texi +++ b/doc/general.texi @@ -468,6 +468,7 @@ following image formats are supported: @item Delphine Software International CIN video @tab @tab X @tab Codec used in Delphine Software International games. 
@item Discworld II BMV Video @tab @tab X +@item Canopus Lossless Codec @tab @tab X @item Cinepak @tab @tab X @item Cirrus Logic AccuPak @tab X @tab X @tab fourcc: CLJR diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 7fc50594ff..17bc364e6b 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -116,6 +116,7 @@ OBJS-$(CONFIG_CDXL_DECODER) += cdxl.o OBJS-$(CONFIG_CINEPAK_DECODER) += cinepak.o OBJS-$(CONFIG_CLJR_DECODER) += cljr.o OBJS-$(CONFIG_CLJR_ENCODER) += cljr.o +OBJS-$(CONFIG_CLLC_DECODER) += cllc.o OBJS-$(CONFIG_COOK_DECODER) += cook.o OBJS-$(CONFIG_CSCD_DECODER) += cscd.o OBJS-$(CONFIG_CYUV_DECODER) += cyuv.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 7e7cee6ba2..f85892b34c 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -89,6 +89,7 @@ void avcodec_register_all(void) REGISTER_DECODER (CDXL, cdxl); REGISTER_DECODER (CINEPAK, cinepak); REGISTER_ENCDEC (CLJR, cljr); + REGISTER_DECODER (CLLC, cllc); REGISTER_DECODER (CSCD, cscd); REGISTER_DECODER (CYUV, cyuv); REGISTER_DECODER (DFA, dfa); diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 71f2e7093f..450a9ca723 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -259,6 +259,7 @@ enum CodecID { CODEC_ID_MSA1, CODEC_ID_TSCC2, CODEC_ID_MTS2, + CODEC_ID_CLLC, /* various PCM "codecs" */ CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c new file mode 100644 index 0000000000..aca4669fcb --- /dev/null +++ b/libavcodec/cllc.c @@ -0,0 +1,284 @@ +/* + * Canopus Lossless Codec decoder + * + * Copyright (c) 2012 Derek Buitenhuis + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/intreadwrite.h" +#include "dsputil.h" +#include "get_bits.h" +#include "avcodec.h" + +typedef struct CLLCContext { + DSPContext dsp; + AVCodecContext *avctx; + + uint8_t *swapped_buf; + int swapped_buf_size; +} CLLCContext; + +static int read_code_table(CLLCContext *ctx, GetBitContext *gb, VLC *vlc) +{ + uint8_t symbols[256]; + uint8_t bits[256]; + uint16_t codes[256]; + int num_lens, num_codes, num_codes_sum, prefix; + int i, j, count; + + prefix = 0; + count = 0; + num_codes_sum = 0; + + num_lens = get_bits(gb, 5); + + for (i = 0; i < num_lens; i++) { + num_codes = get_bits(gb, 9); + num_codes_sum += num_codes; + + if (num_codes_sum > 256) { + vlc->table = NULL; + + av_log(ctx->avctx, AV_LOG_ERROR, + "Too many VLCs (%d) to be read.\n", num_codes_sum); + return AVERROR_INVALIDDATA; + } + + for (j = 0; j < num_codes; j++) { + symbols[count] = get_bits(gb, 8); + bits[count] = i + 1; + codes[count] = prefix++; + + count++; + } + + prefix <<= 1; + } + + return ff_init_vlc_sparse(vlc, 7, count, bits, 1, 1, + codes, 2, 2, symbols, 1, 1, 0); +} + +static int read_line(CLLCContext *ctx, GetBitContext *gb, int *top_left, + VLC *vlc, uint8_t *outbuf) +{ + uint8_t *dst; + int pred, code; + int i; + + OPEN_READER(bits, gb); + + dst = outbuf; + pred = *top_left; + + /* Simultaneously read and restore the line */ + for (i = 0; i < ctx->avctx->width; i++) { + UPDATE_CACHE(bits, gb); + GET_VLC(code, bits, gb, vlc->table, 7, 2); + + pred += code; + dst[0] = pred; + dst += 3; + } + + 
CLOSE_READER(bits, gb); + + /* Stash the first pixel */ + *top_left = dst[-3 * ctx->avctx->width]; + + return 0; +} + +static int decode_bgr24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic) +{ + AVCodecContext *avctx = ctx->avctx; + uint8_t *dst; + int pred[3]; + int ret; + int i, j; + VLC vlc[3]; + + pred[0] = 0x80; + pred[1] = 0x80; + pred[2] = 0x80; + + dst = pic->data[0]; + + skip_bits(gb, 16); + + /* Read in code table for each plane */ + for (i = 0; i < 3; i++) { + ret = read_code_table(ctx, gb, &vlc[i]); + if (ret < 0) { + for (j = 0; j <= i; j++) + ff_free_vlc(&vlc[j]); + + av_log(ctx->avctx, AV_LOG_ERROR, + "Could not read code table %d.\n", i); + return ret; + } + } + + /* Read in and restore every line */ + for (i = 0; i < avctx->height; i++) { + for (j = 0; j < 3; j++) + read_line(ctx, gb, &pred[j], &vlc[j], &dst[j]); + + dst += pic->linesize[0]; + } + + for (i = 0; i < 3; i++) + ff_free_vlc(&vlc[i]); + + return 0; +} + +static int cllc_decode_frame(AVCodecContext *avctx, void *data, + int *got_picture_ptr, AVPacket *avpkt) +{ + CLLCContext *ctx = avctx->priv_data; + AVFrame *pic = avctx->coded_frame; + uint8_t *src = avpkt->data; + uint8_t *swapped_buf_new; + uint32_t info_tag, info_offset; + GetBitContext gb; + int coding_type, ret; + + if (pic->data[0]) + avctx->release_buffer(avctx, pic); + + pic->reference = 0; + + /* Make sure our bswap16'd buffer is big enough */ + swapped_buf_new = av_fast_realloc(ctx->swapped_buf, + &ctx->swapped_buf_size, avpkt->size); + if (!swapped_buf_new) { + av_log(avctx, AV_LOG_ERROR, "Could not realloc swapped buffer.\n"); + return AVERROR(ENOMEM); + } + ctx->swapped_buf = swapped_buf_new; + + /* Skip the INFO header if present */ + info_offset = 0; + info_tag = AV_RL32(src); + if (info_tag == MKTAG('I', 'N', 'F', 'O')) { + info_offset = AV_RL32(src + 4); + if (info_offset > UINT32_MAX - 8 || info_offset + 8 > avpkt->size) { + av_log(avctx, AV_LOG_ERROR, + "Invalid INFO header offset: 0x%08X is too large.\n", 
+ info_offset); + return AVERROR_INVALIDDATA; + } + + info_offset += 8; + src += info_offset; + + av_log(avctx, AV_LOG_DEBUG, "Skipping INFO chunk.\n"); + } + + /* bswap16 the buffer since CLLC's bitreader works in 16-bit words */ + ctx->dsp.bswap16_buf((uint16_t *) ctx->swapped_buf, (uint16_t *) src, + (avpkt->size - info_offset) / 2); + + init_get_bits(&gb, ctx->swapped_buf, (avpkt->size - info_offset) * 8); + + /* + * Read in coding type. The types are as follows: + * + * 0 - YUY2 + * 1 - BGR24 (Triples) + * 2 - BGR24 (Quads) + * 3 - BGRA + */ + coding_type = (AV_RL32(src) >> 8) & 0xFF; + av_log(avctx, AV_LOG_DEBUG, "Frame coding type: %d\n", coding_type); + + switch (coding_type) { + case 1: + avctx->pix_fmt = PIX_FMT_RGB24; + avctx->bits_per_raw_sample = 8; + + ret = avctx->get_buffer(avctx, pic); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n"); + return ret; + } + + ret = decode_bgr24_frame(ctx, &gb, pic); + if (ret < 0) + return ret; + + break; + default: + av_log(avctx, AV_LOG_ERROR, "Unknown coding type: %d.\n", coding_type); + return AVERROR_INVALIDDATA; + } + + pic->key_frame = 1; + pic->pict_type = AV_PICTURE_TYPE_I; + + *got_picture_ptr = 1; + *(AVFrame *)data = *pic; + + return avpkt->size; +} + +static av_cold int cllc_decode_close(AVCodecContext *avctx) +{ + CLLCContext *ctx = avctx->priv_data; + + if (avctx->coded_frame->data[0]) + avctx->release_buffer(avctx, avctx->coded_frame); + + av_freep(&avctx->coded_frame); + av_freep(&ctx->swapped_buf); + + return 0; +} + +static av_cold int cllc_decode_init(AVCodecContext *avctx) +{ + CLLCContext *ctx = avctx->priv_data; + + /* Initialize various context values */ + ctx->avctx = avctx; + ctx->swapped_buf = NULL; + ctx->swapped_buf_size = 0; + + ff_dsputil_init(&ctx->dsp, avctx); + + avctx->coded_frame = avcodec_alloc_frame(); + if (!avctx->coded_frame) { + av_log(avctx, AV_LOG_ERROR, "Could not allocate frame.\n"); + return AVERROR(ENOMEM); + } + + return 0; +} + +AVCodec 
ff_cllc_decoder = { + .name = "cllc", + .type = AVMEDIA_TYPE_VIDEO, + .id = CODEC_ID_CLLC, + .priv_data_size = sizeof(CLLCContext), + .init = cllc_decode_init, + .decode = cllc_decode_frame, + .close = cllc_decode_close, + .capabilities = CODEC_CAP_DR1, + .long_name = NULL_IF_CONFIG_SMALL("Canopus Lossless Codec"), +}; diff --git a/libavcodec/version.h b/libavcodec/version.h index acad4d4a2a..24e64e91f0 100644 --- a/libavcodec/version.h +++ b/libavcodec/version.h @@ -27,8 +27,8 @@ */ #define LIBAVCODEC_VERSION_MAJOR 54 -#define LIBAVCODEC_VERSION_MINOR 23 -#define LIBAVCODEC_VERSION_MICRO 1 +#define LIBAVCODEC_VERSION_MINOR 24 +#define LIBAVCODEC_VERSION_MICRO 0 #define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \ LIBAVCODEC_VERSION_MINOR, \ diff --git a/libavformat/riff.c b/libavformat/riff.c index 7b036dddf4..5e70ab8308 100644 --- a/libavformat/riff.c +++ b/libavformat/riff.c @@ -288,6 +288,7 @@ const AVCodecTag ff_codec_bmp_tags[] = { { CODEC_ID_MSA1, MKTAG('M', 'S', 'A', '1') }, { CODEC_ID_TSCC2, MKTAG('T', 'S', 'C', '2') }, { CODEC_ID_MTS2, MKTAG('M', 'T', 'S', '2') }, + { CODEC_ID_CLLC, MKTAG('C', 'L', 'L', 'C') }, { CODEC_ID_NONE, 0 } }; From a675d73d574a2b7693aba62285b355fa216c674b Mon Sep 17 00:00:00 2001 From: Derek Buitenhuis Date: Wed, 1 Aug 2012 19:22:50 +0000 Subject: [PATCH 2/9] eamad: Use dsputils instead of a custom bswap16_buf Signed-off-by: Derek Buitenhuis --- libavcodec/eamad.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c index ca7439397f..d7e65db927 100644 --- a/libavcodec/eamad.c +++ b/libavcodec/eamad.c @@ -57,13 +57,6 @@ typedef struct MadContext { int mb_y; } MadContext; -static void bswap16_buf(uint16_t *dst, const uint16_t *src, int count) -{ - int i; - for (i=0; i<count; i++) - dst[i] = av_bswap16(src[i]); -} - static av_cold int decode_init(AVCodecContext *avctx) { MadContext *s = avctx->priv_data; @@ -273,7 +266,7 @@ static int decode_frame(AVCodecContext *avctx, av_fast_malloc(&s->bitstream_buf, &s->bitstream_buf_size, (buf_end-buf) + FF_INPUT_BUFFER_PADDING_SIZE); if 
(!s->bitstream_buf) return AVERROR(ENOMEM); - bswap16_buf(s->bitstream_buf, (const uint16_t*)buf, (buf_end-buf)/2); + s->dsp.bswap16_buf(s->bitstream_buf, (const uint16_t*)buf, (buf_end-buf)/2); init_get_bits(&s->gb, s->bitstream_buf, 8*(buf_end-buf)); for (s->mb_y=0; s->mb_y < (avctx->height+15)/16; s->mb_y++) From ec7c501ed5ba14467d05b3def76c57b780bb7a12 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Wed, 1 Aug 2012 13:16:23 +0100 Subject: [PATCH 3/9] x86: remove libmpeg2 mmx(ext) idct functions These functions are not faster than other mmx implementations on any hardware I have been able to test on, and they are horribly inaccurate. There is thus no reason to ever use them. Signed-off-by: Mans Rullgard --- LICENSE | 2 - libavcodec/dct-test.c | 4 - libavcodec/options_table.h | 1 - libavcodec/x86/Makefile | 1 - libavcodec/x86/dsputil_mmx.c | 42 --- libavcodec/x86/idct_mmx.c | 632 ----------------------------------- 6 files changed, 682 deletions(-) delete mode 100644 libavcodec/x86/idct_mmx.c diff --git a/LICENSE b/LICENSE index 97923b1380..24e0e44354 100644 --- a/LICENSE +++ b/LICENSE @@ -13,8 +13,6 @@ configure to activate them. In this case, Libav's license changes to GPL v2+. 
Specifically, the GPL parts of Libav are -- optional x86 optimizations in the files - libavcodec/x86/idct_mmx.c - the X11 grabber in libavdevice/x11grab.c There are a handful of files under other licensing terms, namely: diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c index 9e19e0c6df..3aa752ba9e 100644 --- a/libavcodec/dct-test.c +++ b/libavcodec/dct-test.c @@ -109,10 +109,6 @@ static const struct algo idct_tab[] = { { "SIMPLE-C", ff_simple_idct_8, NO_PERM }, #if HAVE_MMX && HAVE_INLINE_ASM -#if CONFIG_GPL - { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 }, - { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 }, -#endif { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX }, { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 }, { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 }, diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h index 4f903cc9de..45acd4c320 100644 --- a/libavcodec/options_table.h +++ b/libavcodec/options_table.h @@ -193,7 +193,6 @@ static const AVOption options[]={ {"int", NULL, 0, AV_OPT_TYPE_CONST, {.dbl = FF_IDCT_INT }, INT_MIN, INT_MAX, V|E|D, "idct"}, {"simple", NULL, 0, AV_OPT_TYPE_CONST, {.dbl = FF_IDCT_SIMPLE }, INT_MIN, INT_MAX, V|E|D, "idct"}, {"simplemmx", NULL, 0, AV_OPT_TYPE_CONST, {.dbl = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"}, -{"libmpeg2mmx", NULL, 0, AV_OPT_TYPE_CONST, {.dbl = FF_IDCT_LIBMPEG2MMX }, INT_MIN, INT_MAX, V|E|D, "idct"}, {"mmi", NULL, 0, AV_OPT_TYPE_CONST, { .dbl = FF_IDCT_MMI }, INT_MIN, INT_MAX, V|E|D, "idct"}, {"arm", NULL, 0, AV_OPT_TYPE_CONST, {.dbl = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"}, {"altivec", NULL, 0, AV_OPT_TYPE_CONST, {.dbl = FF_IDCT_ALTIVEC }, INT_MIN, INT_MAX, V|E|D, "idct"}, diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 0ae70b2dd0..eb82ef572e 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -19,7 +19,6 @@ MMX-OBJS-$(CONFIG_DNXHD_ENCODER) 
+= x86/dnxhd_mmx.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o MMX-OBJS-$(CONFIG_FFT) += x86/fft.o -MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index afbb5312b8..827705c003 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2164,35 +2164,6 @@ void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, /* XXX: Those functions should be suppressed ASAP when all IDCTs are * converted. */ -#if CONFIG_GPL -static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, - DCTELEM *block) -{ - ff_mmx_idct(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, - DCTELEM *block) -{ - ff_mmx_idct(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, - DCTELEM *block) -{ - ff_mmxext_idct(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, - DCTELEM *block) -{ - ff_mmxext_idct(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} -#endif static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) { @@ -3049,19 +3020,6 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx) c->idct_add = ff_simple_idct_add_mmx; c->idct = ff_simple_idct_mmx; c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; -#if CONFIG_GPL - } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) { - if (mm_flags & AV_CPU_FLAG_MMX2) { - c->idct_put = ff_libmpeg2mmx2_idct_put; - c->idct_add = ff_libmpeg2mmx2_idct_add; - c->idct = ff_mmxext_idct; - } else { - c->idct_put = ff_libmpeg2mmx_idct_put; - c->idct_add = ff_libmpeg2mmx_idct_add; - c->idct = 
ff_mmx_idct; - } - c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; -#endif } else if (idct_algo == FF_IDCT_CAVS) { c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; } else if (idct_algo == FF_IDCT_XVIDMMX) { diff --git a/libavcodec/x86/idct_mmx.c b/libavcodec/x86/idct_mmx.c deleted file mode 100644 index 2408ab26ad..0000000000 --- a/libavcodec/x86/idct_mmx.c +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Copyright (C) 1999-2001 Aaron Holtzman - * - * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. - * See http://libmpeg2.sourceforge.net/ for updates. - * - * mpeg2dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * mpeg2dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with mpeg2dec; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/common.h" -#include "libavcodec/dsputil.h" - -#include "libavutil/x86_cpu.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -#define ROW_SHIFT 11 -#define COL_SHIFT 6 - -#define round(bias) ((int)(((bias)+0.5) * (1<> ROW_SHIFT; - row[1] = (a1 + b1) >> ROW_SHIFT; - row[2] = (a2 + b2) >> ROW_SHIFT; - row[3] = (a3 + b3) >> ROW_SHIFT; - row[4] = (a3 - b3) >> ROW_SHIFT; - row[5] = (a2 - b2) >> ROW_SHIFT; - row[6] = (a1 - b1) >> ROW_SHIFT; - row[7] = (a0 - b0) >> ROW_SHIFT; -} -#endif - - -/* MMXEXT row IDCT */ - -#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ - c4, c6, c4, c6, \ - c1, c3, -c1, -c5, \ - c5, c7, c3, -c7, \ - c4, -c6, c4, -c6, \ - -c4, c2, c4, -c2, \ - c5, -c1, c3, -c1, \ - c7, c3, c7, -c5 } - -static inline void mmxext_row_head (int16_t * const row, const int offset, - const int16_t * const table) -{ - __asm__ volatile( - "movq (%0), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ - - "movq 8(%0), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ - "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ - - "movq (%1), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */ - "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ - - "movq 8(%1), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */ - "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ - - "pshufw $0x4e, %%mm2, %%mm2 \n\t" /* mm2 = x2 x0 x6 x4 */ - :: "r" ((row+offset)), "r" (table) - ); -} - -static inline void mmxext_row (const int16_t * const table, - const int32_t * const rounder) -{ - __asm__ volatile ( - "movq 16(%0), %%mm1 \n\t" /* mm1 = -C5 -C1 C3 C1 */ - "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ - - "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ - "pshufw $0x4e, %%mm6, %%mm6 \n\t" /* mm6 = x3 x1 x7 x5 */ - - "movq 24(%0), %%mm7 \n\t" /* mm7 
= -C7 C3 C7 C5 */ - "pmaddwd %%mm5, %%mm1 \n\t" /* mm1= -C1*x5-C5*x7 C1*x1+C3*x3 */ - - "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */ - "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ - - "pmaddwd 40(%0), %%mm2 \n\t" /* mm2= C4*x0-C2*x2 -C4*x4+C2*x6 */ - "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */ - - "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ - "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */ - - "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ - "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */ - - "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */ - "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */ - - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */ - "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */ - - "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */ - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */ - - "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */ - "movq %%mm0, %%mm4 \n\t" /* mm4 = a3 a2 + rounder */ - - "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */ - "psubd %%mm5, %%mm4 \n\t" /* mm4 = a3-b3 a2-b2 + rounder */ - : : "r" (table), "r" (rounder)); -} - -static inline void mmxext_row_tail (int16_t * const row, const int store) -{ - __asm__ volatile ( - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ - - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */ - - "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ - - "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ - - "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */ - "pshufw $0xb1, %%mm4, %%mm4 \n\t" /* mm4 = y7 y6 y5 y4 */ - - /* slot */ - - "movq %%mm4, 8(%0) \n\t" /* save y7 y6 y5 y4 */ - :: "r" (row+store) - ); -} - -static inline void mmxext_row_mid (int16_t * const row, const int store, - const int offset, - const int16_t * const table) -{ - __asm__ volatile ( - "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ - "psrad $" 
AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ - - "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */ - - "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ - "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ - - "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ - "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ - - "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */ - "pshufw $0xb1, %%mm4, %%mm4\n\t" /* mm4 = y7 y6 y5 y4 */ - - "movq (%3), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */ - "movq %%mm4, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */ - - "pmaddwd %%mm0, %%mm3 \n\t" /* mm3= -C4*x4-C2*x6 C4*x0+C2*x2 */ - - "movq 8(%3), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */ - "pshufw $0x4e, %%mm2, %%mm2\n\t" /* mm2 = x2 x0 x6 x4 */ - :: "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table) - ); -} - - -/* MMX row IDCT */ - -#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ - c4, c6, -c4, -c2, \ - c1, c3, c3, -c7, \ - c5, c7, -c1, -c5, \ - c4, -c6, c4, -c2, \ - -c4, c2, c4, -c6, \ - c5, -c1, c7, -c5, \ - c7, c3, c3, -c1 } - -static inline void mmx_row_head (int16_t * const row, const int offset, - const int16_t * const table) -{ - __asm__ volatile ( - "movq (%0), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ - - "movq 8(%0), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ - "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ - - "movq (%1), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */ - "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ - - "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */ - - "movq 8(%1), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */ - "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ - - "movq 16(%1), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */ - "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */ - :: "r" ((row+offset)), "r" (table) - ); -} - -static inline void mmx_row (const int16_t * const table, - const int32_t * const rounder) -{ - __asm__ volatile ( - "pmaddwd %%mm2, %%mm4 \n\t" /* 
mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ - "punpckldq %%mm5, %%mm5 \n\t" /* mm5 = x3 x1 x3 x1 */ - - "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ - "punpckhdq %%mm6, %%mm6 \n\t" /* mm6 = x7 x5 x7 x5 */ - - "movq 24(%0), %%mm7 \n\t" /* mm7 = -C5 -C1 C7 C5 */ - "pmaddwd %%mm5, %%mm1 \n\t" /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ - - "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */ - "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ - - "pmaddwd 40(%0), %%mm2 \n\t" /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ - "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */ - - "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ - "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */ - - "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ - "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */ - - "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */ - "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */ - - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */ - "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */ - - "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */ - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */ - - "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */ - "movq %%mm0, %%mm7 \n\t" /* mm7 = a3 a2 + rounder */ - - "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */ - "psubd %%mm5, %%mm7 \n\t" /* mm7 = a3-b3 a2-b2 + rounder */ - :: "r" (table), "r" (rounder) - ); -} - -static inline void mmx_row_tail (int16_t * const row, const int store) -{ - __asm__ volatile ( - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ - - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */ - - "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ - - "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */ - - "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */ - "movq %%mm7, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ - - "pslld $16, %%mm7 \n\t" /* mm7 = y7 0 y5 0 */ - - "psrld $16, %%mm4 \n\t" /* mm4 
= 0 y6 0 y4 */ - - "por %%mm4, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */ - - /* slot */ - - "movq %%mm7, 8(%0) \n\t" /* save y7 y6 y5 y4 */ - :: "r" (row+store) - ); -} - -static inline void mmx_row_mid (int16_t * const row, const int store, - const int offset, const int16_t * const table) -{ - - __asm__ volatile ( - "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ - - "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ - "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */ - - "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ - "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ - - "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */ - "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ - - "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */ - "movq %%mm7, %%mm1 \n\t" /* mm1 = y6 y7 y4 y5 */ - - "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */ - "psrld $16, %%mm7 \n\t" /* mm7 = 0 y6 0 y4 */ - - "movq (%3), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */ - "pslld $16, %%mm1 \n\t" /* mm1 = y7 0 y5 0 */ - - "movq 8(%3), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */ - "por %%mm1, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */ - - "movq 16(%3), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */ - "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */ - - "movq %%mm7, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */ - "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ - : : "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table) - ); -} - - -#if 0 -/* C column IDCT - it is just here to document the MMXEXT and MMX versions */ -static inline void idct_col (int16_t * col, int offset) -{ -/* multiplication - as implemented on mmx */ -#define F(c,x) (((c) * (x)) >> 16) - -/* saturation - it helps us handle torture test cases */ -#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? 
-32768 : (x)) - - int16_t x0, x1, x2, x3, x4, x5, x6, x7; - int16_t y0, y1, y2, y3, y4, y5, y6, y7; - int16_t a0, a1, a2, a3, b0, b1, b2, b3; - int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; - - col += offset; - - x0 = col[0*8]; - x1 = col[1*8]; - x2 = col[2*8]; - x3 = col[3*8]; - x4 = col[4*8]; - x5 = col[5*8]; - x6 = col[6*8]; - x7 = col[7*8]; - - u04 = S (x0 + x4); - v04 = S (x0 - x4); - u26 = S (F (T2, x6) + x2); - v26 = S (F (T2, x2) - x6); - - a0 = S (u04 + u26); - a1 = S (v04 + v26); - a2 = S (v04 - v26); - a3 = S (u04 - u26); - - u17 = S (F (T1, x7) + x1); - v17 = S (F (T1, x1) - x7); - u35 = S (F (T3, x5) + x3); - v35 = S (F (T3, x3) - x5); - - b0 = S (u17 + u35); - b3 = S (v17 - v35); - u12 = S (u17 - u35); - v12 = S (v17 + v35); - u12 = S (2 * F (C4, u12)); - v12 = S (2 * F (C4, v12)); - b1 = S (u12 + v12); - b2 = S (u12 - v12); - - y0 = S (a0 + b0) >> COL_SHIFT; - y1 = S (a1 + b1) >> COL_SHIFT; - y2 = S (a2 + b2) >> COL_SHIFT; - y3 = S (a3 + b3) >> COL_SHIFT; - - y4 = S (a3 - b3) >> COL_SHIFT; - y5 = S (a2 - b2) >> COL_SHIFT; - y6 = S (a1 - b1) >> COL_SHIFT; - y7 = S (a0 - b0) >> COL_SHIFT; - - col[0*8] = y0; - col[1*8] = y1; - col[2*8] = y2; - col[3*8] = y3; - col[4*8] = y4; - col[5*8] = y5; - col[6*8] = y6; - col[7*8] = y7; -} -#endif - - -/* MMX column IDCT */ -static inline void idct_col (int16_t * const col, const int offset) -{ -#define T1 13036 -#define T2 27146 -#define T3 43790 -#define C4 23170 - - DECLARE_ALIGNED(8, static const short, t1_vector)[] = { - T1,T1,T1,T1, - T2,T2,T2,T2, - T3,T3,T3,T3, - C4,C4,C4,C4 - }; - - /* column code adapted from Peter Gubanov */ - /* http://www.elecard.com/peter/idct.shtml */ - - __asm__ volatile ( - "movq (%0), %%mm0 \n\t" /* mm0 = T1 */ - - "movq 2*8(%1), %%mm1 \n\t" /* mm1 = x1 */ - "movq %%mm0, %%mm2 \n\t" /* mm2 = T1 */ - - "movq 7*2*8(%1), %%mm4 \n\t" /* mm4 = x7 */ - "pmulhw %%mm1, %%mm0 \n\t" /* mm0 = T1*x1 */ - - "movq 16(%0), %%mm5 \n\t" /* mm5 = T3 */ - "pmulhw %%mm4, %%mm2 \n\t" /* mm2 
= T1*x7 */ - - "movq 2*5*8(%1), %%mm6 \n\t" /* mm6 = x5 */ - "movq %%mm5, %%mm7 \n\t" /* mm7 = T3-1 */ - - "movq 3*8*2(%1), %%mm3 \n\t" /* mm3 = x3 */ - "psubsw %%mm4, %%mm0 \n\t" /* mm0 = v17 */ - - "movq 8(%0), %%mm4 \n\t" /* mm4 = T2 */ - "pmulhw %%mm3, %%mm5 \n\t" /* mm5 = (T3-1)*x3 */ - - "paddsw %%mm2, %%mm1 \n\t" /* mm1 = u17 */ - "pmulhw %%mm6, %%mm7 \n\t" /* mm7 = (T3-1)*x5 */ - - /* slot */ - - "movq %%mm4, %%mm2 \n\t" /* mm2 = T2 */ - "paddsw %%mm3, %%mm5 \n\t" /* mm5 = T3*x3 */ - - "pmulhw 2*8*2(%1), %%mm4 \n\t" /* mm4 = T2*x2 */ - "paddsw %%mm6, %%mm7 \n\t" /* mm7 = T3*x5 */ - - "psubsw %%mm6, %%mm5 \n\t" /* mm5 = v35 */ - "paddsw %%mm3, %%mm7 \n\t" /* mm7 = u35 */ - - "movq 6*8*2(%1), %%mm3 \n\t" /* mm3 = x6 */ - "movq %%mm0, %%mm6 \n\t" /* mm6 = v17 */ - - "pmulhw %%mm3, %%mm2 \n\t" /* mm2 = T2*x6 */ - "psubsw %%mm5, %%mm0 \n\t" /* mm0 = b3 */ - - "psubsw %%mm3, %%mm4 \n\t" /* mm4 = v26 */ - "paddsw %%mm6, %%mm5 \n\t" /* mm5 = v12 */ - - "movq %%mm0, 3*8*2(%1)\n\t" /* save b3 in scratch0 */ - "movq %%mm1, %%mm6 \n\t" /* mm6 = u17 */ - - "paddsw 2*8*2(%1), %%mm2 \n\t" /* mm2 = u26 */ - "paddsw %%mm7, %%mm6 \n\t" /* mm6 = b0 */ - - "psubsw %%mm7, %%mm1 \n\t" /* mm1 = u12 */ - "movq %%mm1, %%mm7 \n\t" /* mm7 = u12 */ - - "movq 0*8(%1), %%mm3 \n\t" /* mm3 = x0 */ - "paddsw %%mm5, %%mm1 \n\t" /* mm1 = u12+v12 */ - - "movq 24(%0), %%mm0 \n\t" /* mm0 = C4/2 */ - "psubsw %%mm5, %%mm7 \n\t" /* mm7 = u12-v12 */ - - "movq %%mm6, 5*8*2(%1)\n\t" /* save b0 in scratch1 */ - "pmulhw %%mm0, %%mm1 \n\t" /* mm1 = b1/2 */ - - "movq %%mm4, %%mm6 \n\t" /* mm6 = v26 */ - "pmulhw %%mm0, %%mm7 \n\t" /* mm7 = b2/2 */ - - "movq 4*8*2(%1), %%mm5 \n\t" /* mm5 = x4 */ - "movq %%mm3, %%mm0 \n\t" /* mm0 = x0 */ - - "psubsw %%mm5, %%mm3 \n\t" /* mm3 = v04 */ - "paddsw %%mm5, %%mm0 \n\t" /* mm0 = u04 */ - - "paddsw %%mm3, %%mm4 \n\t" /* mm4 = a1 */ - "movq %%mm0, %%mm5 \n\t" /* mm5 = u04 */ - - "psubsw %%mm6, %%mm3 \n\t" /* mm3 = a2 */ - "paddsw %%mm2, %%mm5 \n\t" /* mm5 = a0 */ - - 
"paddsw %%mm1, %%mm1 \n\t" /* mm1 = b1 */ - "psubsw %%mm2, %%mm0 \n\t" /* mm0 = a3 */ - - "paddsw %%mm7, %%mm7 \n\t" /* mm7 = b2 */ - "movq %%mm3, %%mm2 \n\t" /* mm2 = a2 */ - - "movq %%mm4, %%mm6 \n\t" /* mm6 = a1 */ - "paddsw %%mm7, %%mm3 \n\t" /* mm3 = a2+b2 */ - - "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y2 */ - "paddsw %%mm1, %%mm4\n\t" /* mm4 = a1+b1 */ - - "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y1 */ - "psubsw %%mm1, %%mm6 \n\t" /* mm6 = a1-b1 */ - - "movq 5*8*2(%1), %%mm1 \n\t" /* mm1 = b0 */ - "psubsw %%mm7, %%mm2 \n\t" /* mm2 = a2-b2 */ - - "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm6\n\t" /* mm6 = y6 */ - "movq %%mm5, %%mm7 \n\t" /* mm7 = a0 */ - - "movq %%mm4, 1*8*2(%1)\n\t" /* save y1 */ - "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm2\n\t" /* mm2 = y5 */ - - "movq %%mm3, 2*8*2(%1)\n\t" /* save y2 */ - "paddsw %%mm1, %%mm5 \n\t" /* mm5 = a0+b0 */ - - "movq 3*8*2(%1), %%mm4 \n\t" /* mm4 = b3 */ - "psubsw %%mm1, %%mm7 \n\t" /* mm7 = a0-b0 */ - - "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm5\n\t" /* mm5 = y0 */ - "movq %%mm0, %%mm3 \n\t" /* mm3 = a3 */ - - "movq %%mm2, 5*8*2(%1)\n\t" /* save y5 */ - "psubsw %%mm4, %%mm3 \n\t" /* mm3 = a3-b3 */ - - "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm7\n\t" /* mm7 = y7 */ - "paddsw %%mm0, %%mm4 \n\t" /* mm4 = a3+b3 */ - - "movq %%mm5, 0*8*2(%1)\n\t" /* save y0 */ - "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y4 */ - - "movq %%mm6, 6*8*2(%1)\n\t" /* save y6 */ - "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y3 */ - - "movq %%mm7, 7*8*2(%1)\n\t" /* save y7 */ - - "movq %%mm3, 4*8*2(%1)\n\t" /* save y4 */ - - "movq %%mm4, 3*8*2(%1)\n\t" /* save y3 */ - :: "r" (t1_vector), "r" (col+offset) - ); - -#undef T1 -#undef T2 -#undef T3 -#undef C4 -} - - -DECLARE_ALIGNED(8, static const int32_t, rounder0)[] = - rounder ((1 << (COL_SHIFT - 1)) - 0.5); -DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0); -DECLARE_ALIGNED(8, static const int32_t, rounder1)[] = - 
rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ -DECLARE_ALIGNED(8, static const int32_t, rounder7)[] = - rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ -DECLARE_ALIGNED(8, static const int32_t, rounder2)[] = - rounder (0.60355339059); /* C2 * (C6+C2)/2 */ -DECLARE_ALIGNED(8, static const int32_t, rounder6)[] = - rounder (-0.25); /* C2 * (C6-C2)/2 */ -DECLARE_ALIGNED(8, static const int32_t, rounder3)[] = - rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ -DECLARE_ALIGNED(8, static const int32_t, rounder5)[] = - rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ - -#undef COL_SHIFT -#undef ROW_SHIFT - -#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ -void idct (int16_t * const block) \ -{ \ - DECLARE_ALIGNED(16, static const int16_t, table04)[] = \ - table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ - DECLARE_ALIGNED(16, static const int16_t, table17)[] = \ - table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ - DECLARE_ALIGNED(16, static const int16_t, table26)[] = \ - table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ - DECLARE_ALIGNED(16, static const int16_t, table35)[] = \ - table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ - \ - idct_row_head (block, 0*8, table04); \ - idct_row (table04, rounder0); \ - idct_row_mid (block, 0*8, 4*8, table04); \ - idct_row (table04, rounder4); \ - idct_row_mid (block, 4*8, 1*8, table17); \ - idct_row (table17, rounder1); \ - idct_row_mid (block, 1*8, 7*8, table17); \ - idct_row (table17, rounder7); \ - idct_row_mid (block, 7*8, 2*8, table26); \ - idct_row (table26, rounder2); \ - idct_row_mid (block, 2*8, 6*8, table26); \ - idct_row (table26, rounder6); \ - idct_row_mid (block, 6*8, 3*8, table35); \ - idct_row (table35, rounder3); \ - idct_row_mid (block, 3*8, 5*8, table35); \ - idct_row (table35, rounder5); \ - idct_row_tail (block, 5*8); \ - \ - idct_col (block, 0); \ - idct_col (block, 4); \ -} - -declare_idct (ff_mmxext_idct, mmxext_table, - 
mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) - -declare_idct (ff_mmx_idct, mmx_table, - mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) - -#endif /* HAVE_INLINE_ASM */ From cf5781fad0e67c6e49abc9b84390c0ca9485873e Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Wed, 1 Aug 2012 14:01:08 +0100 Subject: [PATCH 4/9] vp8: pack struct VP8ThreadData more efficiently Reordering the members in this struct reduces the holes required to maintain alignment. With this order, the only remaining, and unavoidable, hole is 3 bytes following left_nnz. Signed-off-by: Mans Rullgard --- libavcodec/vp8.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h index 6b3caa2d51..a337173520 100644 --- a/libavcodec/vp8.h +++ b/libavcodec/vp8.h @@ -94,21 +94,8 @@ typedef struct { } VP8Macroblock; typedef struct { -#if HAVE_THREADS - pthread_mutex_t lock; - pthread_cond_t cond; -#endif - int thread_nr; - int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF) - int wait_mb_pos; // What the current thread is waiting on. - uint8_t *edge_emu_buffer; - /** - * For coeff decode, we need to know whether the above block had non-zero - * coefficients. This means for each macroblock, we need data for 4 luma - * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9 - * per macroblock. We keep the last row in top_nnz. - */ - DECLARE_ALIGNED(8, uint8_t, left_nnz)[9]; + DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16]; + DECLARE_ALIGNED(16, DCTELEM, block_dc)[16]; /** * This is the index plus one of the last non-zero coeff * for each of the blocks in the current macroblock. @@ -117,8 +104,21 @@ typedef struct { * 2+-> full transform */ DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4]; - DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16]; - DECLARE_ALIGNED(16, DCTELEM, block_dc)[16]; + /** + * For coeff decode, we need to know whether the above block had non-zero + * coefficients. 
This means for each macroblock, we need data for 4 luma + * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9 + * per macroblock. We keep the last row in top_nnz. + */ + DECLARE_ALIGNED(8, uint8_t, left_nnz)[9]; + int thread_nr; +#if HAVE_THREADS + pthread_mutex_t lock; + pthread_cond_t cond; +#endif + int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF) + int wait_mb_pos; // What the current thread is waiting on. + uint8_t *edge_emu_buffer; VP8FilterStrength *filter_strength; } VP8ThreadData; From af500c08bbb054902f9326006011777bb3a98fb4 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Wed, 1 Aug 2012 14:32:19 +0100 Subject: [PATCH 5/9] dct-test: always link with aandcttab.o This allows building dct-test even if aandcttab.o is not pulled in by any enabled codec. The DCT with which these tables are used does not use them directly, so building it without the tables is possible. Signed-off-by: Mans Rullgard --- libavcodec/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 17bc364e6b..4a4364dd1b 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -748,7 +748,7 @@ HOSTPROGS = aac_tablegen \ CLEANFILES = *_tables.c *_tables.h *_tablegen$(HOSTEXESUF) -$(SUBDIR)dct-test$(EXESUF): $(SUBDIR)dctref.o +$(SUBDIR)dct-test$(EXESUF): $(SUBDIR)dctref.o $(SUBDIR)aandcttab.o TRIG_TABLES = cos cos_fixed sin TRIG_TABLES := $(TRIG_TABLES:%=$(SUBDIR)%_tables.c) From cfb1091898684e1b433ef138c83389ba4390c9b7 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Wed, 1 Aug 2012 17:15:42 +0100 Subject: [PATCH 6/9] vc1dec: remove useless #include simple_idct.h Signed-off-by: Mans Rullgard --- libavcodec/vc1dec.c | 1 - 1 file changed, 1 deletion(-) diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c index c6cbfc1270..e36cc0dc54 100644 --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -36,7 +36,6 @@ #include "vc1acdata.h" #include "msmpeg4data.h" #include "unary.h" -#include 
"simple_idct.h" #include "mathops.h" #include "vdpau_internal.h" From 50468f93e3940ba78836dfdac5165c20ae75327a Mon Sep 17 00:00:00 2001 From: Jordi Ortiz Date: Wed, 1 Aug 2012 11:25:19 +0200 Subject: [PATCH 7/9] rtmp: add functions for reading AMF values Signed-off-by: Luca Barbato --- libavformat/rtmppkt.c | 45 +++++++++++++++++++++++++++++++++++++++++++ libavformat/rtmppkt.h | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/libavformat/rtmppkt.c b/libavformat/rtmppkt.c index 4ce238d5d0..f69ce82c66 100644 --- a/libavformat/rtmppkt.c +++ b/libavformat/rtmppkt.c @@ -71,6 +71,51 @@ void ff_amf_write_object_end(uint8_t **dst) bytestream_put_be24(dst, AMF_DATA_TYPE_OBJECT_END); } +int ff_amf_read_bool(GetByteContext *bc, int *val) +{ + if (bytestream2_get_byte(bc) != AMF_DATA_TYPE_BOOL) + return AVERROR_INVALIDDATA; + *val = bytestream2_get_byte(bc); + return 0; +} + +int ff_amf_read_number(GetByteContext *bc, double *val) +{ + uint64_t read; + if (bytestream2_get_byte(bc) != AMF_DATA_TYPE_NUMBER) + return AVERROR_INVALIDDATA; + read = bytestream2_get_be64(bc); + *val = av_int2double(read); + return 0; +} + +int ff_amf_read_string(GetByteContext *bc, uint8_t *str, + int strsize, int *length) +{ + int stringlen = 0; + int readsize; + if (bytestream2_get_byte(bc) != AMF_DATA_TYPE_STRING) + return AVERROR_INVALIDDATA; + stringlen = bytestream2_get_be16(bc); + if (stringlen + 1 > strsize) + return AVERROR(EINVAL); + readsize = bytestream2_get_buffer(bc, str, stringlen); + if (readsize != stringlen) { + av_log(NULL, AV_LOG_WARNING, + "Unable to read as many bytes as AMF string signaled\n"); + } + str[readsize] = '\0'; + *length = FFMIN(stringlen, readsize); + return 0; +} + +int ff_amf_read_null(GetByteContext *bc) +{ + if (bytestream2_get_byte(bc) != AMF_DATA_TYPE_NULL) + return AVERROR_INVALIDDATA; + return 0; +} + int ff_rtmp_packet_read(URLContext *h, RTMPPacket *p, int chunk_size, RTMPPacket *prev_pkt) { diff --git 
a/libavformat/rtmppkt.h b/libavformat/rtmppkt.h index a83d0feb8f..cd5be5ad83 100644 --- a/libavformat/rtmppkt.h +++ b/libavformat/rtmppkt.h @@ -231,6 +231,48 @@ void ff_amf_write_field_name(uint8_t **dst, const char *str); */ void ff_amf_write_object_end(uint8_t **dst); +/** + * Read AMF boolean value. + * + *@param[in,out] gbc GetByteContext initialized with AMF-formatted data + *@param[out] val 0 or 1 + *@return 0 on success or an AVERROR code on failure +*/ +int ff_amf_read_bool(GetByteContext *gbc, int *val); + +/** + * Read AMF number value. + * + *@param[in,out] gbc GetByteContext initialized with AMF-formatted data + *@param[out] val read value + *@return 0 on success or an AVERROR code on failure +*/ +int ff_amf_read_number(GetByteContext *gbc, double *val); + +/** + * Read AMF string value. + * + * Appends a trailing \0 to output string in order to + * ease later parsing. + * + *@param[in,out] gbc GetByteContext initialized with AMF-formatted data + *@param[out] str read string + *@param[in] strsize buffer size available to store the read string + *@param[out] length read string length + *@return 0 on success or an AVERROR code on failure +*/ +int ff_amf_read_string(GetByteContext *gbc, uint8_t *str, + int strsize, int *length); + +/** + * Read AMF NULL value. + * + *@param[in,out] gbc GetByteContext initialized with AMF-formatted data + *@return 0 on success or an AVERROR code on failure +*/ +int ff_amf_read_null(GetByteContext *gbc); + + /** @} */ // AMF funcs #endif /* AVFORMAT_RTMPPKT_H */ From 150adea6da24e5342584b403dc0f901aa79f53c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Thu, 2 Aug 2012 16:24:01 +0000 Subject: [PATCH 8/9] rtmppkt: Add missing libavcodec/bytestream.h include. 
Signed-off-by: Derek Buitenhuis --- libavformat/rtmppkt.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libavformat/rtmppkt.h b/libavformat/rtmppkt.h index cd5be5ad83..8932cac8a6 100644 --- a/libavformat/rtmppkt.h +++ b/libavformat/rtmppkt.h @@ -22,6 +22,7 @@ #ifndef AVFORMAT_RTMPPKT_H #define AVFORMAT_RTMPPKT_H +#include "libavcodec/bytestream.h" #include "avformat.h" #include "url.h" From c728518b3cbb0daf0d0a65ba7adfcb48c5629b93 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 2 Aug 2012 12:15:46 -0500 Subject: [PATCH 9/9] x86: fft: fix imdct_half() for AVX Some calculations were changed in b6a3849 to use mmsize, which was not correct for the AVX version, which uses INIT_YMM and therefore has mmsize == 32. Fixes Bug 341. Signed-off-by: Justin Ruggles --- libavcodec/x86/fft_mmx.asm | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 81e4411dcb..ac53296f70 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -1009,7 +1009,11 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i push rrevtab %endif - sub r3, mmsize/4 +%if mmsize == 8 + sub r3, 2 +%else + sub r3, 4 +%endif %if ARCH_X86_64 || mmsize == 8 xor r4, r4 sub r4, r3 @@ -1036,7 +1040,9 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i mova [r1+r5*8], m0 mova [r1+r6*8], m2 add r4, 2 -%elif ARCH_X86_64 + sub r3, 2 +%else +%if ARCH_X86_64 movzx r5, word [rrevtab+r4-4] movzx r6, word [rrevtab+r4-2] movzx r10, word [rrevtab+r3] @@ -1057,7 +1063,8 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i movlps [r1+r5*8], xmm1 movhps [r1+r4*8], xmm1 %endif - sub r3, mmsize/4 + sub r3, 4 +%endif jns .pre mov r5, r0