v210enc: Add SIMD optimised 8-bit and 10-bit encoders
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
1d3a3b9f89
commit
36091742d1
@ -24,83 +24,191 @@
|
|||||||
#include "avcodec.h"
|
#include "avcodec.h"
|
||||||
#include "bytestream.h"
|
#include "bytestream.h"
|
||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
|
#include "v210enc.h"
|
||||||
|
|
||||||
|
/* Clamp a 10-bit sample to the range 4..1019. */
#define CLIP(v) av_clip((v), 4, 1019)
/* Clamp an 8-bit sample to the range 1..254. */
#define CLIP8(v) av_clip((v), 1, 254)

/*
 * Pack three 10-bit samples into one 32-bit word and store it at dst.
 *
 * a, b and c are sample pointers; each is dereferenced once and
 * post-incremented. The clamped samples land in bits 0-9, 10-19 and 20-29.
 *
 * The macro relies on two names from the expanding scope: a `uint32_t val`
 * scratch variable and a byte pointer `dst`, which is advanced by 4.
 */
#define WRITE_PIXELS(a, b, c)               \
    do {                                    \
        val  =  CLIP(*(a)++);               \
        val |= (CLIP(*(b)++) << 10) |       \
               (CLIP(*(c)++) << 20);        \
        AV_WL32(dst, val);                  \
        dst += 4;                           \
    } while (0)

/*
 * 8-bit counterpart of WRITE_PIXELS: every clamped sample is shifted left by
 * two extra bits, so the three samples land at bit offsets 2, 12 and 22 of
 * the stored word.
 *
 * Same scope requirements as WRITE_PIXELS (`val` and `dst`).
 */
#define WRITE_PIXELS8(a, b, c)              \
    do {                                    \
        val  = (CLIP8(*(a)++) << 2);        \
        val |= (CLIP8(*(b)++) << 12) |      \
               (CLIP8(*(c)++) << 22);       \
        AV_WL32(dst, val);                  \
        dst += 4;                           \
    } while (0)
|
||||||
|
|
||||||
|
/*
 * C reference packer for planar 8-bit 4:2:2 input.
 *
 * Processes the line in groups of 12 luma samples (plus 6 samples from each
 * chroma plane), emitting eight 32-bit words (32 bytes) per group through
 * WRITE_PIXELS8. Any remainder of fewer than 12 luma samples is left
 * untouched; nothing is written when width < 12.
 *
 * y, u, v: source planes for the current line
 * dst:     output buffer, advanced internally by WRITE_PIXELS8
 * width:   number of luma samples to pack
 */
static void v210_planar_pack_8_c(const uint8_t *y, const uint8_t *u,
                                 const uint8_t *v, uint8_t *dst, ptrdiff_t width)
{
    uint32_t val;
    /* Same type as width so the counter cannot narrow or overflow first. */
    ptrdiff_t i;

    /* unroll this to match the assembly */
    for (i = 0; i < width - 11; i += 12) {
        WRITE_PIXELS8(u, y, v);
        WRITE_PIXELS8(y, u, y);
        WRITE_PIXELS8(v, y, u);
        WRITE_PIXELS8(y, v, y);
        WRITE_PIXELS8(u, y, v);
        WRITE_PIXELS8(y, u, y);
        WRITE_PIXELS8(v, y, u);
        WRITE_PIXELS8(y, v, y);
    }
}
|
||||||
|
|
||||||
|
/*
 * C reference packer for planar 10-bit 4:2:2 input.
 *
 * Processes the line in groups of 6 luma samples (plus 3 samples from each
 * chroma plane), emitting four 32-bit words (16 bytes) per group through
 * WRITE_PIXELS. Any remainder of fewer than 6 luma samples is left
 * untouched; nothing is written when width < 6.
 *
 * y, u, v: source planes for the current line
 * dst:     output buffer, advanced internally by WRITE_PIXELS
 * width:   number of luma samples to pack
 */
static void v210_planar_pack_10_c(const uint16_t *y, const uint16_t *u,
                                  const uint16_t *v, uint8_t *dst, ptrdiff_t width)
{
    uint32_t val;
    /* Same type as width so the counter cannot narrow or overflow first. */
    ptrdiff_t i;

    for (i = 0; i < width - 5; i += 6) {
        WRITE_PIXELS(u, y, v);
        WRITE_PIXELS(y, u, y);
        WRITE_PIXELS(v, y, u);
        WRITE_PIXELS(y, v, y);
    }
}
|
||||||
|
|
||||||
/*
 * Encoder init callback (this span shows the old and new revisions of the
 * function interleaved, row by row).
 *
 * Rejects odd widths with AVERROR(EINVAL), allocates avctx->coded_frame
 * (AVERROR(ENOMEM) on failure) and marks it as an I picture. The new revision
 * additionally installs the C line packers in the private context and, when
 * ARCH_X86 is non-zero, calls ff_v210enc_init_x86() on that context.
 * Returns 0 on success.
 */
static av_cold int encode_init(AVCodecContext *avctx)
|
static av_cold int encode_init(AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
|
V210EncContext *s = avctx->priv_data; /* private context that receives the packer function pointers below */
|
||||||
|
|
||||||
if (avctx->width & 1) {
|
if (avctx->width & 1) { /* odd widths are rejected */
|
||||||
av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
|
av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
|
||||||
return AVERROR(EINVAL);
|
return AVERROR(EINVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (avctx->bits_per_raw_sample != 10)
|
|
||||||
av_log(avctx, AV_LOG_WARNING, "bits per raw sample: %d != 10-bit\n",
|
|
||||||
avctx->bits_per_raw_sample);
|
|
||||||
|
|
||||||
avctx->coded_frame = av_frame_alloc();
|
avctx->coded_frame = av_frame_alloc();
|
||||||
if (!avctx->coded_frame)
|
if (!avctx->coded_frame)
|
||||||
return AVERROR(ENOMEM);
|
return AVERROR(ENOMEM);
|
||||||
|
|
||||||
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
|
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
|
||||||
|
|
||||||
|
s->pack_line_8 = v210_planar_pack_8_c; /* default: C packer for 8-bit input */
|
||||||
|
s->pack_line_10 = v210_planar_pack_10_c; /* default: C packer for 10-bit input */
|
||||||
|
|
||||||
|
if (ARCH_X86) /* x86 init may replace the defaults set above */
|
||||||
|
ff_v210enc_init_x86(s);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
|
static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
|
||||||
const AVFrame *pic, int *got_packet)
|
const AVFrame *pic, int *got_packet)
|
||||||
{
|
{
|
||||||
|
V210EncContext *s = avctx->priv_data;
|
||||||
|
|
||||||
int aligned_width = ((avctx->width + 47) / 48) * 48;
|
int aligned_width = ((avctx->width + 47) / 48) * 48;
|
||||||
int stride = aligned_width * 8 / 3;
|
int stride = aligned_width * 8 / 3;
|
||||||
int line_padding = stride - ((avctx->width * 8 + 11) / 12) * 4;
|
int line_padding = stride - ((avctx->width * 8 + 11) / 12) * 4;
|
||||||
int h, w, ret;
|
int h, w, ret;
|
||||||
|
uint8_t *dst;
|
||||||
|
|
||||||
|
if ((ret = ff_alloc_packet(pkt, avctx->height * stride)) < 0) {
|
||||||
|
av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
dst = pkt->data;
|
||||||
|
|
||||||
|
if (pic->format == AV_PIX_FMT_YUV422P10) {
|
||||||
const uint16_t *y = (const uint16_t*)pic->data[0];
|
const uint16_t *y = (const uint16_t*)pic->data[0];
|
||||||
const uint16_t *u = (const uint16_t*)pic->data[1];
|
const uint16_t *u = (const uint16_t*)pic->data[1];
|
||||||
const uint16_t *v = (const uint16_t*)pic->data[2];
|
const uint16_t *v = (const uint16_t*)pic->data[2];
|
||||||
PutByteContext p;
|
|
||||||
|
|
||||||
if ((ret = ff_alloc_packet2(avctx, pkt, avctx->height * stride)) < 0)
|
|
||||||
return ret;
|
|
||||||
|
|
||||||
bytestream2_init_writer(&p, pkt->data, pkt->size);
|
|
||||||
|
|
||||||
#define CLIP(v) av_clip(v, 4, 1019)
|
|
||||||
|
|
||||||
#define WRITE_PIXELS(a, b, c) \
|
|
||||||
do { \
|
|
||||||
val = CLIP(*a++); \
|
|
||||||
val |= (CLIP(*b++) << 10) | \
|
|
||||||
(CLIP(*c++) << 20); \
|
|
||||||
bytestream2_put_le32u(&p, val); \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
for (h = 0; h < avctx->height; h++) {
|
for (h = 0; h < avctx->height; h++) {
|
||||||
uint32_t val;
|
uint32_t val;
|
||||||
for (w = 0; w < avctx->width - 5; w += 6) {
|
w = (avctx->width / 6) * 6;
|
||||||
WRITE_PIXELS(u, y, v);
|
s->pack_line_10(y, u, v, dst, w);
|
||||||
WRITE_PIXELS(y, u, y);
|
|
||||||
WRITE_PIXELS(v, y, u);
|
y += w;
|
||||||
WRITE_PIXELS(y, v, y);
|
u += w >> 1;
|
||||||
}
|
v += w >> 1;
|
||||||
|
dst += (w / 6) * 16;
|
||||||
if (w < avctx->width - 1) {
|
if (w < avctx->width - 1) {
|
||||||
WRITE_PIXELS(u, y, v);
|
WRITE_PIXELS(u, y, v);
|
||||||
|
|
||||||
val = CLIP(*y++);
|
val = CLIP(*y++);
|
||||||
if (w == avctx->width - 2)
|
if (w == avctx->width - 2) {
|
||||||
bytestream2_put_le32u(&p, val);
|
AV_WL32(dst, val);
|
||||||
|
dst += 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (w < avctx->width - 3) {
|
if (w < avctx->width - 3) {
|
||||||
val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20);
|
val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20);
|
||||||
bytestream2_put_le32u(&p, val);
|
AV_WL32(dst, val);
|
||||||
|
dst += 4;
|
||||||
|
|
||||||
val = CLIP(*v++) | (CLIP(*y++) << 10);
|
val = CLIP(*v++) | (CLIP(*y++) << 10);
|
||||||
bytestream2_put_le32u(&p, val);
|
AV_WL32(dst, val);
|
||||||
}
|
dst += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
bytestream2_set_buffer(&p, 0, line_padding);
|
memset(dst, 0, line_padding);
|
||||||
|
dst += line_padding;
|
||||||
|
|
||||||
y += pic->linesize[0] / 2 - avctx->width;
|
y += pic->linesize[0] / 2 - avctx->width;
|
||||||
u += pic->linesize[1] / 2 - avctx->width / 2;
|
u += pic->linesize[1] / 2 - avctx->width / 2;
|
||||||
v += pic->linesize[2] / 2 - avctx->width / 2;
|
v += pic->linesize[2] / 2 - avctx->width / 2;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
else if(pic->format == AV_PIX_FMT_YUV422P) {
|
||||||
|
const uint8_t *y = pic->data[0];
|
||||||
|
const uint8_t *u = pic->data[1];
|
||||||
|
const uint8_t *v = pic->data[2];
|
||||||
|
for (h = 0; h < avctx->height; h++) {
|
||||||
|
uint32_t val;
|
||||||
|
w = (avctx->width / 12) * 12;
|
||||||
|
s->pack_line_8(y, u, v, dst, w);
|
||||||
|
|
||||||
|
y += w;
|
||||||
|
u += w >> 1;
|
||||||
|
v += w >> 1;
|
||||||
|
dst += (w / 12) * 32;
|
||||||
|
|
||||||
|
for( ; w < avctx->width-5; w += 6 ){
|
||||||
|
WRITE_PIXELS8(u, y, v);
|
||||||
|
WRITE_PIXELS8(y, u, y);
|
||||||
|
WRITE_PIXELS8(v, y, u);
|
||||||
|
WRITE_PIXELS8(y, v, y);
|
||||||
|
}
|
||||||
|
if (w < avctx->width - 1) {
|
||||||
|
WRITE_PIXELS8(u, y, v);
|
||||||
|
|
||||||
|
val = CLIP8(*y++) << 2;
|
||||||
|
if (w == avctx->width - 2) {
|
||||||
|
AV_WL32(dst, val);
|
||||||
|
dst += 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (w < avctx->width - 3) {
|
||||||
|
val |= (CLIP8(*u++) << 12) | (CLIP8(*y++) << 22);
|
||||||
|
AV_WL32(dst, val);
|
||||||
|
dst += 4;
|
||||||
|
|
||||||
|
val = (CLIP8(*v++) << 2) | (CLIP8(*y++) << 12);
|
||||||
|
AV_WL32(dst, val);
|
||||||
|
dst += 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(dst, 0, line_padding);
|
||||||
|
dst += line_padding;
|
||||||
|
|
||||||
|
y += pic->linesize[0] - avctx->width;
|
||||||
|
u += pic->linesize[1] - avctx->width / 2;
|
||||||
|
v += pic->linesize[2] - avctx->width / 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pkt->flags |= AV_PKT_FLAG_KEY;
|
pkt->flags |= AV_PKT_FLAG_KEY;
|
||||||
*got_packet = 1;
|
*got_packet = 1;
|
||||||
@ -119,8 +227,9 @@ AVCodec ff_v210_encoder = {
|
|||||||
.long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
|
.long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
|
||||||
.type = AVMEDIA_TYPE_VIDEO,
|
.type = AVMEDIA_TYPE_VIDEO,
|
||||||
.id = AV_CODEC_ID_V210,
|
.id = AV_CODEC_ID_V210,
|
||||||
|
.priv_data_size = sizeof(V210EncContext),
|
||||||
.init = encode_init,
|
.init = encode_init,
|
||||||
.encode2 = encode_frame,
|
.encode2 = encode_frame,
|
||||||
.close = encode_close,
|
.close = encode_close,
|
||||||
.pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV422P10, AV_PIX_FMT_NONE },
|
.pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P, AV_PIX_FMT_NONE },
|
||||||
};
|
};
|
||||||
|
33
libavcodec/v210enc.h
Normal file
33
libavcodec/v210enc.h
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
/*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef AVCODEC_V210ENC_H
#define AVCODEC_V210ENC_H

#include <stddef.h>
#include <stdint.h>

#include "libavutil/log.h"
#include "libavutil/opt.h"
#include "libavutil/pixfmt.h"

/**
 * Private context of the v210 encoder: the line-packing routines selected at
 * init time.
 */
typedef struct V210EncContext {
    /**
     * Pack one line of planar 8-bit input into dst.
     *
     * @param y     luma samples of the line
     * @param u     first chroma plane samples of the line
     * @param v     second chroma plane samples of the line
     * @param dst   output buffer
     * @param width number of luma samples to pack
     */
    void (*pack_line_8)(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width);

    /**
     * Pack one line of planar 10-bit input (one sample per uint16_t) into dst.
     * Parameters have the same meaning as for pack_line_8.
     */
    void (*pack_line_10)(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width);
} V210EncContext;

/**
 * Install x86-optimised packers into s where the running CPU supports them.
 */
void ff_v210enc_init_x86(V210EncContext *s);

#endif /* AVCODEC_V210ENC_H */
|
@ -54,6 +54,7 @@ OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
|
|||||||
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
|
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
|
||||||
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
|
OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp_init.o
|
||||||
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
|
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
|
||||||
|
OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
|
||||||
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
|
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
|
||||||
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
|
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
|
||||||
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
|
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
|
||||||
@ -144,6 +145,7 @@ YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
|
|||||||
YASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
|
YASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
|
||||||
YASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
|
YASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
|
||||||
YASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
|
YASM-OBJS-$(CONFIG_TTA_DECODER) += x86/ttadsp.o
|
||||||
|
YASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o
|
||||||
YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
|
YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
|
||||||
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
|
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
|
||||||
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
|
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
|
||||||
|
145
libavcodec/x86/v210enc.asm
Normal file
145
libavcodec/x86/v210enc.asm
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
;******************************************************************************
|
||||||
|
;* V210 SIMD pack
|
||||||
|
;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
|
||||||
|
;*
|
||||||
|
;* This file is part of FFmpeg.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
;* modify it under the terms of the GNU Lesser General Public
|
||||||
|
;* License as published by the Free Software Foundation; either
|
||||||
|
;* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
;* Lesser General Public License for more details.
|
||||||
|
;*
|
||||||
|
;* You should have received a copy of the GNU Lesser General Public
|
||||||
|
;* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
;******************************************************************************
|
||||||
|
|
||||||
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
|
; Constant tables used by the v210 pack routines in this file.
; In the shuffle tables a -1 selector makes pshufb write a zero byte.
; All multipliers are powers of two, so pmullw acts as a per-word left shift.
SECTION_RODATA
|
||||||
|
|
||||||
|
v210_enc_min_10: times 8 dw 0x4 ; lower clip bound for 10-bit samples (4), one per word
|
||||||
|
v210_enc_max_10: times 8 dw 0x3fb ; upper clip bound for 10-bit samples (1019), one per word
|
||||||
|
|
||||||
|
v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0 ; luma word shifts of 2/0/4 bits; words 6-7 are multiplied by 0
|
||||||
|
v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11 ; byte placement of the scaled luma words in the output
|
||||||
|
|
||||||
|
v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0 ; chroma word shifts; words 3 and 7 are multiplied by 0
|
||||||
|
v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1 ; interleaves words from the low and high register halves
|
||||||
|
|
||||||
|
v210_enc_min_8: times 16 db 0x1 ; lower clip bound for 8-bit samples (1), one per byte
|
||||||
|
v210_enc_max_8: times 16 db 0xfe ; upper clip bound for 8-bit samples (254), one per byte
|
||||||
|
|
||||||
|
v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 ; zero-extends source bytes 6..11 into words 0..5; words 6-7 become 0
|
||||||
|
v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0 ; four times v210_enc_luma_mult_10 (two extra bits of left shift)
|
||||||
|
|
||||||
|
v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1 ; zero-extends bytes 0..3 and 8..11 into words
|
||||||
|
v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1 ; zero-extends bytes 3,4,5,7 and 11,12,13,15 into words
|
||||||
|
|
||||||
|
v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0 ; four times v210_enc_chroma_mult_10; words 3 and 7 are multiplied by 0
|
||||||
|
|
||||||
|
SECTION .text
|
||||||
|
|
||||||
|
%macro v210_planar_pack_10 0

; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
;
; Each iteration consumes 6 luma words and 3 words from each chroma plane and
; stores one full register (mmsize bytes) at dst.
;
; The loop is bottom-tested, so the body runs once before width is checked:
; callers must pass a positive multiple of 6.
; NOTE(review): the luma load reads a whole register and the chroma loads read
; 8 bytes each, which is more than one iteration consumes; the last iteration
; therefore reads a few bytes past the samples it packs - confirm the input
; buffers are padded.
cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
    ; point y/u/v at the end of the data and count a negative index up to zero
    lea     yq, [yq+2*widthq]
    add     uq, widthq
    add     vq, widthq
    neg     widthq

    mova    m2, [v210_enc_min_10]
    mova    m3, [v210_enc_max_10]

.loop:
    movu    m0, [yq+2*widthq]
    CLIPW   m0, m2, m3

    ; u samples in the low half, v samples in the high half
    movq    m1, [uq+widthq]
    movhps  m1, [vq+widthq]
    CLIPW   m1, m2, m3

    ; multiply = per-word left shift, then move the bytes to their output slots
    pmullw  m0, [v210_enc_luma_mult_10]
    pshufb  m0, [v210_enc_luma_shuf_10]

    pmullw  m1, [v210_enc_chroma_mult_10]
    pshufb  m1, [v210_enc_chroma_shuf_10]

    por     m0, m1

    movu    [dstq], m0

    add     dstq, mmsize
    add     widthq, 6
    jl .loop

    RET
%endmacro

INIT_XMM ssse3
v210_planar_pack_10
|
||||||
|
|
||||||
|
%macro v210_planar_pack_8 0

; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width)
;
; Each iteration consumes 12 luma bytes and 6 bytes from each chroma plane and
; stores two full registers (2*mmsize bytes) at dst.
;
; The loop is bottom-tested, so the body runs once before width is checked:
; callers must pass a positive multiple of 12.
; NOTE(review): the luma load reads a whole register and the chroma loads read
; 8 bytes each, which is more than one iteration consumes; the last iteration
; therefore reads a few bytes past the samples it packs - confirm the input
; buffers are padded.
; NOTE(review): the three-operand punpcklbw/pshufb forms are presumably
; expanded by the x86inc macros for the SSSE3 build - confirm.
cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
    ; point y/u/v at the end of the data; widthq becomes minus the chroma width
    add     yq, widthq
    shr     widthq, 1
    add     uq, widthq
    add     vq, widthq
    neg     widthq

    mova    m4, [v210_enc_min_8]
    mova    m5, [v210_enc_max_8]
    pxor    m6, m6

.loop:
    movu    m1, [yq+2*widthq]
    CLIPUB  m1, m4, m5

    ; m0 = luma bytes 0..7 zero-extended to words (m6 is all zero)
    punpcklbw m0, m1, m6
    ; can't unpack high bytes in the same way because we process
    ; only six bytes at a time
    pshufb  m1, [v210_enc_luma_shuf_8]

    pmullw  m0, [v210_enc_luma_mult_8]
    pmullw  m1, [v210_enc_luma_mult_8]
    pshufb  m0, [v210_enc_luma_shuf_10]
    pshufb  m1, [v210_enc_luma_shuf_10]

    ; u bytes in the low half, v bytes in the high half
    movq    m3, [uq+widthq]
    movhps  m3, [vq+widthq]
    CLIPUB  m3, m4, m5

    ; shuffle and multiply to get the same packing as in 10-bit
    pshufb  m2, m3, [v210_enc_chroma_shuf1_8]
    pshufb  m3, [v210_enc_chroma_shuf2_8]

    pmullw  m2, [v210_enc_chroma_mult_8]
    pmullw  m3, [v210_enc_chroma_mult_8]
    pshufb  m2, [v210_enc_chroma_shuf_10]
    pshufb  m3, [v210_enc_chroma_shuf_10]

    por     m0, m2
    por     m1, m3

    movu    [dstq], m0
    movu    [dstq+mmsize], m1

    add     dstq, 2*mmsize
    add     widthq, 6
    jl .loop

    RET
%endmacro

INIT_XMM ssse3
v210_planar_pack_8
INIT_XMM avx
v210_planar_pack_8
|
37
libavcodec/x86/v210enc_init.c
Normal file
37
libavcodec/x86/v210enc_init.c
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
/*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavutil/x86/cpu.h"
|
||||||
|
#include "libavcodec/v210enc.h"
|
||||||
|
|
||||||
|
void ff_v210_planar_pack_8_ssse3(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width);
|
||||||
|
void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width);
|
||||||
|
void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width);
|
||||||
|
|
||||||
|
av_cold void ff_v210enc_init_x86(V210EncContext *s)
|
||||||
|
{
|
||||||
|
int cpu_flags = av_get_cpu_flags();
|
||||||
|
|
||||||
|
if( EXTERNAL_SSSE3(cpu_flags) ) {
|
||||||
|
s->pack_line_8 = ff_v210_planar_pack_8_ssse3;
|
||||||
|
s->pack_line_10 = ff_v210_planar_pack_10_ssse3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( EXTERNAL_AVX(cpu_flags) )
|
||||||
|
s->pack_line_8 = ff_v210_planar_pack_8_avx;
|
||||||
|
}
|
@ -641,6 +641,11 @@
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
; Clamp every unsigned byte of %1 to the range given by %2 (minimum) and
; %3 (maximum). %1 is modified in place.
%macro CLIPUB 3 ;(dst, min, max)
|
||||||
|
pmaxub %1, %2 ; bytes below min are raised to min
|
||||||
|
pminub %1, %3 ; bytes above max are lowered to max
|
||||||
|
%endmacro
|
||||||
|
|
||||||
%macro CLIPW 3 ;(dst, min, max)
|
%macro CLIPW 3 ;(dst, min, max)
|
||||||
pmaxsw %1, %2
|
pmaxsw %1, %2
|
||||||
pminsw %1, %3
|
pminsw %1, %3
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
895d30660eb4da017568141a8d1df4e8 *tests/data/fate/vsynth1-v210.avi
|
b066679e08cd90c342da21c88bec2a20 *tests/data/fate/vsynth1-v210.avi
|
||||||
14752448 tests/data/fate/vsynth1-v210.avi
|
14752448 tests/data/fate/vsynth1-v210.avi
|
||||||
50973792d3f1abe04a51ee0121f077f2 *tests/data/fate/vsynth1-v210.out.rawvideo
|
2ba7f4ca302f3c4147860b9dfb12b6e4 *tests/data/fate/vsynth1-v210.out.rawvideo
|
||||||
stddev: 1.85 PSNR: 42.78 MAXDIFF: 29 bytes: 7603200/ 7603200
|
stddev: 1.84 PSNR: 42.81 MAXDIFF: 29 bytes: 7603200/ 7603200
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
6fbbcfee1832fe4c62aacb70454cff62 *tests/data/fate/vsynth2-v210.avi
|
fa1c4b1b8d0e9454b4bc2269c7fe634b *tests/data/fate/vsynth2-v210.avi
|
||||||
14752448 tests/data/fate/vsynth2-v210.avi
|
14752448 tests/data/fate/vsynth2-v210.avi
|
||||||
a627fb50c8276200fd71383977d87ca3 *tests/data/fate/vsynth2-v210.out.rawvideo
|
7ba6e411e43c6b57c95c49d6848f41e6 *tests/data/fate/vsynth2-v210.out.rawvideo
|
||||||
stddev: 0.34 PSNR: 57.43 MAXDIFF: 6 bytes: 7603200/ 7603200
|
stddev: 0.34 PSNR: 57.41 MAXDIFF: 6 bytes: 7603200/ 7603200
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
d2f5e07f0c0e917d80d63f39d683919e *tests/data/fate/vsynth3-v210.avi
|
6618ab86d047f4fb8fdd2d633888b20b *tests/data/fate/vsynth3-v210.avi
|
||||||
224448 tests/data/fate/vsynth3-v210.avi
|
224448 tests/data/fate/vsynth3-v210.avi
|
||||||
0cf7cf68724fa5146b1667e4fa08b0e1 *tests/data/fate/vsynth3-v210.out.rawvideo
|
198ffb24c06927d8aaac5e59d81a0934 *tests/data/fate/vsynth3-v210.out.rawvideo
|
||||||
stddev: 2.12 PSNR: 41.58 MAXDIFF: 26 bytes: 86700/ 86700
|
stddev: 2.11 PSNR: 41.61 MAXDIFF: 27 bytes: 86700/ 86700
|
||||||
|
Loading…
x
Reference in New Issue
Block a user