vp9: add fullpel (put) MC SIMD for 10/12bpp.
This commit is contained in:
parent
d64f7d4213
commit
6354ff0383
@ -62,7 +62,8 @@ OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o
|
||||
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
|
||||
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
|
||||
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
|
||||
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
|
||||
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
|
||||
x86/vp9dsp_init_16bpp.o
|
||||
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
|
||||
|
||||
|
||||
|
@ -23,31 +23,26 @@
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/mem.h"
|
||||
#include "libavutil/x86/asm.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/vp9dsp.h"
|
||||
#include "libavcodec/x86/vp9dsp_init.h"
|
||||
|
||||
#if HAVE_YASM
|
||||
|
||||
#define fpel_func(avg, sz, opt) \
|
||||
void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
const uint8_t *src, ptrdiff_t src_stride, \
|
||||
int h, int mx, int my)
|
||||
fpel_func(put, 4, mmx);
|
||||
fpel_func(put, 8, mmx);
|
||||
fpel_func(put, 16, sse);
|
||||
fpel_func(put, 32, sse);
|
||||
fpel_func(put, 64, sse);
|
||||
fpel_func(avg, 4, mmxext);
|
||||
fpel_func(avg, 8, mmxext);
|
||||
fpel_func(avg, 16, sse2);
|
||||
fpel_func(avg, 32, sse2);
|
||||
fpel_func(avg, 64, sse2);
|
||||
fpel_func(put, 32, avx);
|
||||
fpel_func(put, 64, avx);
|
||||
fpel_func(avg, 32, avx2);
|
||||
fpel_func(avg, 64, avx2);
|
||||
#undef fpel_func
|
||||
decl_fpel_func(put, 4, mmx);
|
||||
decl_fpel_func(put, 8, mmx);
|
||||
decl_fpel_func(put, 16, sse);
|
||||
decl_fpel_func(put, 32, sse);
|
||||
decl_fpel_func(put, 64, sse);
|
||||
decl_fpel_func(avg, 4, mmxext);
|
||||
decl_fpel_func(avg, 8, mmxext);
|
||||
decl_fpel_func(avg, 16, sse2);
|
||||
decl_fpel_func(avg, 32, sse2);
|
||||
decl_fpel_func(avg, 64, sse2);
|
||||
decl_fpel_func(put, 32, avx);
|
||||
decl_fpel_func(put, 64, avx);
|
||||
decl_fpel_func(avg, 32, avx2);
|
||||
decl_fpel_func(avg, 64, avx2);
|
||||
|
||||
#define mc_func(avg, sz, dir, opt, type, f_sz) \
|
||||
void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
|
||||
@ -311,16 +306,13 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
|
||||
{
|
||||
#if HAVE_YASM
|
||||
int cpu_flags;
|
||||
if (bpp != 8) return;
|
||||
if (bpp != 8) {
|
||||
ff_vp9dsp_init_16bpp_x86(dsp, bpp);
|
||||
return;
|
||||
}
|
||||
|
||||
cpu_flags = av_get_cpu_flags();
|
||||
|
||||
#define init_fpel(idx1, idx2, sz, type, opt) \
|
||||
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
|
||||
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##_##opt
|
||||
|
||||
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
|
||||
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
|
||||
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
|
||||
@ -386,8 +378,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
|
||||
} while (0)
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
init_fpel(4, 0, 4, put, mmx);
|
||||
init_fpel(3, 0, 8, put, mmx);
|
||||
init_fpel_func(4, 0, 4, put, mmx);
|
||||
init_fpel_func(3, 0, 8, put, mmx);
|
||||
if (!bitexact) {
|
||||
dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
|
||||
dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
|
||||
@ -400,8 +392,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
init_subpel2(4, 0, 4, put, mmxext);
|
||||
init_subpel2(4, 1, 4, avg, mmxext);
|
||||
init_fpel(4, 1, 4, avg, mmxext);
|
||||
init_fpel(3, 1, 8, avg, mmxext);
|
||||
init_fpel_func(4, 1, 4, avg, mmxext);
|
||||
init_fpel_func(3, 1, 8, avg, mmxext);
|
||||
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
|
||||
init_dc_ipred(4, mmxext);
|
||||
init_dc_ipred(8, mmxext);
|
||||
@ -409,9 +401,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE(cpu_flags)) {
|
||||
init_fpel(2, 0, 16, put, sse);
|
||||
init_fpel(1, 0, 32, put, sse);
|
||||
init_fpel(0, 0, 64, put, sse);
|
||||
init_fpel_func(2, 0, 16, put, sse);
|
||||
init_fpel_func(1, 0, 32, put, sse);
|
||||
init_fpel_func(0, 0, 64, put, sse);
|
||||
init_ipred(16, sse, v, VERT);
|
||||
init_ipred(32, sse, v, VERT);
|
||||
}
|
||||
@ -419,9 +411,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
init_subpel3_8to64(0, put, sse2);
|
||||
init_subpel3_8to64(1, avg, sse2);
|
||||
init_fpel(2, 1, 16, avg, sse2);
|
||||
init_fpel(1, 1, 32, avg, sse2);
|
||||
init_fpel(0, 1, 64, avg, sse2);
|
||||
init_fpel_func(2, 1, 16, avg, sse2);
|
||||
init_fpel_func(1, 1, 32, avg, sse2);
|
||||
init_fpel_func(0, 1, 64, avg, sse2);
|
||||
init_lpf(sse2);
|
||||
dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
|
||||
dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
|
||||
@ -491,14 +483,14 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
|
||||
init_dir_tm_h_ipred(32, avx);
|
||||
}
|
||||
if (EXTERNAL_AVX_FAST(cpu_flags)) {
|
||||
init_fpel(1, 0, 32, put, avx);
|
||||
init_fpel(0, 0, 64, put, avx);
|
||||
init_fpel_func(1, 0, 32, put, avx);
|
||||
init_fpel_func(0, 0, 64, put, avx);
|
||||
init_ipred(32, avx, v, VERT);
|
||||
}
|
||||
|
||||
if (EXTERNAL_AVX2(cpu_flags)) {
|
||||
init_fpel(1, 1, 32, avg, avx2);
|
||||
init_fpel(0, 1, 64, avg, avx2);
|
||||
init_fpel_func(1, 1, 32, avg, avx2);
|
||||
init_fpel_func(0, 1, 64, avg, avx2);
|
||||
if (ARCH_X86_64) {
|
||||
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
|
||||
init_subpel3_32_64(0, put, avx2);
|
||||
|
39
libavcodec/x86/vp9dsp_init.h
Normal file
39
libavcodec/x86/vp9dsp_init.h
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* VP9 SIMD optimizations
|
||||
*
|
||||
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_X86_VP9DSP_INIT_H
|
||||
#define AVCODEC_X86_VP9DSP_INIT_H
|
||||
|
||||
/*
 * Declare the prototype of one fullpel motion-compensation routine named
 * ff_vp9_<avg><sz>_<opt>.
 *
 *   avg - operation part of the symbol name (callers in this tree pass
 *         put or avg)
 *   sz  - size part of the symbol name
 *   opt - instruction-set suffix of the symbol name (mmx, sse, avx, ...)
 *
 * The expansion carries no trailing semicolon; the caller supplies it.
 */
#define decl_fpel_func(avg, sz, opt) \
void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                              const uint8_t *src, ptrdiff_t src_stride, \
                              int h, int mx, int my)
|
||||
|
||||
/*
 * Install one fullpel routine into the motion-compensation table of the
 * `dsp` pointer, which must be in scope wherever the macro is used.
 *
 * The [0][0] slot of every filter type (smooth, regular, sharp, bilinear)
 * receives the same function, ff_vp9_<type><sz>_<opt>.
 *
 *   idx1          - first index into dsp->mc
 *   idx2          - third index into dsp->mc (callers in this tree pass 0
 *                   together with put and 1 together with avg)
 *   sz, type, opt - pasted together to form the function name
 *
 * Expands to a single chained assignment expression with no trailing
 * semicolon; the caller supplies it.
 */
#define init_fpel_func(idx1, idx2, sz, type, opt) \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_##opt
|
||||
|
||||
/*
 * Set up the x86 SIMD function pointers in dsp for bit depths other than 8;
 * ff_vp9dsp_init_x86() hands over to this function when bpp != 8.
 */
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp);
|
||||
|
||||
#endif /* AVCODEC_X86_VP9DSP_INIT_H */
|
65
libavcodec/x86/vp9dsp_init_16bpp.c
Normal file
65
libavcodec/x86/vp9dsp_init_16bpp.c
Normal file
@ -0,0 +1,65 @@
|
||||
/*
|
||||
* VP9 SIMD optimizations
|
||||
*
|
||||
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/mem.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/vp9dsp.h"
|
||||
#include "libavcodec/x86/vp9dsp_init.h"
|
||||
|
||||
#if HAVE_YASM

/*
 * Prototypes of the fullpel "put" kernels provided by the external
 * assembly. The number in each name is the size argument given to the
 * assembly's fpel_fn macro, which compares it against the register size
 * (mmsize), so it appears to be a row width in bytes rather than in
 * samples.
 */
decl_fpel_func(put,   8, mmx);
decl_fpel_func(put,  16, sse);
decl_fpel_func(put,  32, sse);
decl_fpel_func(put,  64, sse);
decl_fpel_func(put, 128, sse);
decl_fpel_func(put,  32, avx);
decl_fpel_func(put,  64, avx);
decl_fpel_func(put, 128, avx);

#endif /* HAVE_YASM */
|
||||
|
||||
/*
 * Install the x86 SIMD fullpel "put" routines into dsp->mc.
 *
 * dsp - DSP context whose mc[][][0][0][0] entries are filled in for every
 *       filter type, depending on the CPU flags reported at run time
 * bpp - bit depth; not used by this function at present
 *
 * Without HAVE_YASM the function does nothing.
 */
av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
{
#if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        init_fpel_func(4, 0,   8, put, mmx);
    }

    if (EXTERNAL_SSE(cpu_flags)) {
        init_fpel_func(3, 0,  16, put, sse);
        init_fpel_func(2, 0,  32, put, sse);
        init_fpel_func(1, 0,  64, put, sse);
        init_fpel_func(0, 0, 128, put, sse);
    }

    /* Runs after the SSE block, so on CPUs that pass this check the AVX
     * routines replace the SSE ones in slots 2, 1 and 0. */
    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        init_fpel_func(2, 0,  32, put, avx);
        init_fpel_func(1, 0,  64, put, avx);
        init_fpel_func(0, 0, 128, put, avx);
    }

#endif /* HAVE_YASM */
}
|
@ -553,7 +553,7 @@ filter_vx2_fn avg
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
%macro fpel_fn 6
|
||||
%macro fpel_fn 6-7 4
|
||||
%if %2 == 4
|
||||
%define %%srcfn movh
|
||||
%define %%dstfn movh
|
||||
@ -567,13 +567,19 @@ cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
|
||||
lea sstride3q, [sstrideq*3]
|
||||
lea dstride3q, [dstrideq*3]
|
||||
%else
|
||||
cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
|
||||
cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
|
||||
%endif
|
||||
.loop:
|
||||
%%srcfn m0, [srcq]
|
||||
%%srcfn m1, [srcq+s%3]
|
||||
%%srcfn m2, [srcq+s%4]
|
||||
%%srcfn m3, [srcq+s%5]
|
||||
%if %2/mmsize == 8
|
||||
%%srcfn m4, [srcq+mmsize*4]
|
||||
%%srcfn m5, [srcq+mmsize*5]
|
||||
%%srcfn m6, [srcq+mmsize*6]
|
||||
%%srcfn m7, [srcq+mmsize*7]
|
||||
%endif
|
||||
lea srcq, [srcq+sstrideq*%6]
|
||||
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
@ -585,6 +591,12 @@ cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
|
||||
%%dstfn [dstq+d%3], m1
|
||||
%%dstfn [dstq+d%4], m2
|
||||
%%dstfn [dstq+d%5], m3
|
||||
%if %2/mmsize == 8
|
||||
%%dstfn [dstq+mmsize*4], m4
|
||||
%%dstfn [dstq+mmsize*5], m5
|
||||
%%dstfn [dstq+mmsize*6], m6
|
||||
%%dstfn [dstq+mmsize*7], m7
|
||||
%endif
|
||||
lea dstq, [dstq+dstrideq*%6]
|
||||
sub hd, %6
|
||||
jnz .loop
|
||||
@ -605,6 +617,7 @@ INIT_XMM sse
|
||||
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
|
||||
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
|
||||
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
|
||||
fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 8
|
||||
INIT_XMM sse2
|
||||
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
|
||||
fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
|
||||
@ -612,6 +625,7 @@ fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
|
||||
INIT_YMM avx
|
||||
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
|
||||
fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
|
||||
fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
INIT_YMM avx2
|
||||
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
|
||||
|
Loading…
Reference in New Issue
Block a user