rv40dsp x86: MMX/MMX2/3DNow/SSE2/SSSE3 implementations of MC
Code mostly inspired by vp8's MC, however:
- its MMX2 horizontal filter is worse because it can't take advantage of the
  coefficient redundancy
- that same coefficient redundancy allows better code for non-SSSE3 versions

Benchmark (rounded to tens of units):
        V8x8  H8x8  2D8x8  V16x16  H16x16  2D16x16
C        445   358    985    1785    1559     3280
MMX*     219   271    478     714     929     1443
SSE2     131   158    294     425     515      892
SSSE3    120   122    248     387     390      763

The end result is an overall speedup of around 15% for the SSSE3 version (on 6 sequences); all loop filter functions now take around 55% of decoding time, while luma MC dsp functions take around 6%, chroma ones 1.3% and biweight around 2.3%.

Signed-off-by: Diego Biurrun <diego@biurrun.de>
This commit is contained in:
parent
706b998cdc
commit
110d0cdc9d
@ -1791,6 +1791,22 @@ QPEL_2TAP(avg_, 16, 3dnow)
|
||||
QPEL_2TAP(put_, 8, 3dnow)
|
||||
QPEL_2TAP(avg_, 8, 3dnow)
|
||||
|
||||
/*
 * RV40 quarter-pel position (3,3) entry points.
 *
 * Each generated function forwards its arguments unchanged to the MMX
 * "xy2" pixel routine of the same operation (put/avg) and block size,
 * passing the block size as the last argument.  One expansion per
 * put/avg x 8/16 combination; the exported names are
 * ff_{put,avg}_rv40_qpel{8,16}_mc33_mmx.
 */
#define RV40_QPEL_MC33_MMX(OP, SIZE)                                          \
void ff_ ## OP ## _rv40_qpel ## SIZE ## _mc33_mmx(uint8_t *dst, uint8_t *src, \
                                                  int stride)                 \
{                                                                             \
    OP ## _pixels ## SIZE ## _xy2_mmx(dst, src, stride, SIZE);                \
}

RV40_QPEL_MC33_MMX(put,  8)
RV40_QPEL_MC33_MMX(put, 16)
RV40_QPEL_MC33_MMX(avg,  8)
RV40_QPEL_MC33_MMX(avg, 16)

#undef RV40_QPEL_MC33_MMX
|
||||
|
||||
#if HAVE_YASM
|
||||
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
|
||||
|
@ -199,6 +199,11 @@ void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
|
||||
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
|
||||
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
|
||||
|
||||
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||
|
||||
void ff_mmx_idct(DCTELEM *block);
|
||||
void ff_mmxext_idct(DCTELEM *block);
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
;******************************************************************************
|
||||
;* MMX/SSE2-optimized functions for the RV40 decoder
|
||||
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
|
||||
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
@ -25,11 +27,319 @@
|
||||
SECTION_RODATA
|
||||
|
||||
align 16
|
||||
shift_round: times 8 dw 1 << (16 - 6)
|
||||
cextern pw_16
|
||||
pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
|
||||
|
||||
sixtap_filter_hb_m: times 8 db 1, -5
|
||||
times 8 db 52, 20
|
||||
; multiplied by 2 to have the same shift
|
||||
times 8 db 2, -10
|
||||
times 8 db 40, 40
|
||||
; back to normal
|
||||
times 8 db 1, -5
|
||||
times 8 db 20, 52
|
||||
|
||||
sixtap_filter_v_m: times 8 dw 1
|
||||
times 8 dw -5
|
||||
times 8 dw 52
|
||||
times 8 dw 20
|
||||
; multiplied by 2 to have the same shift
|
||||
times 8 dw 2
|
||||
times 8 dw -10
|
||||
times 8 dw 40
|
||||
times 8 dw 40
|
||||
; back to normal
|
||||
times 8 dw 1
|
||||
times 8 dw -5
|
||||
times 8 dw 20
|
||||
times 8 dw 52
|
||||
|
||||
%ifdef PIC
|
||||
%define sixtap_filter_hw picregq
|
||||
%define sixtap_filter_hb picregq
|
||||
%define sixtap_filter_v picregq
|
||||
%define npicregs 1
|
||||
%else
|
||||
%define sixtap_filter_hw sixtap_filter_hw_m
|
||||
%define sixtap_filter_hb sixtap_filter_hb_m
|
||||
%define sixtap_filter_v sixtap_filter_v_m
|
||||
%define npicregs 0
|
||||
%endif
|
||||
|
||||
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
|
||||
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
|
||||
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
|
||||
|
||||
cextern pw_32
|
||||
cextern pw_16
|
||||
cextern pw_512
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; subpel MC functions:
|
||||
;
|
||||
; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
|
||||
; uint8_t *src, int srcstride,
|
||||
; int len, int m);
|
||||
;----------------------------------------------------------------------
|
||||
; LOAD offset, table
;   %1 = argument name (without size suffix) holding a byte offset
;   %2 = address of the coefficient table (only used when PIC is not defined)
;
; Converts the offset in %1q into an absolute address inside the table.
; When PIC is defined the table base is taken from picregq, which must
; already contain it when this macro is expanded.
%macro LOAD 2
%if WIN64
    movsxd      %1q, %1d            ; widen the 32-bit value to 64 bits
%endif
%ifdef PIC
    add         %1q, picregq
%else
    add         %1q, %2
%endif
%endmacro
|
||||
|
||||
; STORE reg, tmp, op
;   %1 = register of 16-bit results; packed in place to unsigned bytes
;   %2 = scratch register, written only when %3 is avg
;   %3 = avg to average with the pixels already at [dstq]; any other value
;        (put) stores the packed result directly
;
; Writes one movh-sized group of pixels to [dstq].  The averaging instruction
; is pavgusb when the 3dnow cpuflag is active and pavgb otherwise.
%macro STORE 3
%ifidn %3, avg
    movh         %2, [dstq]         ; fetch current destination pixels first
    packuswb     %1, %1
%if cpuflag(3dnow)
    pavgusb      %1, %2
%else
    pavgb        %1, %2
%endif
%else
    packuswb     %1, %1
%endif
    movh     [dstq], %1
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
; FILTER_V op            (op = put or avg, forwarded to STORE)
;
; Emits %1_rv40_qpel_v: vertical 6-tap filter computed on 16-bit words.
; Named arguments: dst, dststride, src, srcstride, height, my
;   my is a byte offset into sixtap_filter_v_m; the four 16-byte coefficient
;   rows at my+0/16/32/48 are used as COEFF05, COEFF14, COEFF2 and COEFF3.
;
; Register roles in the row loop:
;   m0..m4  the five previously loaded source rows, oldest first, as words
;   m5      freshly loaded row (reused as STORE scratch once copied to m4)
;   m6      accumulator
;   m7      zero, for byte->word unpacking
;   m8..m11 cached coefficients when those registers exist; otherwise the
;           coefficients are used as memory operands
;
; Per output row:
;   ((m0+m5)*COEFF05 + (m1+m4)*COEFF14 + m2*COEFF2 + m3*COEFF3 + 32) >> 6
;-----------------------------------------------------------------------------
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea       picregq, [sixtap_filter_v_m]
%endif
    pxor           m7, m7
    LOAD           my, sixtap_filter_v

    ; prime the pipeline with 5 rows, starting two rows above src
    sub          srcq, srcstrideq
    sub          srcq, srcstrideq
    movh           m0, [srcq]
    movh           m1, [srcq+srcstrideq]
    movh           m2, [srcq+srcstrideq*2]
    lea          srcq, [srcq+srcstrideq*2]
    add          srcq, srcstrideq
    movh           m3, [srcq]
    movh           m4, [srcq+srcstrideq]
    punpcklbw      m0, m7
    punpcklbw      m1, m7
    punpcklbw      m2, m7
    punpcklbw      m3, m7
    punpcklbw      m4, m7

%ifdef m8
    mova           m8, [myq+ 0]
    mova           m9, [myq+16]
    mova          m10, [myq+32]
    mova          m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%endif
.row_loop:
    mova           m6, m1
    movh           m5, [srcq+2*srcstrideq]  ; sixth row of the filter window
    paddw          m6, m4
    punpcklbw      m5, m7
    pmullw         m6, COEFF14
    paddw          m0, m5
    pmullw         m0, COEFF05
    paddw          m6, m0
    mova           m0, m1                   ; rotate rows while accumulating
    paddw          m6, [pw_32]
    mova           m1, m2
    pmullw         m2, COEFF2
    paddw          m6, m2
    mova           m2, m3
    pmullw         m3, COEFF3
    paddw          m6, m3

    ; finish the rotation, then shift down, pack and store
    mova           m3, m4
    psraw          m6, 6
    mova           m4, m5
    STORE          m6, m5, %1

    ; advance both pointers by one row
    add          dstq, dststrideq
    add          srcq, srcstrideq
    dec       heightd                       ; rows left
    jg .row_loop
    REP_RET
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
; FILTER_H op            (op = put or avg, forwarded to STORE)
;
; Emits %1_rv40_qpel_h: horizontal 6-tap filter computed on 16-bit words.
; Named arguments: dst, dststride, src, srcstride, height, mx
;   mx is a byte offset into sixtap_filter_v_m (this version shares the word
;   coefficient table with FILTER_V); rows at mx+0/16/32/48 are used as
;   COEFF05, COEFF14, COEFF2 and COEFF3.
;
; Register roles in the row loop:
;   m0..m5  source pixels at byte offsets -2, -1, 0, +1, +2, +3 from srcq,
;           unpacked to words (m0/m5 and m1/m4 are summed before multiplying)
;   m6      rounding constant loaded from pw_32
;   m7      zero, for byte->word unpacking
;
; Per output row:
;   ((m0+m5)*COEFF05 + (m1+m4)*COEFF14 + m2*COEFF2 + m3*COEFF3 + 32) >> 6
;-----------------------------------------------------------------------------
%macro FILTER_H 1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea       picregq, [sixtap_filter_v_m]
%endif
    pxor           m7, m7
    LOAD           mx, sixtap_filter_v
    mova           m6, [pw_32]
%ifdef m8
    mova           m8, [mxq+ 0]
    mova           m9, [mxq+16]
    mova          m10, [mxq+32]
    mova          m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%endif
.row_loop:
    movq           m0, [srcq-2]
    movq           m5, [srcq+3]
    movq           m1, [srcq-1]
    movq           m4, [srcq+2]
    punpcklbw      m0, m7
    punpcklbw      m5, m7
    punpcklbw      m1, m7
    punpcklbw      m4, m7
    movq           m2, [srcq-0]
    movq           m3, [srcq+1]
    paddw          m0, m5                   ; outer tap pair
    paddw          m1, m4                   ; second tap pair
    punpcklbw      m2, m7
    punpcklbw      m3, m7
    pmullw         m0, COEFF05
    pmullw         m1, COEFF14
    pmullw         m2, COEFF2
    pmullw         m3, COEFF3
    paddw          m0, m6                   ; add rounding term
    paddw          m1, m2
    paddw          m0, m3
    paddw          m0, m1
    psraw          m0, 6
    STORE          m0, m1, %1

    ; advance both pointers by one row
    add          dstq, dststrideq
    add          srcq, srcstrideq
    dec       heightd                       ; rows left
    jg .row_loop
    REP_RET
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
FILTER_V put
|
||||
FILTER_H put
|
||||
|
||||
INIT_MMX mmx2
|
||||
FILTER_V avg
|
||||
FILTER_H avg
|
||||
|
||||
INIT_MMX 3dnow
|
||||
FILTER_V avg
|
||||
FILTER_H avg
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
FILTER_H put
|
||||
FILTER_H avg
|
||||
FILTER_V put
|
||||
FILTER_V avg
|
||||
|
||||
;-----------------------------------------------------------------------------
; FILTER_SSSE3 op        (op = put or avg, forwarded to STORE)
;
; Emits both %1_rv40_qpel_v and %1_rv40_qpel_h using pmaddubsw on byte
; coefficient pairs taken from sixtap_filter_hb_m.
; Named arguments: dst, dststride, src, srcstride, height, my|mx
;   my/mx is a byte offset into sixtap_filter_hb_m; the 16 bytes at +0 are
;   applied to two pixel pairs of each output, the 16 bytes at +16 to the
;   remaining pair.
;
; Rounding: pmulhrsw with pw_512 computes (x*512 + 0x4000) >> 15, which equals
; (x + 32) >> 6.
;-----------------------------------------------------------------------------
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea       picregq, [sixtap_filter_hb_m]
%endif

    ; prime m0..m4 with 5 rows (kept as bytes), starting two rows above src
    sub          srcq, srcstrideq
    LOAD           my, sixtap_filter_hb
    sub          srcq, srcstrideq
    movh           m0, [srcq]
    movh           m1, [srcq+srcstrideq]
    movh           m2, [srcq+srcstrideq*2]
    lea          srcq, [srcq+srcstrideq*2]
    add          srcq, srcstrideq
    mova           m5, [myq]                ; first coefficient pair
    movh           m3, [srcq]
    movh           m4, [srcq+srcstrideq]
    lea          srcq, [srcq+2*srcstrideq]  ; srcq -> sixth row of the window

.row_loop:
    mova           m6, m2
    punpcklbw      m0, m1                   ; interleave rows 0 and 1
    punpcklbw      m6, m3                   ; interleave rows 2 and 3
    pmaddubsw      m0, m5
    pmaddubsw      m6, [myq+16]
    movh           m7, [srcq]               ; sixth row of the window
    paddw          m6, m0
    mova           m0, m1                   ; rotate the row window by one
    mova           m1, m2
    mova           m2, m3
    mova           m3, m4
    mova           m4, m7
    punpcklbw      m7, m3                   ; interleave rows 5 and 4
    pmaddubsw      m7, m5
    paddw          m6, m7
    pmulhrsw       m6, [pw_512]
    STORE          m6, m7, %1

    ; advance both pointers by one row
    add          dstq, dststrideq
    add          srcq, srcstrideq
    dec       heightd                       ; rows left
    jg .row_loop
    REP_RET

cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea       picregq, [sixtap_filter_hb_m]
%endif
    mova           m3, [filter_h6_shuf2]
    mova           m4, [filter_h6_shuf3]
    LOAD           mx, sixtap_filter_hb
    mova           m5, [mxq]                ; first byte coefficient pair
    mova           m6, [mxq+16]             ; second byte coefficient pair
    mova           m7, [filter_h6_shuf1]

.row_loop:
    movu           m0, [srcq-2]             ; unaligned load, 2 bytes left of src
    mova           m1, m0
    mova           m2, m0
    pshufb         m0, m7                   ; pixel pairs per filter_h6_shuf1
    pshufb         m1, m3                   ; pixel pairs per filter_h6_shuf2
    pshufb         m2, m4                   ; pixel pairs per filter_h6_shuf3
    pmaddubsw      m0, m5
    pmaddubsw      m1, m6
    pmaddubsw      m2, m5
    paddw          m0, m1
    paddw          m0, m2
    pmulhrsw       m0, [pw_512]
    STORE          m0, m1, %1

    ; advance both pointers by one row
    add          dstq, dststrideq
    add          srcq, srcstrideq
    dec       heightd                       ; rows left
    jg .row_loop
    REP_RET
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
FILTER_SSSE3 put
|
||||
FILTER_SSSE3 avg
|
||||
|
||||
; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
|
||||
%macro RV40_WCORE 4-5
|
||||
movh m4, [%3 + r6 + 0]
|
||||
@ -143,7 +453,7 @@ SECTION .text
|
||||
%macro RV40_WEIGHT 3
|
||||
cglobal rv40_weight_func_%1_%2, 6, 7, 8
|
||||
%if cpuflag(ssse3)
|
||||
mova m1, [shift_round]
|
||||
mova m1, [pw_1024]
|
||||
%else
|
||||
mova m1, [pw_16]
|
||||
%endif
|
||||
|
@ -22,8 +22,11 @@
|
||||
/**
|
||||
* @file
|
||||
* RV40 decoder motion compensation functions x86-optimised
|
||||
* 2,0 and 0,2 have h264 equivalents.
|
||||
* 3,3 is bugged in the rv40 format and maps to _xy2 version
|
||||
*/
|
||||
|
||||
#include "libavcodec/x86/dsputil_mmx.h"
|
||||
#include "libavcodec/rv34dsp.h"
|
||||
|
||||
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
|
||||
@ -53,6 +56,132 @@ DECLARE_WEIGHT(mmx)
|
||||
DECLARE_WEIGHT(sse2)
|
||||
DECLARE_WEIGHT(ssse3)
|
||||
|
||||
/** @{ */
|
||||
/**
 * Define one qpel function.
 * LOOPSIZE must be already set to the number of pixels processed per
 * iteration in the inner loop of the called functions.
 * HCOFF(x) and VCOFF(x) must be already defined so as to provide the offset
 * into any array of coeffs used by the called horizontal, respectively
 * vertical, function for the qpel position x.
 *
 * The generated static function has the signature
 *     void OP rv40_qpel SIZE _mc PH PV OPT(uint8_t *dst, uint8_t *src,
 *                                          int stride)
 * and selects one of three paths from the (constant) PH/PV values:
 *  - PH and PV both non-zero: the horizontal put filter writes SIZE + 5
 *    rows, starting two rows above the block, into a SIZE-wide temporary
 *    buffer; the OP vertical filter then runs on that buffer, starting at
 *    its third row, and writes to dst.
 *  - only PV non-zero: OP vertical filter straight from src to dst.
 *  - otherwise: OP horizontal filter straight from src to dst.
 * Every path walks the block in column strips of LOOPSIZE pixels.
 *
 * The expansion does not end in a semicolon: it is a complete function
 * definition, and a trailing ';' at file scope is not valid ISO C.
 */
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT)                           \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
                                                         uint8_t *src,  \
                                                         int stride)    \
{                                                                       \
    int i;                                                              \
    if (PH && PV) {                                                     \
        DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)];           \
        uint8_t *tmpptr = tmp + SIZE * 2;                               \
        src -= stride * 2;                                              \
                                                                        \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride,    \
                                     SIZE + 5, HCOFF(PH));              \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i,   \
                                         SIZE, SIZE, VCOFF(PV));        \
    } else if (PV) {                                                    \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, VCOFF(PV));     \
    } else {                                                            \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, HCOFF(PH));     \
    }                                                                   \
}
|
||||
|
||||
/**
 * Define the 8x8 and the 16x16 variant of one qpel function, for the given
 * operation prefix, qpel position (PH, PV) and instruction-set suffix.
 */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT)                                \
    QPEL_FUNC_DECL(OP,  8, PH, PV, OPT)                                 \
    QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
|
||||
|
||||
/**
 * Emit everything needed for one operation / instruction-set pair:
 * prototypes for the two filter entry points ff_<OP>rv40_qpel_h<OPT> and
 * ff_<OP>rv40_qpel_v<OPT>, followed by the static wrappers for both block
 * sizes at each of the twelve qpel positions listed below.
 * Positions (0,0), (2,0), (0,2) and (3,3) are not generated here.
 */
#define QPEL_MC_DECL(OP, OPT)                                           \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride,    \
                                  const uint8_t *src,                   \
                                  ptrdiff_t srcStride,                  \
                                  int len, int m);                      \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride,    \
                                  const uint8_t *src,                   \
                                  ptrdiff_t srcStride,                  \
                                  int len, int m);                      \
    QPEL_FUNCS_DECL(OP, 0, 1, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 0, 3, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 1, 0, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 1, 1, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 1, 2, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 1, 3, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 2, 1, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 2, 2, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 2, 3, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 3, 0, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 3, 1, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 3, 2, OPT)
|
||||
/** @} */
|
||||
|
||||
#define LOOPSIZE 8
|
||||
#define HCOFF(x) (32 * (x - 1))
|
||||
#define VCOFF(x) (32 * (x - 1))
|
||||
QPEL_MC_DECL(put_, _ssse3)
|
||||
QPEL_MC_DECL(avg_, _ssse3)
|
||||
|
||||
#undef LOOPSIZE
|
||||
#undef HCOFF
|
||||
#undef VCOFF
|
||||
#define LOOPSIZE 8
|
||||
#define HCOFF(x) (64 * (x - 1))
|
||||
#define VCOFF(x) (64 * (x - 1))
|
||||
QPEL_MC_DECL(put_, _sse2)
|
||||
QPEL_MC_DECL(avg_, _sse2)
|
||||
|
||||
#if ARCH_X86_32
|
||||
#undef LOOPSIZE
|
||||
#undef HCOFF
|
||||
#undef VCOFF
|
||||
#define LOOPSIZE 4
|
||||
#define HCOFF(x) (64 * (x - 1))
|
||||
#define VCOFF(x) (64 * (x - 1))
|
||||
|
||||
QPEL_MC_DECL(put_, _mmx)
|
||||
|
||||
#define ff_put_rv40_qpel_h_mmx2 ff_put_rv40_qpel_h_mmx
|
||||
#define ff_put_rv40_qpel_v_mmx2 ff_put_rv40_qpel_v_mmx
|
||||
QPEL_MC_DECL(avg_, _mmx2)
|
||||
|
||||
#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
|
||||
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
|
||||
QPEL_MC_DECL(avg_, _3dnow)
|
||||
#endif
|
||||
|
||||
/** @{ */
|
||||
/**
 * Store one generated qpel function into the table belonging to operation
 * OP of the context pointed to by c.
 * Row index is 2 - SIZE / 8 (0 for SIZE 16, 1 for SIZE 8); column index is
 * 4 * PV + PH.  The expansion carries its own terminating semicolon.
 */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT)                            \
    c->OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] =                    \
        OP ## rv40_qpel ## SIZE ## _mc ## PH ## PV ## OPT;
|
||||
|
||||
/**
 * Register the 8x8 and the 16x16 function of one operation (OP) for a given
 * qpel position (PH, PV) and instruction-set suffix.
 */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT)                                 \
    QPEL_FUNC_SET(OP,  8, PH, PV, OPT)                                  \
    QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
|
||||
|
||||
/**
 * Register, for one operation / instruction-set pair, the functions for both
 * block sizes at each of the twelve qpel positions listed below.
 * Positions (0,0), (2,0), (0,2) and (3,3) are left untouched by this macro.
 */
#define QPEL_MC_SET(OP, OPT)                                            \
    QPEL_FUNCS_SET(OP, 0, 1, OPT)                                       \
    QPEL_FUNCS_SET(OP, 0, 3, OPT)                                       \
    QPEL_FUNCS_SET(OP, 1, 0, OPT)                                       \
    QPEL_FUNCS_SET(OP, 1, 1, OPT)                                       \
    QPEL_FUNCS_SET(OP, 1, 2, OPT)                                       \
    QPEL_FUNCS_SET(OP, 1, 3, OPT)                                       \
    QPEL_FUNCS_SET(OP, 2, 1, OPT)                                       \
    QPEL_FUNCS_SET(OP, 2, 2, OPT)                                       \
    QPEL_FUNCS_SET(OP, 2, 3, OPT)                                       \
    QPEL_FUNCS_SET(OP, 3, 0, OPT)                                       \
    QPEL_FUNCS_SET(OP, 3, 1, OPT)                                       \
    QPEL_FUNCS_SET(OP, 3, 2, OPT)
|
||||
/** @} */
|
||||
|
||||
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
|
||||
{
|
||||
#if HAVE_YASM
|
||||
@ -65,25 +194,42 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
|
||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
|
||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
|
||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
|
||||
c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
|
||||
c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
|
||||
c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
|
||||
c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
|
||||
#if ARCH_X86_32
|
||||
QPEL_MC_SET(put_, _mmx)
|
||||
#endif
|
||||
}
|
||||
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
||||
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
|
||||
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
|
||||
#if ARCH_X86_32
|
||||
QPEL_MC_SET(avg_, _mmx2)
|
||||
#endif
|
||||
} else if (mm_flags & AV_CPU_FLAG_3DNOW) {
|
||||
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
|
||||
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
|
||||
#if ARCH_X86_32
|
||||
QPEL_MC_SET(avg_, _3dnow)
|
||||
#endif
|
||||
}
|
||||
if (mm_flags & AV_CPU_FLAG_SSE2) {
|
||||
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
|
||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
|
||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
|
||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
|
||||
QPEL_MC_SET(put_, _sse2)
|
||||
QPEL_MC_SET(avg_, _sse2)
|
||||
}
|
||||
if (mm_flags & AV_CPU_FLAG_SSSE3) {
|
||||
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
|
||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
|
||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
|
||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
|
||||
QPEL_MC_SET(put_, _ssse3)
|
||||
QPEL_MC_SET(avg_, _ssse3)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user