Merge commit 'be923ed659016350592acb9b3346f706f8170ac5'

* commit 'be923ed659016350592acb9b3346f706f8170ac5':
  x86: fmtconvert: port to cpuflags
  x86: MMX2 ---> MMXEXT in macro names

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Michael Niedermayer 2012-10-31 14:16:04 +01:00
commit 28c0678eb7
11 changed files with 139 additions and 138 deletions
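
Both merged changes are mechanical: the fmtconvert.asm diff below ports that file from passing an instruction-set suffix as a macro parameter to x86inc.asm's cpuflags mechanism, and the remaining diffs rename MMX2-suffixed macros to MMXEXT. A minimal sketch of the cpuflags idiom follows, assuming x86inc.asm is included; the macro FOO and symbol foo are hypothetical, not code from this diff:

; Old style: the ISA suffix travels as macro argument %1, is
; string-compared, and must be appended to symbol names by hand:
;     %macro FOO 2 ; args: ISA suffix, XMM register count
;     cglobal foo_%1, 3,3,%2, dst, src, len
;     %ifidn %1, sse2
;         cvtdq2ps m0, [srcq]
;     %endif
;     %endmacro
;     INIT_XMM
;     FOO sse2, 3
;
; New style: INIT_XMM sse2 records the ISA in cpuflags and suffixes
; cglobal names automatically, so only the register count remains:
%macro FOO 1
cglobal foo, 3, 3, %1, dst, src, len
%if cpuflag(sse2)        ; flag test; also true for any later ISA
    cvtdq2ps m0, [srcq]
%endif
    RET
%endmacro

INIT_XMM sse2            ; subsequent cglobal names get the sse2 suffix
FOO 3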

View File

@@ -156,7 +156,7 @@ INIT_MMX mmx
 %define ABS2 ABS2_MMX
 AC3_MAX_MSB_ABS_INT16 or_abs
 INIT_MMX mmx2
-%define ABS2 ABS2_MMX2
+%define ABS2 ABS2_MMXEXT
 AC3_MAX_MSB_ABS_INT16 min_max
 INIT_XMM sse2
 AC3_MAX_MSB_ABS_INT16 min_max

View File

@@ -430,7 +430,7 @@ static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, ui
 "mov" #size " " #b ", " #temp " \n\t"\
 "pavgusb " #temp ", " #a " \n\t"\
 "mov" #size " " #a ", " #b " \n\t"
-#define AVG_MMX2_OP(a,b,temp, size) \
+#define AVG_MMXEXT_OP(a, b, temp, size) \
 "mov" #size " " #b ", " #temp " \n\t"\
 "pavgb " #temp ", " #a " \n\t"\
 "mov" #size " " #a ", " #b " \n\t"
@@ -439,7 +439,7 @@ static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, ui
 #if HAVE_MMXEXT_INLINE
 QPEL_CAVS(put_, PUT_OP, mmx2)
-QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2)
+QPEL_CAVS(avg_,AVG_MMXEXT_OP, mmx2)
 CAVS_MC(put_, 8, mmx2)
 CAVS_MC(put_, 16,mmx2)

View File

@@ -943,7 +943,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
 "packuswb %%mm5, %%mm5 \n\t" \
 OP(%%mm5, out, %%mm7, d)
-#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW) \
+#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT, OP_3DNOW) \
 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \
 uint8_t *src, \
 int dstStride, \
@@ -1011,7 +1011,7 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \
 "psraw $5, %%mm3 \n\t" \
 "movq %5, %%mm1 \n\t" \
 "packuswb %%mm3, %%mm1 \n\t" \
-OP_MMX2(%%mm1, (%1), %%mm4, q) \
+OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
 /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
 \
 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
@@ -1058,7 +1058,7 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \
 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
 "psraw $5, %%mm4 \n\t" \
 "packuswb %%mm4, %%mm0 \n\t" \
-OP_MMX2(%%mm0, 8(%1), %%mm4, q) \
+OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
 \
 "add %3, %0 \n\t" \
 "add %4, %1 \n\t" \
@@ -1195,7 +1195,7 @@ static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, \
 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
 "psraw $5, %%mm3 \n\t" \
 "packuswb %%mm3, %%mm0 \n\t" \
-OP_MMX2(%%mm0, (%1), %%mm4, q) \
+OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
 \
 "add %3, %0 \n\t" \
 "add %4, %1 \n\t" \
@@ -1764,19 +1764,19 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
 "pavgusb "#temp", "#a" \n\t" \
 "mov"#size" "#a", "#b" \n\t"
-#define AVG_MMX2_OP(a, b, temp, size) \
+#define AVG_MMXEXT_OP(a, b, temp, size) \
 "mov"#size" "#b", "#temp" \n\t" \
 "pavgb "#temp", "#a" \n\t" \
 "mov"#size" "#a", "#b" \n\t"
 QPEL_BASE(put_, ff_pw_16, _, PUT_OP, PUT_OP)
-QPEL_BASE(avg_, ff_pw_16, _, AVG_MMX2_OP, AVG_3DNOW_OP)
+QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP, AVG_3DNOW_OP)
 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
 QPEL_OP(put_, ff_pw_16, _, PUT_OP, 3dnow)
 QPEL_OP(avg_, ff_pw_16, _, AVG_3DNOW_OP, 3dnow)
 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmx2)
-QPEL_OP(avg_, ff_pw_16, _, AVG_MMX2_OP, mmx2)
+QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmx2)
 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
 /***********************************/

View File

@@ -112,7 +112,7 @@ SECTION .text
 movd %3, %1
 %endmacro
-%macro HSUM_MMX2 3
+%macro HSUM_MMXEXT 3
 pshufw %2, %1, 0xE
 paddusw %1, %2
 pshufw %2, %1, 0x1
@@ -263,12 +263,12 @@ INIT_MMX
 %define HSUM HSUM_MMX
 HADAMARD8_DIFF_MMX mmx
-%define ABS1 ABS1_MMX2
-%define HSUM HSUM_MMX2
+%define ABS1 ABS1_MMXEXT
+%define HSUM HSUM_MMXEXT
 HADAMARD8_DIFF_MMX mmx2
 INIT_XMM
-%define ABS2 ABS2_MMX2
+%define ABS2 ABS2_MMXEXT
 %if ARCH_X86_64
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
 %else

View File

@@ -889,7 +889,7 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
 "pxor " #z ", " #a " \n\t"\
 "psubw " #z ", " #a " \n\t"
-#define MMABS_MMX2(a,z)\
+#define MMABS_MMXEXT(a, z) \
 "pxor " #z ", " #z " \n\t"\
 "psubw " #a ", " #z " \n\t"\
 "pmaxsw " #z ", " #a " \n\t"
@@ -913,7 +913,7 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
 "paddusw "#t", "#a" \n\t"\
 "movd "#a", "#dst" \n\t"\
-#define HSUM_MMX2(a, t, dst)\
+#define HSUM_MMXEXT(a, t, dst) \
 "pshufw $0x0E, "#a", "#t" \n\t"\
 "paddusw "#t", "#a" \n\t"\
 "pshufw $0x01, "#a", "#t" \n\t"\
@@ -975,8 +975,8 @@ DCT_SAD_FUNC(mmx)
 #undef MMABS
 #undef HSUM
-#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
-#define MMABS(a,z) MMABS_MMX2(a,z)
+#define HSUM(a,t,dst) HSUM_MMXEXT(a,t,dst)
+#define MMABS(a,z) MMABS_MMXEXT(a,z)
 DCT_SAD_FUNC(mmx2)
 #undef HSUM
 #undef DCT_SAD

View File

@@ -26,11 +26,11 @@ SECTION_TEXT
 ;---------------------------------------------------------------------------------
 ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
 ;---------------------------------------------------------------------------------
-%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+%macro INT32_TO_FLOAT_FMUL_SCALAR 1
 %if UNIX64
-cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
 %else
-cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
 %endif
 %if WIN64
 SWAP 0, 2
@@ -43,7 +43,7 @@ cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
 add dstq, lenq
 neg lenq
 .loop:
-%ifidn %1, sse2
+%if cpuflag(sse2)
 cvtdq2ps m1, [srcq+lenq ]
 cvtdq2ps m2, [srcq+lenq+16]
 %else
@@ -63,27 +63,26 @@ cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
 REP_RET
 %endmacro
-INIT_XMM
+INIT_XMM sse
 %define SPLATD SPLATD_SSE
-%define movdqa movaps
-INT32_TO_FLOAT_FMUL_SCALAR sse, 5
-%undef movdqa
+INT32_TO_FLOAT_FMUL_SCALAR 5
+INIT_XMM sse2
 %define SPLATD SPLATD_SSE2
-INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
+INT32_TO_FLOAT_FMUL_SCALAR 3
 %undef SPLATD
 ;------------------------------------------------------------------------------
 ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
 ;------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16 2
-cglobal float_to_int16_%1, 3,3,%2, dst, src, len
+%macro FLOAT_TO_INT16 1
+cglobal float_to_int16, 3, 3, %1, dst, src, len
 add lenq, lenq
 lea srcq, [srcq+2*lenq]
 add dstq, lenq
 neg lenq
 .loop:
-%ifidn %1, sse2
+%if cpuflag(sse2)
 cvtps2dq m0, [srcq+2*lenq ]
 cvtps2dq m1, [srcq+2*lenq+16]
 packssdw m0, m1
@@ -100,31 +99,32 @@ cglobal float_to_int16_%1, 3,3,%2, dst, src, len
 %endif
 add lenq, 16
 js .loop
-%ifnidn %1, sse2
+%if mmsize == 8
 emms
 %endif
 REP_RET
 %endmacro
-INIT_XMM
-FLOAT_TO_INT16 sse2, 2
-INIT_MMX
-FLOAT_TO_INT16 sse, 0
+INIT_XMM sse2
+FLOAT_TO_INT16 2
+INIT_MMX sse
+FLOAT_TO_INT16 0
 %define cvtps2pi pf2id
-FLOAT_TO_INT16 3dnow, 0
+INIT_MMX 3dnow
+FLOAT_TO_INT16 0
 %undef cvtps2pi
 ;------------------------------------------------------------------------------
 ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
 ;------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16_STEP 2
-cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
+%macro FLOAT_TO_INT16_STEP 1
+cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
 add lenq, lenq
 lea srcq, [srcq+2*lenq]
 lea step3q, [stepq*3]
 neg lenq
 .loop:
-%ifidn %1, sse2
+%if cpuflag(sse2)
 cvtps2dq m0, [srcq+2*lenq ]
 cvtps2dq m1, [srcq+2*lenq+16]
 packssdw m0, m1
@@ -179,25 +179,26 @@ cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
 %endif
 add lenq, 16
 js .loop
-%ifnidn %1, sse2
+%if mmsize == 8
 emms
 %endif
 REP_RET
 %endmacro
-INIT_XMM
-FLOAT_TO_INT16_STEP sse2, 2
-INIT_MMX
-FLOAT_TO_INT16_STEP sse, 0
+INIT_XMM sse2
+FLOAT_TO_INT16_STEP 2
+INIT_MMX sse
+FLOAT_TO_INT16_STEP 0
 %define cvtps2pi pf2id
-FLOAT_TO_INT16_STEP 3dnow, 0
+INIT_MMX 3dnow
+FLOAT_TO_INT16_STEP 0
 %undef cvtps2pi
 ;-------------------------------------------------------------------------------
 ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
 ;-------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16_INTERLEAVE2 1
-cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
+%macro FLOAT_TO_INT16_INTERLEAVE2 0
+cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
 lea lenq, [4*r2q]
 mov src1q, [src0q+gprsize]
 mov src0q, [src0q]
@@ -206,7 +207,7 @@ cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
 add src1q, lenq
 neg lenq
 .loop:
-%ifidn %1, sse2
+%if cpuflag(sse2)
 cvtps2dq m0, [src0q+lenq]
 cvtps2dq m1, [src1q+lenq]
 packssdw m0, m1
@@ -228,21 +229,20 @@ cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
 %endif
 add lenq, 16
 js .loop
-%ifnidn %1, sse2
+%if mmsize == 8
 emms
 %endif
 REP_RET
 %endmacro
-INIT_MMX
+INIT_MMX 3dnow
 %define cvtps2pi pf2id
-FLOAT_TO_INT16_INTERLEAVE2 3dnow
+FLOAT_TO_INT16_INTERLEAVE2
 %undef cvtps2pi
-%define movdqa movaps
-FLOAT_TO_INT16_INTERLEAVE2 sse
-%undef movdqa
-INIT_XMM
-FLOAT_TO_INT16_INTERLEAVE2 sse2
+INIT_MMX sse
+FLOAT_TO_INT16_INTERLEAVE2
+INIT_XMM sse2
+FLOAT_TO_INT16_INTERLEAVE2
 %macro PSWAPD_SSE 2
@@ -254,9 +254,9 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2
 punpckldq %1, %2
 %endmacro
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
+%macro FLOAT_TO_INT16_INTERLEAVE6 0
 ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4, src5, len
+cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
 %if ARCH_X86_64
 mov lend, r2d
 %else
@@ -302,21 +302,24 @@ cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4,
 RET
 %endmacro ; FLOAT_TO_INT16_INTERLEAVE6
+INIT_MMX sse
 %define pswapd PSWAPD_SSE
-FLOAT_TO_INT16_INTERLEAVE6 sse
+FLOAT_TO_INT16_INTERLEAVE6
+INIT_MMX 3dnow
 %define cvtps2pi pf2id
 %define pswapd PSWAPD_3DNOW
-FLOAT_TO_INT16_INTERLEAVE6 3dnow
+FLOAT_TO_INT16_INTERLEAVE6
 %undef pswapd
-FLOAT_TO_INT16_INTERLEAVE6 3dnowext
+INIT_MMX 3dnowext
+FLOAT_TO_INT16_INTERLEAVE6
 %undef cvtps2pi
 ;-----------------------------------------------------------------------------
 ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
 ;-----------------------------------------------------------------------------
-%macro FLOAT_INTERLEAVE6 2
-cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, len
+%macro FLOAT_INTERLEAVE6 1
+cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
 %if ARCH_X86_64
 mov lend, r2d
 %else
@@ -334,7 +337,7 @@ cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, le
 sub src4q, srcq
 sub src5q, srcq
 .loop:
-%ifidn %1, sse
+%if cpuflag(sse)
 movaps m0, [srcq]
 movaps m1, [srcq+src1q]
 movaps m2, [srcq+src2q]
@@ -383,62 +386,60 @@ cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, le
 add dstq, mmsize*6
 sub lend, mmsize/4
 jg .loop
-%ifidn %1, mmx
+%if mmsize == 8
 emms
 %endif
 REP_RET
 %endmacro
-INIT_MMX
-FLOAT_INTERLEAVE6 mmx, 0
-INIT_XMM
-FLOAT_INTERLEAVE6 sse, 7
+INIT_MMX mmx
+FLOAT_INTERLEAVE6 0
+INIT_XMM sse
+FLOAT_INTERLEAVE6 7
 ;-----------------------------------------------------------------------------
 ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
 ;-----------------------------------------------------------------------------
-%macro FLOAT_INTERLEAVE2 2
-cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
+%macro FLOAT_INTERLEAVE2 1
+cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
 mov src1q, [srcq+gprsize]
 mov srcq, [srcq ]
 sub src1q, srcq
 .loop:
-MOVPS m0, [srcq ]
-MOVPS m1, [srcq+src1q ]
-MOVPS m3, [srcq +mmsize]
-MOVPS m4, [srcq+src1q+mmsize]
-MOVPS m2, m0
+mova m0, [srcq ]
+mova m1, [srcq+src1q ]
+mova m3, [srcq +mmsize]
+mova m4, [srcq+src1q+mmsize]
+mova m2, m0
 PUNPCKLDQ m0, m1
 PUNPCKHDQ m2, m1
-MOVPS m1, m3
+mova m1, m3
 PUNPCKLDQ m3, m4
 PUNPCKHDQ m1, m4
-MOVPS [dstq ], m0
-MOVPS [dstq+1*mmsize], m2
-MOVPS [dstq+2*mmsize], m3
-MOVPS [dstq+3*mmsize], m1
+mova [dstq ], m0
+mova [dstq+1*mmsize], m2
+mova [dstq+2*mmsize], m3
+mova [dstq+3*mmsize], m1
 add srcq, mmsize*2
 add dstq, mmsize*4
 sub lend, mmsize/2
 jg .loop
-%ifidn %1, mmx
+%if mmsize == 8
 emms
 %endif
 REP_RET
 %endmacro
-INIT_MMX
-%define MOVPS movq
+INIT_MMX mmx
 %define PUNPCKLDQ punpckldq
 %define PUNPCKHDQ punpckhdq
-FLOAT_INTERLEAVE2 mmx, 0
-INIT_XMM
-%define MOVPS movaps
+FLOAT_INTERLEAVE2 0
+INIT_XMM sse
 %define PUNPCKLDQ unpcklps
 %define PUNPCKHDQ unpckhps
-FLOAT_INTERLEAVE2 sse, 5
+FLOAT_INTERLEAVE2 5
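
One recurring detail of the port above: the emms guards no longer string-compare the ISA suffix (%ifnidn %1, sse2 / %ifidn %1, mmx) but test mmsize, which x86inc.asm sets to 8 under INIT_MMX and 16 under INIT_XMM. That keys the guard to what actually matters, namely whether the variant runs on the MMX register file; note the sse variants above are declared with INIT_MMX sse and therefore still need emms. A minimal sketch of the pattern, assuming x86inc.asm is included:

INIT_MMX sse       ; SSE instructions operating on 8-byte MMX registers
; ... loop body using m0..m7, which map to MMX registers here ...
%if mmsize == 8    ; true under INIT_MMX, false under INIT_XMM
    emms           ; clear MMX state so later x87 FPU code works
%endif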

View File

@@ -246,7 +246,7 @@ cglobal h264_idct8_add_8_sse2, 3, 4, 10
 IDCT8_ADD_SSE r0, r1, r2, r3
 RET
-%macro DC_ADD_MMX2_INIT 2-3
+%macro DC_ADD_MMXEXT_INIT 2-3
 %if %0 == 2
 movsx %1, word [%1]
 add %1, 32
@@ -266,7 +266,7 @@ cglobal h264_idct8_add_8_sse2, 3, 4, 10
 packuswb m1, m1
 %endmacro
-%macro DC_ADD_MMX2_OP 4
+%macro DC_ADD_MMXEXT_OP 4
 %1 m2, [%2 ]
 %1 m3, [%2+%3 ]
 %1 m4, [%2+%3*2]
@@ -288,16 +288,16 @@ cglobal h264_idct8_add_8_sse2, 3, 4, 10
 INIT_MMX
 ; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0
-DC_ADD_MMX2_INIT r1, r2
-DC_ADD_MMX2_OP movh, r0, r2, r1
+DC_ADD_MMXEXT_INIT r1, r2
+DC_ADD_MMXEXT_OP movh, r0, r2, r1
 RET
 ; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
-DC_ADD_MMX2_INIT r1, r2
-DC_ADD_MMX2_OP mova, r0, r2, r1
+DC_ADD_MMXEXT_INIT r1, r2
+DC_ADD_MMXEXT_OP mova, r0, r2, r1
 lea r0, [r0+r2*4]
-DC_ADD_MMX2_OP mova, r0, r2, r1
+DC_ADD_MMXEXT_OP mova, r0, r2, r1
 RET
 ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
@@ -371,14 +371,14 @@ cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
 movsx r6, word [r2]
 test r6, r6
 jz .no_dc
-DC_ADD_MMX2_INIT r2, r3, r6
+DC_ADD_MMXEXT_INIT r2, r3, r6
 %if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
 %endif
 mov dst2d, dword [r1+r5*4]
 lea dst2q, [r0+dst2q]
-DC_ADD_MMX2_OP movh, dst2q, r3, r6
+DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
 %if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
@@ -445,14 +445,14 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo
 movsx r6, word [r2]
 test r6, r6
 jz .skipblock
-DC_ADD_MMX2_INIT r2, r3, r6
+DC_ADD_MMXEXT_INIT r2, r3, r6
 %if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
 %endif
 mov dst2d, dword [r1+r5*4]
 add dst2q, r0
-DC_ADD_MMX2_OP movh, dst2q, r3, r6
+DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
 %if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
@@ -483,16 +483,16 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
 movsx r6, word [r2]
 test r6, r6
 jz .no_dc
-DC_ADD_MMX2_INIT r2, r3, r6
+DC_ADD_MMXEXT_INIT r2, r3, r6
 %if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
 %endif
 mov dst2d, dword [r1+r5*4]
 lea dst2q, [r0+dst2q]
-DC_ADD_MMX2_OP mova, dst2q, r3, r6
+DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
 lea dst2q, [dst2q+r3*4]
-DC_ADD_MMX2_OP mova, dst2q, r3, r6
+DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
 %if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
@@ -541,16 +541,16 @@ cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block,
 test r6, r6
 jz .no_dc
 INIT_MMX
-DC_ADD_MMX2_INIT r2, r3, r6
+DC_ADD_MMXEXT_INIT r2, r3, r6
 %if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
 %endif
 mov dst2d, dword [r1+r5*4]
 add dst2q, r0
-DC_ADD_MMX2_OP mova, dst2q, r3, r6
+DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
 lea dst2q, [dst2q+r3*4]
-DC_ADD_MMX2_OP mova, dst2q, r3, r6
+DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
 %if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
@@ -644,7 +644,7 @@ h264_idct_add8_mmx2_plane:
 movsx r6, word [r2]
 test r6, r6
 jz .skipblock
-DC_ADD_MMX2_INIT r2, r3, r6
+DC_ADD_MMXEXT_INIT r2, r3, r6
 %if ARCH_X86_64
 mov r0d, dword [r1+r5*4]
 add r0, [dst2q]
@@ -653,7 +653,7 @@ h264_idct_add8_mmx2_plane:
 mov r0, [r0]
 add r0, dword [r1+r5*4]
 %endif
-DC_ADD_MMX2_OP movh, r0, r3, r6
+DC_ADD_MMXEXT_OP movh, r0, r3, r6
 .skipblock:
 inc r5
 add r2, 32
@@ -697,7 +697,7 @@ h264_idct_dc_add8_mmx2:
 pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
 punpcklwd m0, m0 ; d d d d D D D D
 lea r6, [r3*3]
-DC_ADD_MMX2_OP movq, r0, r3, r6
+DC_ADD_MMXEXT_OP movq, r0, r3, r6
 ret
 ALIGN 16

View File

@@ -1169,18 +1169,18 @@ QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
 #undef PAVGB
 #define PAVGB "pavgb"
 QPEL_H264(put_, PUT_OP, mmx2)
-QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
+QPEL_H264(avg_,AVG_MMXEXT_OP, mmx2)
 QPEL_H264_V_XMM(put_, PUT_OP, sse2)
-QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
+QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
 QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
-QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
+QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
 #if HAVE_SSSE3_INLINE
 QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_HV2_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 #endif
 #undef PAVGB

View File

@@ -268,7 +268,7 @@ cglobal vc1_h_loop_filter8_%1, 3,5,0
 RET
 %endmacro
-%define PABSW PABSW_MMX2
+%define PABSW PABSW_MMXEXT
 VC1_LF_MMX mmx2
 INIT_XMM

View File

@@ -157,7 +157,7 @@
 psubw %1, %2
 %endmacro
-%macro PABSW_MMX2 2
+%macro PABSW_MMXEXT 2
 pxor %1, %1
 psubw %1, %2
 pmaxsw %1, %2
@@ -189,13 +189,13 @@
 psubw %2, %4
 %endmacro
-%macro ABS1_MMX2 2 ; a, tmp
+%macro ABS1_MMXEXT 2 ; a, tmp
 pxor %2, %2
 psubw %2, %1
 pmaxsw %1, %2
 %endmacro
-%macro ABS2_MMX2 4 ; a, b, tmp0, tmp1
+%macro ABS2_MMXEXT 4 ; a, b, tmp0, tmp1
 pxor %3, %3
 pxor %4, %4
 psubw %3, %1

View File

@@ -593,7 +593,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
 "cmp "#dstw", "#index" \n\t"\
 " jb 1b \n\t"
-#define WRITEBGR24MMX2(dst, dstw, index) \
+#define WRITEBGR24MMXEXT(dst, dstw, index) \
 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
@@ -643,7 +643,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
 #if COMPILE_TEMPLATE_MMXEXT
 #undef WRITEBGR24
-#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
+#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
 #else
 #undef WRITEBGR24
 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
@@ -1485,7 +1485,7 @@ static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
 PREFETCH" 64(%%"REG_c") \n\t"
 #if ARCH_X86_64
-#define CALL_MMX2_FILTER_CODE \
+#define CALL_MMXEXT_FILTER_CODE \
 "movl (%%"REG_b"), %%esi \n\t"\
 "call *%4 \n\t"\
 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
@@ -1494,7 +1494,7 @@ static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
 "xor %%"REG_a", %%"REG_a" \n\t"\
 #else
-#define CALL_MMX2_FILTER_CODE \
+#define CALL_MMXEXT_FILTER_CODE \
 "movl (%%"REG_b"), %%esi \n\t"\
 "call *%4 \n\t"\
 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
@@ -1503,14 +1503,14 @@ static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
 #endif /* ARCH_X86_64 */
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
 #if defined(PIC)
 "mov %5, %%"REG_b" \n\t"
@@ -1580,10 +1580,10 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
 PREFETCH" 32(%%"REG_c") \n\t"
 PREFETCH" 64(%%"REG_c") \n\t"
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
 "xor %%"REG_a", %%"REG_a" \n\t" // i
 "mov %5, %%"REG_c" \n\t" // src
 "mov %6, %%"REG_D" \n\t" // buf2
@@ -1591,10 +1591,10 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
 PREFETCH" 32(%%"REG_c") \n\t"
 PREFETCH" 64(%%"REG_c") \n\t"
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
-CALL_MMX2_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
+CALL_MMXEXT_FILTER_CODE
 #if defined(PIC)
 "mov %7, %%"REG_b" \n\t"