Update x264asm header files to latest versions.
Modify the asm accordingly. GLOBAL is now no longer necessary for PIC-compliant loads. Originally committed as revision 23739 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
44c70a9b3d
commit
2966cc1849
@ -40,7 +40,7 @@ section .text align=16
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro FLOAT_TO_INT16_INTERLEAVE6 1
|
%macro FLOAT_TO_INT16_INTERLEAVE6 1
|
||||||
; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
|
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
|
||||||
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
|
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
|
||||||
%ifdef ARCH_X86_64
|
%ifdef ARCH_X86_64
|
||||||
%define lend r10d
|
%define lend r10d
|
||||||
@ -272,7 +272,7 @@ SCALARPRODUCT_LOOP 0
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
|
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
|
||||||
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
|
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
|
||||||
movq mm0, [topq]
|
movq mm0, [topq]
|
||||||
movq mm2, mm0
|
movq mm2, mm0
|
||||||
@ -370,23 +370,23 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_to
|
|||||||
RET
|
RET
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
|
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
|
||||||
INIT_MMX
|
INIT_MMX
|
||||||
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
|
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
|
||||||
.skip_prologue:
|
.skip_prologue:
|
||||||
mova m5, [pb_7 GLOBAL]
|
mova m5, [pb_7]
|
||||||
mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
|
mova m4, [pb_zzzz3333zzzzbbbb]
|
||||||
mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
|
mova m3, [pb_zz11zz55zz99zzdd]
|
||||||
movd m0, leftm
|
movd m0, leftm
|
||||||
psllq m0, 56
|
psllq m0, 56
|
||||||
ADD_HFYU_LEFT_LOOP 1
|
ADD_HFYU_LEFT_LOOP 1
|
||||||
|
|
||||||
INIT_XMM
|
INIT_XMM
|
||||||
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
|
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
|
||||||
mova m5, [pb_f GLOBAL]
|
mova m5, [pb_f]
|
||||||
mova m6, [pb_zzzzzzzz77777777 GLOBAL]
|
mova m6, [pb_zzzzzzzz77777777]
|
||||||
mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
|
mova m4, [pb_zzzz3333zzzzbbbb]
|
||||||
mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
|
mova m3, [pb_zz11zz55zz99zzdd]
|
||||||
movd m0, leftm
|
movd m0, leftm
|
||||||
pslldq m0, 15
|
pslldq m0, 15
|
||||||
test srcq, 15
|
test srcq, 15
|
||||||
@ -398,7 +398,7 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
|
|||||||
ADD_HFYU_LEFT_LOOP 0
|
ADD_HFYU_LEFT_LOOP 0
|
||||||
|
|
||||||
|
|
||||||
; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
|
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
|
||||||
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
|
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
|
||||||
neg offsetq
|
neg offsetq
|
||||||
shl offsetq, 2
|
shl offsetq, 2
|
||||||
|
@ -35,7 +35,7 @@ ps_m1p1: dd 1<<31, 0
|
|||||||
|
|
||||||
%assign i 16
|
%assign i 16
|
||||||
%rep 13
|
%rep 13
|
||||||
cextern ff_cos_ %+ i
|
cextern cos_ %+ i
|
||||||
%assign i i<<1
|
%assign i i<<1
|
||||||
%endrep
|
%endrep
|
||||||
|
|
||||||
@ -64,7 +64,7 @@ section .text align=16
|
|||||||
mova %5, %3
|
mova %5, %3
|
||||||
pfsub %3, %4
|
pfsub %3, %4
|
||||||
pfadd %5, %4 ; {t6,t5}
|
pfadd %5, %4 ; {t6,t5}
|
||||||
pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
|
pxor %3, [ps_m1p1] ; {t8,t7}
|
||||||
mova %6, %1
|
mova %6, %1
|
||||||
pswapd %3, %3
|
pswapd %3, %3
|
||||||
pfadd %1, %5 ; {r0,i0}
|
pfadd %1, %5 ; {r0,i0}
|
||||||
@ -105,8 +105,8 @@ section .text align=16
|
|||||||
addps %6, %5 ; {t1,t2,t3,t4}
|
addps %6, %5 ; {t1,t2,t3,t4}
|
||||||
mova %5, %3
|
mova %5, %3
|
||||||
shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
|
shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
|
||||||
mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
|
mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
|
||||||
mulps %5, [ps_root2 GLOBAL]
|
mulps %5, [ps_root2]
|
||||||
addps %3, %5 ; {t8,t7,ta,t9}
|
addps %3, %5 ; {t8,t7,ta,t9}
|
||||||
mova %5, %6
|
mova %5, %6
|
||||||
shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
|
shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
|
||||||
@ -309,7 +309,7 @@ fft16_sse:
|
|||||||
mova m6, Z(6)
|
mova m6, Z(6)
|
||||||
mova m7, Z(7)
|
mova m7, Z(7)
|
||||||
T4_SSE m6, m7, m0
|
T4_SSE m6, m7, m0
|
||||||
PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
|
PASS_SMALL 0, [cos_16], [cos_16+16]
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
||||||
@ -342,12 +342,12 @@ fft8%1:
|
|||||||
T2_3DN m6, m7, Z(6), Z(7)
|
T2_3DN m6, m7, Z(6), Z(7)
|
||||||
pswapd m0, m5
|
pswapd m0, m5
|
||||||
pswapd m2, m7
|
pswapd m2, m7
|
||||||
pxor m0, [ps_m1p1 GLOBAL]
|
pxor m0, [ps_m1p1]
|
||||||
pxor m2, [ps_m1p1 GLOBAL]
|
pxor m2, [ps_m1p1]
|
||||||
pfsub m5, m0
|
pfsub m5, m0
|
||||||
pfadd m7, m2
|
pfadd m7, m2
|
||||||
pfmul m5, [ps_root2 GLOBAL]
|
pfmul m5, [ps_root2]
|
||||||
pfmul m7, [ps_root2 GLOBAL]
|
pfmul m7, [ps_root2]
|
||||||
T4_3DN m1, m3, m5, m7, m0, m2
|
T4_3DN m1, m3, m5, m7, m0, m2
|
||||||
mova Z(5), m5
|
mova Z(5), m5
|
||||||
mova Z(7), m7
|
mova Z(7), m7
|
||||||
@ -445,7 +445,7 @@ fft %+ n %+ %3%2:
|
|||||||
add r0, n*2 - (n2&(-2<<%1))
|
add r0, n*2 - (n2&(-2<<%1))
|
||||||
call fft %+ n4 %+ %2
|
call fft %+ n4 %+ %2
|
||||||
sub r0, n*6 + (n2&(-2<<%1))
|
sub r0, n*6 + (n2&(-2<<%1))
|
||||||
lea r1, [ff_cos_ %+ n GLOBAL]
|
lea r1, [cos_ %+ n]
|
||||||
mov r2d, n4/2
|
mov r2d, n4/2
|
||||||
jmp pass%3%2
|
jmp pass%3%2
|
||||||
|
|
||||||
@ -461,10 +461,10 @@ section .text
|
|||||||
; On x86_32, this function does the register saving and restoring for all of fft.
|
; On x86_32, this function does the register saving and restoring for all of fft.
|
||||||
; The others pass args in registers and don't spill anything.
|
; The others pass args in registers and don't spill anything.
|
||||||
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
|
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
|
||||||
lea r2, [dispatch_tab%3%2 GLOBAL]
|
lea r2, [dispatch_tab%3%2]
|
||||||
mov r2, [r2 + (nbitsq-2)*gprsize]
|
mov r2, [r2 + (nbitsq-2)*gprsize]
|
||||||
%ifdef PIC
|
%ifdef PIC
|
||||||
lea r3, [$$ GLOBAL]
|
lea r3, [$$]
|
||||||
add r2, r3
|
add r2, r3
|
||||||
%endif
|
%endif
|
||||||
call r2
|
call r2
|
||||||
|
@ -234,18 +234,18 @@ SECTION .text
|
|||||||
%macro DEBLOCK_P0_Q0 0
|
%macro DEBLOCK_P0_Q0 0
|
||||||
mova m5, m1
|
mova m5, m1
|
||||||
pxor m5, m2 ; p0^q0
|
pxor m5, m2 ; p0^q0
|
||||||
pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
|
pand m5, [pb_01] ; (p0^q0)&1
|
||||||
pcmpeqb m4, m4
|
pcmpeqb m4, m4
|
||||||
pxor m3, m4
|
pxor m3, m4
|
||||||
pavgb m3, m0 ; (p1 - q1 + 256)>>1
|
pavgb m3, m0 ; (p1 - q1 + 256)>>1
|
||||||
pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
|
pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
|
||||||
pxor m4, m1
|
pxor m4, m1
|
||||||
pavgb m4, m2 ; (q0 - p0 + 256)>>1
|
pavgb m4, m2 ; (q0 - p0 + 256)>>1
|
||||||
pavgb m3, m5
|
pavgb m3, m5
|
||||||
paddusb m3, m4 ; d+128+33
|
paddusb m3, m4 ; d+128+33
|
||||||
mova m6, [pb_a1 GLOBAL]
|
mova m6, [pb_a1]
|
||||||
psubusb m6, m3
|
psubusb m6, m3
|
||||||
psubusb m3, [pb_a1 GLOBAL]
|
psubusb m3, [pb_a1]
|
||||||
pminub m6, m7
|
pminub m6, m7
|
||||||
pminub m3, m7
|
pminub m3, m7
|
||||||
psubusb m1, m6
|
psubusb m1, m6
|
||||||
@ -263,7 +263,7 @@ SECTION .text
|
|||||||
pavgb %6, m2
|
pavgb %6, m2
|
||||||
pavgb %2, %6 ; avg(p2,avg(p0,q0))
|
pavgb %2, %6 ; avg(p2,avg(p0,q0))
|
||||||
pxor %6, %3
|
pxor %6, %3
|
||||||
pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
|
pand %6, [pb_01] ; (p2^avg(p0,q0))&1
|
||||||
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
|
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
|
||||||
mova %6, %1
|
mova %6, %1
|
||||||
psubusb %6, %5
|
psubusb %6, %5
|
||||||
@ -612,8 +612,8 @@ DEBLOCK_LUMA sse2, v, 16
|
|||||||
%define mask0 spill(2)
|
%define mask0 spill(2)
|
||||||
%define mask1p spill(3)
|
%define mask1p spill(3)
|
||||||
%define mask1q spill(4)
|
%define mask1q spill(4)
|
||||||
%define mpb_00 [pb_00 GLOBAL]
|
%define mpb_00 [pb_00]
|
||||||
%define mpb_01 [pb_01 GLOBAL]
|
%define mpb_01 [pb_01]
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
@ -637,7 +637,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
|
|||||||
mova q1, [r0+r1]
|
mova q1, [r0+r1]
|
||||||
%ifdef ARCH_X86_64
|
%ifdef ARCH_X86_64
|
||||||
pxor mpb_00, mpb_00
|
pxor mpb_00, mpb_00
|
||||||
mova mpb_01, [pb_01 GLOBAL]
|
mova mpb_01, [pb_01]
|
||||||
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
|
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
|
||||||
SWAP 7, 12 ; m12=mask0
|
SWAP 7, 12 ; m12=mask0
|
||||||
pavgb t5, mpb_00
|
pavgb t5, mpb_00
|
||||||
@ -656,8 +656,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
|
|||||||
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
|
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
|
||||||
mova m4, t5
|
mova m4, t5
|
||||||
mova mask0, m7
|
mova mask0, m7
|
||||||
pavgb m4, [pb_00 GLOBAL]
|
pavgb m4, [pb_00]
|
||||||
pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
|
pavgb m4, [pb_01] ; alpha/4+1
|
||||||
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
|
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
|
||||||
pand m6, mask0
|
pand m6, mask0
|
||||||
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
|
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
|
||||||
|
@ -43,7 +43,7 @@ cglobal x264_add8x4_idct_sse2, 3,3,8
|
|||||||
movhps m3, [r1+56]
|
movhps m3, [r1+56]
|
||||||
IDCT4_1D 0,1,2,3,4,5
|
IDCT4_1D 0,1,2,3,4,5
|
||||||
TRANSPOSE2x4x4W 0,1,2,3,4
|
TRANSPOSE2x4x4W 0,1,2,3,4
|
||||||
paddw m0, [pw_32 GLOBAL]
|
paddw m0, [pw_32]
|
||||||
IDCT4_1D 0,1,2,3,4,5
|
IDCT4_1D 0,1,2,3,4,5
|
||||||
pxor m7, m7
|
pxor m7, m7
|
||||||
STORE_DIFF m0, m4, m7, [r0]
|
STORE_DIFF m0, m4, m7, [r0]
|
||||||
|
@ -1,25 +1,39 @@
|
|||||||
;*****************************************************************************
|
;*****************************************************************************
|
||||||
;* x86inc.asm
|
;* x86inc.asm
|
||||||
;*****************************************************************************
|
;*****************************************************************************
|
||||||
;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
|
;* Copyright (C) 2005-2008 x264 project
|
||||||
;*
|
;*
|
||||||
;* This file is part of FFmpeg.
|
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
;* Anton Mitrofanov <BugMaster@narod.ru>
|
||||||
;*
|
;*
|
||||||
;* FFmpeg is free software; you can redistribute it and/or
|
;* Permission to use, copy, modify, and/or distribute this software for any
|
||||||
;* modify it under the terms of the GNU Lesser General Public
|
;* purpose with or without fee is hereby granted, provided that the above
|
||||||
;* License as published by the Free Software Foundation; either
|
;* copyright notice and this permission notice appear in all copies.
|
||||||
;* version 2.1 of the License, or (at your option) any later version.
|
|
||||||
;*
|
;*
|
||||||
;* FFmpeg is distributed in the hope that it will be useful,
|
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
;* Lesser General Public License for more details.
|
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
;*
|
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
;* You should have received a copy of the GNU Lesser General Public
|
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
;* License along with FFmpeg; if not, write to the Free Software
|
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
;*****************************************************************************
|
;*****************************************************************************
|
||||||
|
|
||||||
|
; This is a header file for the x264ASM assembly language, which uses
|
||||||
|
; NASM/YASM syntax combined with a large number of macros to provide easy
|
||||||
|
; abstraction between different calling conventions (x86_32, win64, linux64).
|
||||||
|
; It also has various other useful features to simplify writing the kind of
|
||||||
|
; DSP functions that are most often used in x264.
|
||||||
|
|
||||||
|
; Unlike the rest of x264, this file is available under an ISC license, as it
|
||||||
|
; has significant usefulness outside of x264 and we want it to be available
|
||||||
|
; to the largest audience possible. Of course, if you modify it for your own
|
||||||
|
; purposes to add a new feature, we strongly encourage contributing a patch
|
||||||
|
; as this feature might be useful for others as well. Send patches or ideas
|
||||||
|
; to x264-devel@videolan.org .
|
||||||
|
|
||||||
|
%define program_name ff
|
||||||
|
|
||||||
%ifdef ARCH_X86_64
|
%ifdef ARCH_X86_64
|
||||||
%ifidn __OUTPUT_FORMAT__,win32
|
%ifidn __OUTPUT_FORMAT__,win32
|
||||||
%define WIN64
|
%define WIN64
|
||||||
@ -28,6 +42,12 @@
|
|||||||
%endif
|
%endif
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
%ifdef PREFIX
|
||||||
|
%define mangle(x) _ %+ x
|
||||||
|
%else
|
||||||
|
%define mangle(x) x
|
||||||
|
%endif
|
||||||
|
|
||||||
; FIXME: All of the 64bit asm functions that take a stride as an argument
|
; FIXME: All of the 64bit asm functions that take a stride as an argument
|
||||||
; via register, assume that the high dword of that register is filled with 0.
|
; via register, assume that the high dword of that register is filled with 0.
|
||||||
; This is true in practice (since we never do any 64bit arithmetic on strides,
|
; This is true in practice (since we never do any 64bit arithmetic on strides,
|
||||||
@ -47,28 +67,16 @@
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; PIC support macros.
|
|
||||||
; x86_64 can't fit 64bit address literals in most instruction types,
|
|
||||||
; so shared objects (under the assumption that they might be anywhere
|
|
||||||
; in memory) must use an address mode that does fit.
|
|
||||||
; So all accesses to global variables must use this macro, e.g.
|
|
||||||
; mov eax, [foo GLOBAL]
|
|
||||||
; instead of
|
|
||||||
; mov eax, [foo]
|
|
||||||
;
|
|
||||||
; x86_32 doesn't require PIC.
|
|
||||||
; Some distros prefer shared objects to be PIC, but nothing breaks if
|
|
||||||
; the code contains a few textrels, so we'll skip that complexity.
|
|
||||||
|
|
||||||
%ifdef WIN64
|
%ifdef WIN64
|
||||||
%define PIC
|
%define PIC
|
||||||
%elifndef ARCH_X86_64
|
%elifndef ARCH_X86_64
|
||||||
|
; x86_32 doesn't require PIC.
|
||||||
|
; Some distros prefer shared objects to be PIC, but nothing breaks if
|
||||||
|
; the code contains a few textrels, so we'll skip that complexity.
|
||||||
%undef PIC
|
%undef PIC
|
||||||
%endif
|
%endif
|
||||||
%ifdef PIC
|
%ifdef PIC
|
||||||
%define GLOBAL wrt rip
|
default rel
|
||||||
%else
|
|
||||||
%define GLOBAL
|
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
; Macros to eliminate most code duplication between x86_32 and x86_64:
|
; Macros to eliminate most code duplication between x86_32 and x86_64:
|
||||||
@ -163,7 +171,7 @@ DECLARE_REG_SIZE bp, bpl
|
|||||||
%endrep
|
%endrep
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
|
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
|
||||||
|
|
||||||
%ifdef ARCH_X86_64
|
%ifdef ARCH_X86_64
|
||||||
%define gprsize 8
|
%define gprsize 8
|
||||||
@ -259,15 +267,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
|
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
|
||||||
ASSERT %2 >= %1
|
ASSERT %2 >= %1
|
||||||
%assign regs_used %2
|
%assign regs_used %2
|
||||||
ASSERT regs_used <= 7
|
ASSERT regs_used <= 7
|
||||||
%if %0 > 2
|
%assign xmm_regs_used %3
|
||||||
%assign xmm_regs_used %3
|
|
||||||
%else
|
|
||||||
%assign xmm_regs_used 0
|
|
||||||
%endif
|
|
||||||
ASSERT xmm_regs_used <= 16
|
ASSERT xmm_regs_used <= 16
|
||||||
%if regs_used > 4
|
%if regs_used > 4
|
||||||
push r4
|
push r4
|
||||||
@ -388,7 +392,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
|
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
|
||||||
ASSERT %2 >= %1
|
ASSERT %2 >= %1
|
||||||
%assign regs_used %2
|
%assign regs_used %2
|
||||||
ASSERT regs_used <= 7
|
ASSERT regs_used <= 7
|
||||||
@ -434,10 +438,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
|
|||||||
|
|
||||||
; Symbol prefix for C linkage
|
; Symbol prefix for C linkage
|
||||||
%macro cglobal 1-2+
|
%macro cglobal 1-2+
|
||||||
%xdefine %1 ff_%1
|
%xdefine %1 mangle(program_name %+ _ %+ %1)
|
||||||
%ifdef PREFIX
|
|
||||||
%xdefine %1 _ %+ %1
|
|
||||||
%endif
|
|
||||||
%xdefine %1.skip_prologue %1 %+ .skip_prologue
|
%xdefine %1.skip_prologue %1 %+ .skip_prologue
|
||||||
%ifidn __OUTPUT_FORMAT__,elf
|
%ifidn __OUTPUT_FORMAT__,elf
|
||||||
global %1:function hidden
|
global %1:function hidden
|
||||||
@ -454,21 +455,28 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro cextern 1
|
%macro cextern 1
|
||||||
%ifdef PREFIX
|
%xdefine %1 mangle(program_name %+ _ %+ %1)
|
||||||
%xdefine %1 _%1
|
|
||||||
%endif
|
|
||||||
extern %1
|
extern %1
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
;like cextern, but without the prefix
|
||||||
|
%macro cextern_naked 1
|
||||||
|
%xdefine %1 mangle(%1)
|
||||||
|
extern %1
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro const 2+
|
||||||
|
%xdefine %1 mangle(program_name %+ _ %+ %1)
|
||||||
|
global %1
|
||||||
|
%1: %2
|
||||||
|
%endmacro
|
||||||
|
|
||||||
; This is needed for ELF, otherwise the GNU linker assumes the stack is
|
; This is needed for ELF, otherwise the GNU linker assumes the stack is
|
||||||
; executable by default.
|
; executable by default.
|
||||||
%ifidn __OUTPUT_FORMAT__,elf
|
%ifidn __OUTPUT_FORMAT__,elf
|
||||||
SECTION .note.GNU-stack noalloc noexec nowrite progbits
|
SECTION .note.GNU-stack noalloc noexec nowrite progbits
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
%assign FENC_STRIDE 16
|
|
||||||
%assign FDEC_STRIDE 32
|
|
||||||
|
|
||||||
; merge mmx and sse*
|
; merge mmx and sse*
|
||||||
|
|
||||||
%macro CAT_XDEFINE 3
|
%macro CAT_XDEFINE 3
|
||||||
@ -575,7 +583,10 @@ INIT_MMX
|
|||||||
%endrep
|
%endrep
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro SAVE_MM_PERMUTATION 1
|
; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
|
||||||
|
; function name, then any later calls to that function will automatically
|
||||||
|
; load the permutation, so values can be returned in mmregs.
|
||||||
|
%macro SAVE_MM_PERMUTATION 1 ; name to save as
|
||||||
%assign %%i 0
|
%assign %%i 0
|
||||||
%rep num_mmregs
|
%rep num_mmregs
|
||||||
CAT_XDEFINE %1_m, %%i, m %+ %%i
|
CAT_XDEFINE %1_m, %%i, m %+ %%i
|
||||||
@ -583,7 +594,7 @@ INIT_MMX
|
|||||||
%endrep
|
%endrep
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro LOAD_MM_PERMUTATION 1
|
%macro LOAD_MM_PERMUTATION 1 ; name to load from
|
||||||
%assign %%i 0
|
%assign %%i 0
|
||||||
%rep num_mmregs
|
%rep num_mmregs
|
||||||
CAT_XDEFINE m, %%i, %1_m %+ %%i
|
CAT_XDEFINE m, %%i, %1_m %+ %%i
|
||||||
@ -599,7 +610,7 @@ INIT_MMX
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
;Substitutions that reduce instruction size but are functionally equivalent
|
; Substitutions that reduce instruction size but are functionally equivalent
|
||||||
%macro add 2
|
%macro add 2
|
||||||
%ifnum %2
|
%ifnum %2
|
||||||
%if %2==128
|
%if %2==128
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
;*****************************************************************************
|
;*****************************************************************************
|
||||||
;* x86util.asm
|
;* x86util.asm
|
||||||
;*****************************************************************************
|
;*****************************************************************************
|
||||||
;* Copyright (C) 2008 Loren Merritt <lorenm@u.washington.edu>
|
;* Copyright (C) 2008 x264 project
|
||||||
|
;*
|
||||||
|
;* Authors: Holger Lubitz <holger@lubitz.org>
|
||||||
|
;* Loren Merritt <lorenm@u.washington.edu>
|
||||||
;*
|
;*
|
||||||
;* This program is free software; you can redistribute it and/or modify
|
;* This program is free software; you can redistribute it and/or modify
|
||||||
;* it under the terms of the GNU General Public License as published by
|
;* it under the terms of the GNU General Public License as published by
|
||||||
@ -18,6 +21,9 @@
|
|||||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
;*****************************************************************************
|
;*****************************************************************************
|
||||||
|
|
||||||
|
%assign FENC_STRIDE 16
|
||||||
|
%assign FDEC_STRIDE 32
|
||||||
|
|
||||||
%macro SBUTTERFLY 4
|
%macro SBUTTERFLY 4
|
||||||
mova m%4, m%2
|
mova m%4, m%2
|
||||||
punpckl%1 m%2, m%3
|
punpckl%1 m%2, m%3
|
||||||
@ -25,6 +31,13 @@
|
|||||||
SWAP %3, %4
|
SWAP %3, %4
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
%macro SBUTTERFLY2 4
|
||||||
|
mova m%4, m%2
|
||||||
|
punpckh%1 m%2, m%3
|
||||||
|
punpckl%1 m%4, m%3
|
||||||
|
SWAP %2, %4, %3
|
||||||
|
%endmacro
|
||||||
|
|
||||||
%macro TRANSPOSE4x4W 5
|
%macro TRANSPOSE4x4W 5
|
||||||
SBUTTERFLY wd, %1, %2, %5
|
SBUTTERFLY wd, %1, %2, %5
|
||||||
SBUTTERFLY wd, %3, %4, %5
|
SBUTTERFLY wd, %3, %4, %5
|
||||||
@ -123,14 +136,40 @@
|
|||||||
pabsw %2, %2
|
pabsw %2, %2
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%define ABS1 ABS1_MMX
|
%macro ABSB_MMX 2
|
||||||
%define ABS2 ABS2_MMX
|
pxor %2, %2
|
||||||
|
psubb %2, %1
|
||||||
|
pminub %1, %2
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro ABSB2_MMX 4
|
||||||
|
pxor %3, %3
|
||||||
|
pxor %4, %4
|
||||||
|
psubb %3, %1
|
||||||
|
psubb %4, %2
|
||||||
|
pminub %1, %3
|
||||||
|
pminub %2, %4
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro ABSB_SSSE3 2
|
||||||
|
pabsb %1, %1
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
%macro ABSB2_SSSE3 4
|
||||||
|
pabsb %1, %1
|
||||||
|
pabsb %2, %2
|
||||||
|
%endmacro
|
||||||
|
|
||||||
%macro ABS4 6
|
%macro ABS4 6
|
||||||
ABS2 %1, %2, %5, %6
|
ABS2 %1, %2, %5, %6
|
||||||
ABS2 %3, %4, %5, %6
|
ABS2 %3, %4, %5, %6
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
%define ABS1 ABS1_MMX
|
||||||
|
%define ABS2 ABS2_MMX
|
||||||
|
%define ABSB ABSB_MMX
|
||||||
|
%define ABSB2 ABSB2_MMX
|
||||||
|
|
||||||
%macro SPLATB_MMX 3
|
%macro SPLATB_MMX 3
|
||||||
movd %1, [%2-3] ;to avoid crossing a cacheline
|
movd %1, [%2-3] ;to avoid crossing a cacheline
|
||||||
punpcklbw %1, %1
|
punpcklbw %1, %1
|
||||||
@ -226,10 +265,10 @@
|
|||||||
; %3/%4: source regs
|
; %3/%4: source regs
|
||||||
; %5/%6: tmp regs
|
; %5/%6: tmp regs
|
||||||
%ifidn %1, d
|
%ifidn %1, d
|
||||||
%define mask [mask_10 GLOBAL]
|
%define mask [mask_10]
|
||||||
%define shift 16
|
%define shift 16
|
||||||
%elifidn %1, q
|
%elifidn %1, q
|
||||||
%define mask [mask_1100 GLOBAL]
|
%define mask [mask_1100]
|
||||||
%define shift 32
|
%define shift 32
|
||||||
%endif
|
%endif
|
||||||
%if %0==6 ; less dependency if we have two tmp
|
%if %0==6 ; less dependency if we have two tmp
|
||||||
@ -383,10 +422,10 @@
|
|||||||
%macro SUMSUBD2_AB 4
|
%macro SUMSUBD2_AB 4
|
||||||
mova %4, %1
|
mova %4, %1
|
||||||
mova %3, %2
|
mova %3, %2
|
||||||
psraw %2, 1
|
psraw %2, 1 ; %2: %2>>1
|
||||||
psraw %1, 1
|
psraw %1, 1 ; %1: %1>>1
|
||||||
paddw %2, %4
|
paddw %2, %4 ; %2: %2>>1+%1
|
||||||
psubw %1, %3
|
psubw %1, %3 ; %1: %1>>1-%2
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro DCT4_1D 5
|
%macro DCT4_1D 5
|
||||||
@ -407,16 +446,27 @@
|
|||||||
%macro IDCT4_1D 5-6
|
%macro IDCT4_1D 5-6
|
||||||
%ifnum %5
|
%ifnum %5
|
||||||
SUMSUBD2_AB m%2, m%4, m%6, m%5
|
SUMSUBD2_AB m%2, m%4, m%6, m%5
|
||||||
|
; %2: %2>>1-%4 %4: %2+%4>>1
|
||||||
SUMSUB_BA m%3, m%1, m%6
|
SUMSUB_BA m%3, m%1, m%6
|
||||||
|
; %3: %1+%3 %1: %1-%3
|
||||||
SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
|
SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
|
||||||
|
; %4: %1+%3 + (%2+%4>>1)
|
||||||
|
; %3: %1+%3 - (%2+%4>>1)
|
||||||
|
; %2: %1-%3 + (%2>>1-%4)
|
||||||
|
; %1: %1-%3 - (%2>>1-%4)
|
||||||
%else
|
%else
|
||||||
SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
|
SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
|
||||||
SUMSUB_BA m%3, m%1
|
SUMSUB_BA m%3, m%1
|
||||||
SUMSUB_BADC m%4, m%3, m%2, m%1
|
SUMSUB_BADC m%4, m%3, m%2, m%1
|
||||||
%endif
|
%endif
|
||||||
SWAP %1, %4, %3
|
SWAP %1, %4, %3
|
||||||
|
; %1: %1+%3 + (%2+%4>>1) row0
|
||||||
|
; %2: %1-%3 + (%2>>1-%4) row1
|
||||||
|
; %3: %1-%3 - (%2>>1-%4) row2
|
||||||
|
; %4: %1+%3 - (%2+%4>>1) row3
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
|
||||||
%macro LOAD_DIFF 5
|
%macro LOAD_DIFF 5
|
||||||
%ifidn %3, none
|
%ifidn %3, none
|
||||||
movh %1, %4
|
movh %1, %4
|
||||||
@ -512,4 +562,3 @@
|
|||||||
packuswb %1, %1
|
packuswb %1, %1
|
||||||
movh %4, %1
|
movh %4, %1
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user