ffmpeg/libavcodec/x86/vc1dsp_yasm.asm

;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86inc.asm"
%include "libavutil/x86/x86util.asm"

cextern pw_4
cextern pw_5

section .text

; UNPACK_8TO16 suffix, dst_low, dst_high (src), zero
; zero-extends one vector from 8 to 16 bits
;   %1: element suffix appended to punpckl/punpckh (this file always passes bw)
;   %2: number of the register that receives the unpacked low half
;   %3: number of the source register; overwritten with the unpacked high half
;   %4: number of a register that must hold zero
%macro UNPACK_8TO16 4
    mova      m%2, m%3          ; copy the source so both halves can be unpacked
    punpckh%1 m%3, m%4          ; src      = high half interleaved with zero
    punpckl%1 m%2, m%4          ; dst_low  = low half interleaved with zero
%endmacro

; STORE_4_WORDS_MMX dst0, dst1, dst2, dst3, src, tmp
; Stores the four lowest 16-bit words of vector register %5 to the memory
; operands %1..%4 (word 0 -> %1 ... word 3 -> %4), two words at a time via
; the general-purpose register %6.
; clobbers: %5 (left shifted right by 32 bits), %6
%macro STORE_4_WORDS_MMX 6
    movd  %6d, %5               ; tmp = words 0 and 1
%if mmsize==16
    psrldq %5, 4                ; bring words 2 and 3 down to the bottom
%else
    psrlq  %5, 32               ; same for an 8-byte register
%endif
    mov    %1, %6w              ; store word 0
    shr    %6, 16
    mov    %2, %6w              ; store word 1
    movd  %6d, %5               ; tmp = words 2 and 3
    mov    %3, %6w              ; store word 2
    shr    %6, 16
    mov    %4, %6w              ; store word 3
%endmacro

; STORE_4_WORDS_SSE4 dst0, dst1, dst2, dst3, src, first
; Stores four consecutive 16-bit words of vector register %5, starting at
; word index %6, straight to the memory operands %1..%4 using pextrw with a
; memory destination. Unlike STORE_4_WORDS_MMX it needs no temp register and
; leaves %5 unmodified.
%macro STORE_4_WORDS_SSE4 6
    pextrw %1, %5, %6+0
    pextrw %2, %5, %6+1
    pextrw %3, %5, %6+2
    pextrw %4, %5, %6+3
%endmacro

; VC1_LOOP_FILTER_A0 p1, p0, q0, q1
; All four operands are vector registers of signed 16-bit words.
;   out:      %1 = (2*(%1 - %4) - 5*(%2 - %3) + 4) >> 3   (arithmetic shift)
;   clobbers: %2 (left holding 5*(p0 - q0)); %3 and %4 are only read
%macro VC1_LOOP_FILTER_A0 4
    psubw  %1, %4               ; p1 - q1
    psubw  %2, %3               ; p0 - q0
    pmullw %2, [pw_5]           ; 5*(p0 - q0)
    paddw  %1, %1               ; 2*(p1 - q1)
    paddw  %1, [pw_4]           ; rounding term; word adds/subs wrap, so it can
    psubw  %1, %2               ; go in before the subtraction of the product
    psraw  %1, 3
%endmacro

; VC1_FILTER size
; Decides per 16-bit lane whether the edge is filtered and applies the
; correction d to the two pixels next to the edge (p0 -= d, q0 += d).
; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5      (words; a0/a1/a2 as produced by VC1_LOOP_FILTER_A0)
;     r2d = pq copied into each of its four bytes (START_V_FILTER / START_H_FILTER)
; %1: size (pixels along the edge handled by this expansion: 4 or 8)
; out: m0=p0' m1=q0', saturated to unsigned bytes (each register packed with itself)
; clobbers: m2-m7
; PABSW must be %defined to an absolute-value macro before this is expanded.
%macro VC1_FILTER 1
    PABSW   m4, m7              ; m4 = abs(a0)
    PABSW   m3, m6              ; m3 = abs(a1)
    PABSW   m2, m5              ; m2 = abs(a2)
    mova    m6, m4
    pminsw  m3, m2              ; m3 = a3 = min(abs(a1), abs(a2))
    pcmpgtw m6, m3  ; if (a2 < a0 || a1 < a0)
    psubw   m3, m4
    pmullw  m3, [pw_5]   ; 5*(a3 - a0)
    PABSW   m2, m3
    psraw   m2, 3   ; m2 = abs(5*(a3 - a0)) >> 3
    pxor    m7, m3  ; d_sign ^= a0_sign (only the sign bit of m7 is used from here on)

    pxor    m5, m5              ; m5 = 0
    movd    m3, r2d             ; four byte copies of pq
%if %1 > 4
    punpcklbw m3, m3            ; double every byte: eight copies for the 8-wide filter
%endif
    punpcklbw m3, m5            ; zero-extend the pq bytes to words
    pcmpgtw m3, m4  ; if (a0 < pq)
    pand    m6, m3

    mova    m3, m0
    psubw   m3, m1              ; m3 = p0 - q0
    PABSW   m4, m3
    psraw   m4, 1               ; m4 = clip = abs(p0 - q0) >> 1
    pxor    m3, m7  ; d_sign ^ clip_sign
    psraw   m3, 15              ; all-ones in lanes where d_sign and clip_sign differ
    pminsw  m2, m4  ; min(d, clip)
    pcmpgtw m4, m5              ; mask of lanes with clip > 0
    pand    m6, m4  ; filt3 (C return value)

; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
    pshuflw m4, m6, 0xaa        ; broadcast word 2 over the low four lanes
%if %1 > 4
    pshufhw m4, m4, 0xaa        ; and word 6 over the high four lanes
%endif
%else
    pshufw  m4, m6, 0xaa        ; broadcast word 2 over all four lanes
%endif
    pandn   m3, m4              ; keep lanes whose signs match and whose group is filtered
    pand    m2, m6              ; zero d in lanes that fail their own filt3 test
    pand    m3, m2  ; d final

    psraw   m7, 15              ; m7 = 0 or -1 per lane, from its sign bit
    pxor    m3, m7
    psubw   m3, m7              ; negate d where m7 is -1: (d ^ m7) - m7
    psubw   m0, m3              ; p0 -= d
    paddw   m1, m3              ; q0 += d
    packuswb m0, m0             ; back to bytes with unsigned saturation
    packuswb m1, m1
%endmacro

; VC1_V_LOOP_FILTER size, suffix
; Loads eight pixel rows (four starting at r4, four starting at r0), derives
; a1, a0 and a2 with VC1_LOOP_FILTER_A0, runs VC1_FILTER and writes back the
; two rows it modifies: [r4+r3] (p0) and [r0] (q0).
; 1st param: size of filter
; 2nd param: mov suffix equivalent to the filter size
; expects, as set up by START_V_FILTER: r1 = stride, r3 = 3*stride,
;     r4 = r0 - 4*stride, r2d = pq in every byte
%macro VC1_V_LOOP_FILTER 2
    pxor      m5, m5                ; zero, used to widen bytes to words
    mov%2     m6, [r4]
    mov%2     m4, [r4+r1]
    mov%2     m7, [r4+2*r1]
    mov%2     m0, [r4+r3]           ; m0 = p0, the last row before r0
    punpcklbw m6, m5
    punpcklbw m4, m5
    punpcklbw m7, m5
    punpcklbw m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0   ; m6 = a1 (from the four rows at r4)
    mov%2     m1, [r0]              ; m1 = q0
    mov%2     m2, [r0+r1]
    punpcklbw m1, m5
    punpcklbw m2, m5
    mova      m4, m0                ; A0 destroys its 2nd operand, so pass a copy of p0
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2   ; m7 = a0 (from the four rows around the edge)
    mov%2     m3, [r0+2*r1]
    mov%2     m4, [r0+r3]
    punpcklbw m3, m5
    punpcklbw m4, m5
    mova      m5, m1                ; copy of q0; the zero in m5 is no longer needed
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4   ; m5 = a2 (from the four rows at r0)

    VC1_FILTER %1
    mov%2 [r4+r3], m0
    mov%2 [r0],    m1
%endmacro

; VC1_H_LOOP_FILTER size [, tmp]
; Loads 8 bytes per row starting 4 bytes before r0 (x = -4 .. 3), transposes
; them, derives a1, a0 and a2 with VC1_LOOP_FILTER_A0, runs VC1_FILTER and
; stores the two modified bytes of every row as one word at x = -1.
; 1st param: size of filter (number of rows: 4 or 8)
;     NOTE: this is also the number of 8-bit values that fit in half a
;     register, which is what lets UNPACK_8TO16 split each transposed
;     register into two halves
; 2nd (optional) param: temp register to use for storing words; when it is
;     omitted the words are written with STORE_4_WORDS_SSE4, which always
;     stores eight rows and is therefore only usable with size 8
; expects, as set up by START_H_FILTER: r1 = stride, r3 = 3*stride,
;     r2d = pq in every byte, and for size 8 r4 = r0 + 4*stride
; NOTE(review): TRANSPOSE4x4B/W come from x86util.asm; the comments below
;     assume they leave two adjacent pixel columns in each of m0-m3.
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    movq      m0, [r0     -4]
    movq      m1, [r0+  r1-4]
    movq      m2, [r0+2*r1-4]
    movq      m3, [r0+  r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    movq      m0, [r0     -4]       ; rows 0-3 through r0
    movq      m4, [r0+  r1-4]
    movq      m1, [r0+2*r1-4]
    movq      m5, [r0+  r3-4]
    movq      m2, [r4     -4]       ; rows 4-7 through r4
    movq      m6, [r4+  r1-4]
    movq      m3, [r4+2*r1-4]
    movq      m7, [r4+  r3-4]
    punpcklbw m0, m4                ; interleave the bytes of row pairs
    punpcklbw m1, m5
    punpcklbw m2, m6
    punpcklbw m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    pxor      m5, m5                ; zero for UNPACK_8TO16

    UNPACK_8TO16 bw, 6, 0, 5
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1   ; m6 = a1; m0 destroyed
    UNPACK_8TO16 bw, 4, 2, 5
    mova    m0, m1                      ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2   ; m7 = a0; m1 destroyed
    UNPACK_8TO16 bw, 1, 3, 5
    mova    m5, m4                      ; copy of q0; the zero in m5 is no longer needed
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3   ; m5 = a2
    SWAP 1, 4                           ; m1 = q0

    VC1_FILTER %1
    punpcklbw m0, m1                    ; one word per row: filtered p0 byte, then q0 byte
%if %0 > 1
    STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    psrldq m0, 4                        ; the store above already shifted m0 by 4 bytes; 4 more exposes words 4-7
    STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
    STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro


; START_V_FILTER
; Common register setup for the vertical-direction entry points.
; in:  r0 = src, r1 = stride (32-bit int), r2 = pq
; out: r1 = stride sign-extended to the full register width
;      r3 = 3*stride
;      r4 = src - 4*stride
;      r2 = pq * 0x01010101 (pq in each of the low four bytes; assumes pq < 256)
%macro START_V_FILTER 0
    ; stride is declared as int, so on x86-64 the upper half of its register
    ; is not guaranteed to be a sign extension, yet r1 is used at full width
    ; in the address arithmetic below and in the filter macros. r1 and r1d
    ; are the same register on x86-32, where nothing is emitted.
%ifnidn r1, r1d
    movsxd r1, r1d
%endif
    mov  r4, r0
    lea  r3, [4*r1]
    sub  r4, r3                 ; r4 = src - 4*stride
    lea  r3, [r1+2*r1]          ; r3 = 3*stride
    imul r2, 0x01010101
%endmacro

; START_H_FILTER size
; Common register setup for the horizontal-direction entry points.
; %1:  number of rows handled by one filter expansion (4 or 8)
; in:  r0 = src, r1 = stride (32-bit int), r2 = pq
; out: r1 = stride sign-extended to the full register width
;      r3 = 3*stride
;      r4 = src + 4*stride, only when %1 > 4 (otherwise r4 is left untouched)
;      r2 = pq * 0x01010101 (pq in each of the low four bytes; assumes pq < 256)
%macro START_H_FILTER 1
    ; stride is declared as int, so on x86-64 the upper half of its register
    ; is not guaranteed to be a sign extension, yet r1 is used at full width
    ; in the address arithmetic below and in the filter macros. r1 and r1d
    ; are the same register on x86-32, where nothing is emitted.
%ifnidn r1, r1d
    movsxd r1, r1d
%endif
    lea  r3, [r1+2*r1]          ; r3 = 3*stride
%if %1 > 4
    lea  r4, [r0+4*r1]          ; second group of four rows
%endif
    imul r2, 0x01010101
%endmacro

; VC1_LF_MMX suffix
; Emits the loop filters that work on 8-byte MMX registers, 4 pixels per
; pass; %1 is appended to every symbol name (the file instantiates mmx2).
; The two *_internal_* routines are declared without argument or register
; counts and end in a plain ret: they are only reached through call from the
; wrappers below and operate on the registers those wrappers set up with
; START_V_FILTER / START_H_FILTER.
%macro VC1_LF_MMX 1
INIT_MMX
; 4 pixels of the vertical-direction filter; reads r0, r1, r2, r3, r4
cglobal vc1_v_loop_filter_internal_%1
    VC1_V_LOOP_FILTER 4, d
    ret

; 4 rows of the horizontal-direction filter; reads r0, r1, r2, r3 and uses
; r4 as the scratch register for the word stores
cglobal vc1_h_loop_filter_internal_%1
    VC1_H_LOOP_FILTER 4, r4
    ret

; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4_%1, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal_%1
    RET

; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4_%1, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal_%1
    RET

; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
; two 4-pixel passes: both row pointers move 4 bytes along the edge in between
cglobal vc1_v_loop_filter8_%1, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal_%1
    add  r4, 4
    add  r0, 4
    call vc1_v_loop_filter_internal_%1
    RET

; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
; two 4-row passes: r0 moves down 4 rows in between (r4 is only scratch here,
; hence START_H_FILTER 4)
cglobal vc1_h_loop_filter8_%1, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal_%1
    lea  r0, [r0+4*r1]
    call vc1_h_loop_filter_internal_%1
    RET
%endmacro

; VC1_FILTER takes its absolute-value operation from PABSW. Select the
; PABSW_MMX2 variant, then emit the mmx2 functions; the define remains in
; effect for the code that follows until PABSW is redefined.
%define PABSW PABSW_MMX2
VC1_LF_MMX mmx2

INIT_XMM
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
; all 8 pixels in one pass: rows are loaded and stored with movq (suffix q)
cglobal vc1_v_loop_filter8_sse2, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
; all 8 rows in one pass; r4 points at rows 4-7 here, so a sixth register
; (r5) is requested and passed as the scratch register for the word stores
cglobal vc1_h_loop_filter8_sse2, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

; From here on VC1_FILTER uses the PABSW_SSSE3 variant of the absolute value.
%define PABSW PABSW_SSSE3

INIT_MMX
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
; 4 pixels on MMX registers; the filter macro is expanded inline rather than
; called as in the mmx2 version
cglobal vc1_v_loop_filter4_ssse3, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
; 4 rows on MMX registers; r4 is not set by START_H_FILTER 4 and serves as
; the scratch register for the word stores
cglobal vc1_h_loop_filter4_ssse3, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4
    RET

INIT_XMM
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
; same expansion as the sse2 version, with PABSW now PABSW_SSSE3
cglobal vc1_v_loop_filter8_ssse3, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
; same expansion as the sse2 version (r5 as store scratch), with PABSW now
; PABSW_SSSE3
cglobal vc1_h_loop_filter8_ssse3, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
; no scratch register is passed, so VC1_H_LOOP_FILTER stores the result words
; with STORE_4_WORDS_SSE4 and five general-purpose registers are enough
cglobal vc1_h_loop_filter8_sse4, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET
MMX/SSE VC1 loop filter Originally committed as revision 24208 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-07-12 00:53:01 +02:00			`;******************************************************************************`
			`;* VC1 deblocking optimizations`
			`;* Copyright (c) 2009 David Conrad`
			`;*`
			`;* This file is part of FFmpeg.`
			`;*`
			`;* FFmpeg is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* FFmpeg is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with FFmpeg; if not, write to the Free Software`
Fix FSF address copy paste error in some license headers. 2011-05-14 21:32:31 +02:00			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
MMX/SSE VC1 loop filter Originally committed as revision 24208 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-07-12 00:53:01 +02:00			`;******************************************************************************`

Move x264asm to libavutil. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-06-05 16:19:16 +02:00			`%include "libavutil/x86/x86inc.asm"`
			`%include "libavutil/x86/x86util.asm"`
MMX/SSE VC1 loop filter Originally committed as revision 24208 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-07-12 00:53:01 +02:00
			`cextern pw_4`
			`cextern pw_5`

			`section .text`

			`; dst_low, dst_high (src), zero`
			`; zero-extends one vector from 8 to 16 bits`
			`%macro UNPACK_8TO16 4`
			`mova m%2, m%3`
			`punpckh%1 m%3, m%4`
			`punpckl%1 m%2, m%4`
			`%endmacro`

			`%macro STORE_4_WORDS_MMX 6`
Use "d" suffix for general-purpose registers used with movd. This increases compatibilty with nasm and is also more consistent, e.g. with h264_intrapred.asm and h264_chromamc.asm that already do it that way. Originally committed as revision 25042 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-09-05 12:10:16 +02:00			`movd %6d, %5`
MMX/SSE VC1 loop filter Originally committed as revision 24208 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-07-12 00:53:01 +02:00			`%if mmsize==16`
			`psrldq %5, 4`
			`%else`
			`psrlq %5, 32`
			`%endif`
			`mov %1, %6w`
			`shr %6, 16`
			`mov %2, %6w`
Use "d" suffix for general-purpose registers used with movd. This increases compatibilty with nasm and is also more consistent, e.g. with h264_intrapred.asm and h264_chromamc.asm that already do it that way. Originally committed as revision 25042 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-09-05 12:10:16 +02:00			`movd %6d, %5`
MMX/SSE VC1 loop filter Originally committed as revision 24208 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-07-12 00:53:01 +02:00			`mov %3, %6w`
			`shr %6, 16`
			`mov %4, %6w`
			`%endmacro`

			`%macro STORE_4_WORDS_SSE4 6`
			`pextrw %1, %5, %6+0`
			`pextrw %2, %5, %6+1`
			`pextrw %3, %5, %6+2`
			`pextrw %4, %5, %6+3`
			`%endmacro`

			`; in: p1 p0 q0 q1, clobbers p0`
			`; out: p1 = (2(p1 - q1) - 5(p0 - q0) + 4) >> 3`
			`%macro VC1_LOOP_FILTER_A0 4`
			`psubw %1, %4`
			`psubw %2, %3`
			`paddw %1, %1`
			`pmullw %2, [pw_5]`
			`psubw %1, %2`
			`paddw %1, [pw_4]`
			`psraw %1, 3`
			`%endmacro`

			`; in: p0 q0 a0 a1 a2`
			`; m0 m1 m7 m6 m5`
			`; %1: size`
			`; out: m0=p0' m1=q0'`
			`%macro VC1_FILTER 1`
			`PABSW m4, m7`
			`PABSW m3, m6`
			`PABSW m2, m5`
			`mova m6, m4`
			`pminsw m3, m2`
			`pcmpgtw m6, m3 ; if (a2 < a0 \|\| a1 < a0)`
			`psubw m3, m4`
			`pmullw m3, [pw_5] ; 5*(a3 - a0)`
			`PABSW m2, m3`
			`psraw m2, 3 ; abs(d/8)`
			`pxor m7, m3 ; d_sign ^= a0_sign`

			`pxor m5, m5`
Use "d" suffix for general-purpose registers used with movd. This increases compatibilty with nasm and is also more consistent, e.g. with h264_intrapred.asm and h264_chromamc.asm that already do it that way. Originally committed as revision 25042 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-09-05 12:10:16 +02:00			`movd m3, r2d`
MMX/SSE VC1 loop filter Originally committed as revision 24208 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-07-12 00:53:01 +02:00			`%if %1 > 4`
			`punpcklbw m3, m3`
			`%endif`
			`punpcklbw m3, m5`
			`pcmpgtw m3, m4 ; if (a0 < pq)`
			`pand m6, m3`

			`mova m3, m0`
			`psubw m3, m1`
			`PABSW m4, m3`
			`psraw m4, 1`
			`pxor m3, m7 ; d_sign ^ clip_sign`
			`psraw m3, 15`
			`pminsw m2, m4 ; min(d, clip)`
			`pcmpgtw m4, m5`
			`pand m6, m4 ; filt3 (C return value)`

			`; each set of 4 pixels is not filtered if the 3rd is not`
			`%if mmsize==16`
			`pshuflw m4, m6, 0xaa`
			`%if %1 > 4`
			`pshufhw m4, m4, 0xaa`
			`%endif`
			`%else`
			`pshufw m4, m6, 0xaa`
			`%endif`
			`pandn m3, m4`
			`pand m2, m6`
			`pand m3, m2 ; d final`

x86: vc1: fix and enable optimised loop filter The problem is that the ssse3 psign instruction does the wrong thing here. Commit ea60dfe incorrectly removed a macro emulating this instruction for pre-ssse3 code. However, the emulation is incorrect, and the code relies on the behaviour of the macro. Specifically, the psign sets destination elements to zero where the corresponding source element is zero, whereas the emulation only negates destination elements where the source is negative. Furthermore, the PSIGNW_MMX macro in x86util.asm is totally bogus, which is why the original VC-1 code had an additional right shift when using it. Since the psign instruction cannot be used here, skip all the macro hell and use the working instruction sequence directly. None of this was noticed due a stray return statement in ff_vc1dsp_init_mmx() which meant that only the mmx version of the loop filter was ever used (before being removed in ea60dfe). Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-06-22 22:40:28 +02:00			`psraw m7, 15`
			`pxor m3, m7`
			`psubw m3, m7`
MMX/SSE VC1 loop filter Originally committed as revision 24208 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-07-12 00:53:01 +02:00			`psubw m0, m3`
			`paddw m1, m3`
			`packuswb m0, m0`
			`packuswb m1, m1`
			`%endmacro`

			`; 1st param: size of filter`
			`; 2nd param: mov suffix equivalent to the filter size`
			`%macro VC1_V_LOOP_FILTER 2`
			`pxor m5, m5`
			`mov%2 m6, [r4]`
			`mov%2 m4, [r4+r1]`
			`mov%2 m7, [r4+2*r1]`
			`mov%2 m0, [r4+r3]`
			`punpcklbw m6, m5`
			`punpcklbw m4, m5`
			`punpcklbw m7, m5`
			`punpcklbw m0, m5`

			`VC1_LOOP_FILTER_A0 m6, m4, m7, m0`
			`mov%2 m1, [r0]`
			`mov%2 m2, [r0+r1]`
			`punpcklbw m1, m5`
			`punpcklbw m2, m5`
			`mova m4, m0`
			`VC1_LOOP_FILTER_A0 m7, m4, m1, m2`
			`mov%2 m3, [r0+2*r1]`
			`mov%2 m4, [r0+r3]`
			`punpcklbw m3, m5`
			`punpcklbw m4, m5`
			`mova m5, m1`
			`VC1_LOOP_FILTER_A0 m5, m2, m3, m4`

			`VC1_FILTER %1`
			`mov%2 [r4+r3], m0`
			`mov%2 [r0], m1`
			`%endmacro`

			`; 1st param: size of filter`
			`; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register`
			`; 2nd (optional) param: temp register to use for storing words`
			`%macro VC1_H_LOOP_FILTER 1-2`
			`%if %1 == 4`
			`movq m0, [r0 -4]`
			`movq m1, [r0+ r1-4]`
			`movq m2, [r0+2*r1-4]`
			`movq m3, [r0+ r3-4]`
			`TRANSPOSE4x4B 0, 1, 2, 3, 4`
			`%else`
			`movq m0, [r0 -4]`
			`movq m4, [r0+ r1-4]`
			`movq m1, [r0+2*r1-4]`
			`movq m5, [r0+ r3-4]`
			`movq m2, [r4 -4]`
			`movq m6, [r4+ r1-4]`
			`movq m3, [r4+2*r1-4]`
			`movq m7, [r4+ r3-4]`
			`punpcklbw m0, m4`
			`punpcklbw m1, m5`
			`punpcklbw m2, m6`
			`punpcklbw m3, m7`
			`TRANSPOSE4x4W 0, 1, 2, 3, 4`
			`%endif`
			`pxor m5, m5`

			`UNPACK_8TO16 bw, 6, 0, 5`
			`UNPACK_8TO16 bw, 7, 1, 5`
			`VC1_LOOP_FILTER_A0 m6, m0, m7, m1`
			`UNPACK_8TO16 bw, 4, 2, 5`
			`mova m0, m1 ; m0 = p0`
			`VC1_LOOP_FILTER_A0 m7, m1, m4, m2`
			`UNPACK_8TO16 bw, 1, 3, 5`
			`mova m5, m4`
			`VC1_LOOP_FILTER_A0 m5, m2, m1, m3`
			`SWAP 1, 4 ; m1 = q0`

			`VC1_FILTER %1`
			`punpcklbw m0, m1`
			`%if %0 > 1`
			`STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2`
			`%if %1 > 4`
			`psrldq m0, 4`
			`STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2`
			`%endif`
			`%else`
			`STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0`
			`STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4`
			`%endif`
			`%endmacro`


			`%macro START_V_FILTER 0`
			`mov r4, r0`
			`lea r3, [4*r1]`
			`sub r4, r3`
			`lea r3, [r1+2*r1]`
			`imul r2, 0x01010101`
			`%endmacro`

			`%macro START_H_FILTER 1`
			`lea r3, [r1+2*r1]`
			`%if %1 > 4`
			`lea r4, [r0+4*r1]`
			`%endif`
			`imul r2, 0x01010101`
			`%endmacro`

			`%macro VC1_LF_MMX 1`
			`INIT_MMX`
			`cglobal vc1_v_loop_filter_internal_%1`
			`VC1_V_LOOP_FILTER 4, d`
			`ret`

			`cglobal vc1_h_loop_filter_internal_%1`
			`VC1_H_LOOP_FILTER 4, r4`
			`ret`

			`; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq)`
			`cglobal vc1_v_loop_filter4_%1, 3,5,0`
			`START_V_FILTER`
			`call vc1_v_loop_filter_internal_%1`
			`RET`

			`; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq)`
			`cglobal vc1_h_loop_filter4_%1, 3,5,0`
			`START_H_FILTER 4`
			`call vc1_h_loop_filter_internal_%1`
			`RET`

			`; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq)`
			`cglobal vc1_v_loop_filter8_%1, 3,5,0`
			`START_V_FILTER`
			`call vc1_v_loop_filter_internal_%1`
			`add r4, 4`
			`add r0, 4`
			`call vc1_v_loop_filter_internal_%1`
			`RET`

			`; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq)`
			`cglobal vc1_h_loop_filter8_%1, 3,5,0`
			`START_H_FILTER 4`
			`call vc1_h_loop_filter_internal_%1`
			`lea r0, [r0+4*r1]`
			`call vc1_h_loop_filter_internal_%1`
			`RET`
			`%endmacro`

			`%define PABSW PABSW_MMX2`
			`VC1_LF_MMX mmx2`

			`INIT_XMM`
			`; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)`
			`cglobal vc1_v_loop_filter8_sse2, 3,5,8`
			`START_V_FILTER`
			`VC1_V_LOOP_FILTER 8, q`
			`RET`

			`; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)`
			`cglobal vc1_h_loop_filter8_sse2, 3,6,8`
			`START_H_FILTER 8`
			`VC1_H_LOOP_FILTER 8, r5`
			`RET`

			`%define PABSW PABSW_SSSE3`

			`INIT_MMX`
			`; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)`
			`cglobal vc1_v_loop_filter4_ssse3, 3,5,0`
			`START_V_FILTER`
			`VC1_V_LOOP_FILTER 4, d`
			`RET`

			`; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)`
			`cglobal vc1_h_loop_filter4_ssse3, 3,5,0`
			`START_H_FILTER 4`
			`VC1_H_LOOP_FILTER 4, r4`
			`RET`

			`INIT_XMM`
			`; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)`
			`cglobal vc1_v_loop_filter8_ssse3, 3,5,8`
			`START_V_FILTER`
			`VC1_V_LOOP_FILTER 8, q`
			`RET`

			`; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)`
			`cglobal vc1_h_loop_filter8_ssse3, 3,6,8`
			`START_H_FILTER 8`
			`VC1_H_LOOP_FILTER 8, r5`
			`RET`

			`; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)`
			`cglobal vc1_h_loop_filter8_sse4, 3,5,8`
			`START_H_FILTER 8`
			`VC1_H_LOOP_FILTER 8`
			`RET`