vpx/vp9/encoder/x86/vp9_dct_mmx.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
;
; Forward 4x4 transform of 16-bit samples using MMX. The work is done as two
; 1-D passes, each preceded by a 4x4 word transpose so that one MMX register
; carries the same coefficient position for four lanes at once.
;
;   input  - first of four rows; each row is read as four 16-bit words
;   output - receives 16 words, stored as four 8-byte rows at +0, +8, +16, +24
;   pitch  - distance between input rows in BYTES (it is added unscaled to
;            the input address)
;
; Pass 1 (along each input row):
;   a1 = (ip[0] + ip[3]) << 3      b1 = (ip[1] + ip[2]) << 3
;   c1 = (ip[1] - ip[2]) << 3      d1 = (ip[0] - ip[3]) << 3
;   op[0] = a1 + b1                op[2] = a1 - b1
;   op[1] = (c1*2217 + d1*5352 + 14500) >> 12
;   op[3] = (d1*2217 - c1*5352 +  7500) >> 12
; Pass 2 (down each column of the pass-1 result, no pre-shift):
;   op[0]  = (a1 + b1 + 7) >> 4    op[8] = (a1 - b1 + 7) >> 4
;   op[4]  = ((c1*2217 + d1*5352 + 12000) >> 16) + (d1 != 0)
;   op[12] = (d1*2217 - c1*5352 + 51000) >> 16
;
; Registers: rsi and rdi are saved and restored; rax and rcx are overwritten;
; mm0-mm6 are used as scratch. No emms is executed in this routine.
; NOTE(review): callers presumably reset the MMX/x87 state themselves before
; any floating-point code runs - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_short_fdct4x4_mmx) PRIVATE
sym(vp9_short_fdct4x4_mmx):
    push        rbp
    mov         rbp,        rsp
    ; SHADOW_ARGS_TO_STACK / GET_GOT come from x86_abi_support.asm; their
    ; expansion is ABI dependent and not visible in this file. They are undone
    ; by UNSHADOW_ARGS / RESTORE_GOT in the epilog, in reverse order.
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(0)      ; input
        mov         rdi,        arg(1)      ; output

        movsxd      rax,        dword ptr arg(2) ; pitch (row stride in bytes), sign-extended

        ; rcx -> third input row (input + 2*pitch)
        lea         rcx,        [rsi + rax*2]
        ; read the input data: one 4-word row per register
        movq        mm0,        [rsi]
        movq        mm1,        [rsi + rax]

        movq        mm2,        [rcx]
        movq        mm4,        [rcx + rax]

        ; transpose for the first stage
        ; (lane comments are "row column", lowest word first)
        movq        mm3,        mm0         ; 00 01 02 03
        movq        mm5,        mm2         ; 20 21 22 23

        punpcklwd   mm0,        mm1         ; 00 10 01 11
        punpckhwd   mm3,        mm1         ; 02 12 03 13

        punpcklwd   mm2,        mm4         ; 20 30 21 31
        punpckhwd   mm5,        mm4         ; 22 32 23 33

        movq        mm1,        mm0         ; 00 10 01 11
        punpckldq   mm0,        mm2         ; 00 10 20 30

        punpckhdq   mm1,        mm2         ; 01 11 21 31

        movq        mm2,        mm3         ; 02 12 03 13
        punpckldq   mm2,        mm5         ; 02 12 22 32

        punpckhdq   mm3,        mm5         ; 03 13 23 33

        ; after the transpose each register holds one input column,
        ; i.e. the same ip[] position for rows 0..3:
        ; mm0 = ip[0]
        ; mm1 = ip[1]
        ; mm2 = ip[2]
        ; mm3 = ip[3]

        ; first stage
        movq        mm5,        mm0
        movq        mm4,        mm1

        paddw       mm0,        mm3         ; a1 = 0 + 3
        paddw       mm1,        mm2         ; b1 = 1 + 2

        psubw       mm4,        mm2         ; c1 = 1 - 2
        psubw       mm5,        mm3         ; d1 = 0 - 3

        ; scale d1, c1, a1, b1 by 8 (<< 3) before the butterflies
        psllw       mm5,        3
        psllw       mm4,        3

        psllw       mm0,        3
        psllw       mm1,        3

        ; output 0 and 2
        movq        mm2,        mm0         ; a1

        paddw       mm0,        mm1         ; op[0] = a1 + b1
        psubw       mm2,        mm1         ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1 so that pmaddwd can form both products per lane
        movq        mm1,        mm5         ; d1
        punpcklwd   mm1,        mm4         ; d1[0] c1[0] d1[1] c1[1]
        punpckhwd   mm5,        mm4         ; d1[2] c1[2] d1[3] c1[3]

        movq        mm3,        mm1
        movq        mm4,        mm5

        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        ; add the first-pass rounding terms to the 32-bit sums
        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]

        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        ; narrow the dwords back to words with signed saturation
        packssdw    mm1,        mm4         ; op[1]
        packssdw    mm3,        mm5         ; op[3]

        ; done with the first (row) pass
        ; transpose for the second stage
        ; (lane comments are now "row output-index", lowest word first)
        movq        mm4,        mm0         ; 00 10 20 30
        movq        mm5,        mm2         ; 02 12 22 32

        punpcklwd   mm0,        mm1         ; 00 01 10 11
        punpckhwd   mm4,        mm1         ; 20 21 30 31

        punpcklwd   mm2,        mm3         ; 02 03 12 13
        punpckhwd   mm5,        mm3         ; 22 23 32 33

        movq        mm1,        mm0         ; 00 01 10 11
        punpckldq   mm0,        mm2         ; 00 01 02 03

        punpckhdq   mm1,        mm2         ; 10 11 12 13

        movq        mm2,        mm4         ; 20 21 30 31
        punpckldq   mm2,        mm5         ; 20 21 22 23

        punpckhdq   mm4,        mm5         ; 30 31 32 33

        ; each register now holds one row of first-pass results:
        ; mm0 = row 0
        ; mm1 = row 1
        ; mm2 = row 2
        ; mm4 = row 3   (note: mm4, not mm3)

        ; second stage (no << 3 this time)
        movq        mm5,        mm0
        movq        mm3,        mm1

        paddw       mm0,        mm4         ; a1 = 0 + 3
        paddw       mm1,        mm2         ; b1 = 1 + 2

        psubw       mm3,        mm2         ; c1 = 1 - 2
        psubw       mm5,        mm4         ; d1 = 0 - 3

        pxor        mm6,        mm6         ; zero out for compare

        pcmpeqw     mm6,        mm5         ; mm6 = 0xFFFF in words where d1 == 0

        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; mm6 = ~mm6 & 1, i.e.
                                                                ; 1 where d1 != 0, else 0

        ; output 0 and 2
        movq        mm2,        mm0         ; a1

        paddw       mm0,        mm1         ; a1 + b1
        psubw       mm2,        mm1         ; a1 - b1

        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]

        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4

        ; store output rows 0 and 2 (words 0..3 and 8..11)
        movq        MMWORD PTR[rdi + 0 ],  mm0
        movq        MMWORD PTR[rdi + 16],  mm2

        ; output 1 and 3
        ; interleave c1, d1 so that pmaddwd can form both products per lane
        movq        mm1,        mm5         ; d1
        punpcklwd   mm1,        mm3         ; d1[0] c1[0] d1[1] c1[1]
        punpckhwd   mm5,        mm3         ; d1[2] c1[2] d1[3] c1[3]

        movq        mm3,        mm1
        movq        mm4,        mm5

        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        ; second-pass rounding terms differ from the first pass
        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]

        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        ; narrow the dwords back to words with signed saturation
        packssdw    mm1,        mm4         ; op[4]
        packssdw    mm3,        mm5         ; op[12]

        paddw       mm1,        mm6         ; op[4] += (d1!=0)

        ; store output rows 1 and 3 (words 4..7 and 12..15)
        movq        MMWORD PTR[rdi + 8 ],  mm1
        movq        MMWORD PTR[rdi + 24],  mm3

     ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA

; Constant operands for vp9_short_fdct4x4_mmx. Each table is exactly 8 bytes
; and 8-byte aligned so it can be used directly as a 64-bit MMX memory operand.

; pmaddwd multiplier pairs (four 16-bit words)
align 8
_5352_2217:
    dw 5352, 2217, 5352, 2217
align 8
_2217_neg5352:
    dw 2217, -5352, 2217, -5352

; per-word constants: the (d1 != 0) mask and the +7 rounding before >>4
align 8
_cmp_mask:
    dw 1, 1, 1, 1
align 8
_7w:
    dw 7, 7, 7, 7

; 32-bit rounding terms added before >>12 (first pass)
align 8
_14500:
    dd 14500, 14500
align 8
_7500:
    dd 7500, 7500

; 32-bit rounding terms added before >>16 (second pass)
align 8
_12000:
    dd 12000, 12000
align 8
_51000:
    dd 51000, 51000
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
Use WebM in copyright notice for consistency Changes 'The VP8 project' to 'The WebM project', for consistency with other webmproject.org repositories. Fixes issue #97. Change-Id: I37c13ed5fbdb9d334ceef71c6350e9febed9bbba 2010-09-09 14:16:39 +02:00			`; Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; Use of this source code is governed by a BSD-style license`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; in the file PATENTS. All contributing project authors may`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; be found in the AUTHORS file in the root of the source tree.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`


			`%include "vpx_ports/x86_abi_support.asm"`

Change encoder vp8_ and vp8cx_ public symbol prefixes to vp9_. Change-Id: Ie2e3652591b010ded10c216501ce24fd95d0aec5 2012-10-30 20:58:42 +01:00			`;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)`
add private to assembly files to insure proper chromebuild Change-Id: I6e43ca73f35401a974ed8ee27738d4318f09fd37 2012-12-20 18:40:18 +01:00			`global sym(vp9_short_fdct4x4_mmx) PRIVATE`
Change encoder vp8_ and vp8cx_ public symbol prefixes to vp9_. Change-Id: Ie2e3652591b010ded10c216501ce24fd95d0aec5 2012-10-30 20:58:42 +01:00			`sym(vp9_short_fdct4x4_mmx):`
Initial WebM release 2010-05-18 17:58:33 +02:00			`push rbp`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`mov rbp, rsp`
Initial WebM release 2010-05-18 17:58:33 +02:00			`SHADOW_ARGS_TO_STACK 3`
			`GET_GOT rbx`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`push rsi`
			`push rdi`
Initial WebM release 2010-05-18 17:58:33 +02:00			`; end prolog`

FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`mov rsi, arg(0) ; input`
			`mov rdi, arg(1) ; output`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movsxd rax, dword ptr arg(2) ;pitch`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`lea rcx, [rsi + rax*2]`
Initial WebM release 2010-05-18 17:58:33 +02:00			`; read the input data`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm0, [rsi]`
			`movq mm1, [rsi + rax]`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm2, [rcx]`
			`movq mm4, [rcx + rax]`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`; transpose for the first stage`
			`movq mm3, mm0 ; 00 01 02 03`
			`movq mm5, mm2 ; 20 21 22 23`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`punpcklwd mm0, mm1 ; 00 10 01 11`
			`punpckhwd mm3, mm1 ; 02 12 03 13`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`punpcklwd mm2, mm4 ; 20 30 21 31`
			`punpckhwd mm5, mm4 ; 22 32 23 33`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm1, mm0 ; 00 10 01 11`
			`punpckldq mm0, mm2 ; 00 10 20 30`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`punpckhdq mm1, mm2 ; 01 11 21 31`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm2, mm3 ; 02 12 03 13`
			`punpckldq mm2, mm5 ; 02 12 22 32`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`punpckhdq mm3, mm5 ; 03 13 23 33`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`; mm0 0`
			`; mm1 1`
			`; mm2 2`
			`; mm3 3`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`; first stage`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm5, mm0`
			`movq mm4, mm1`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`paddw mm0, mm3 ; a1 = 0 + 3`
			`paddw mm1, mm2 ; b1 = 1 + 2`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`psubw mm4, mm2 ; c1 = 1 - 2`
			`psubw mm5, mm3 ; d1 = 0 - 3`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`psllw mm5, 3`
			`psllw mm4, 3`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`psllw mm0, 3`
			`psllw mm1, 3`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`; output 0 and 2`
			`movq mm2, mm0 ; a1`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`paddw mm0, mm1 ; op[0] = a1 + b1`
			`psubw mm2, mm1 ; op[2] = a1 - b1`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`; output 1 and 3`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`; interleave c1, d1`
			`movq mm1, mm5 ; d1`
			`punpcklwd mm1, mm4 ; c1 d1`
			`punpckhwd mm5, mm4 ; c1 d1`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm3, mm1`
			`movq mm4, mm5`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352`
			`pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352`
			`pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`paddd mm1, MMWORD PTR[GLOBAL(_14500)]`
			`paddd mm4, MMWORD PTR[GLOBAL(_14500)]`
			`paddd mm3, MMWORD PTR[GLOBAL(_7500)]`
			`paddd mm5, MMWORD PTR[GLOBAL(_7500)]`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12`
			`psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12`
			`psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12`
			`psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`packssdw mm1, mm4 ; op[1]`
			`packssdw mm3, mm5 ; op[3]`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`; done with vertical`
			`; transpose for the second stage`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm4, mm0 ; 00 10 20 30`
			`movq mm5, mm2 ; 02 12 22 32`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`punpcklwd mm0, mm1 ; 00 01 10 11`
			`punpckhwd mm4, mm1 ; 20 21 30 31`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`punpcklwd mm2, mm3 ; 02 03 12 13`
			`punpckhwd mm5, mm3 ; 22 23 32 33`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm1, mm0 ; 00 01 10 11`
			`punpckldq mm0, mm2 ; 00 01 02 03`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`punpckhdq mm1, mm2 ; 01 22 12 13`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm2, mm4 ; 20 31 30 31`
			`punpckldq mm2, mm5 ; 20 21 22 23`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`punpckhdq mm4, mm5 ; 30 31 32 33`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`; mm0 0`
			`; mm1 1`
			`; mm2 2`
			`; mm3 4`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm5, mm0`
			`movq mm3, mm1`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`paddw mm0, mm4 ; a1 = 0 + 3`
			`paddw mm1, mm2 ; b1 = 1 + 2`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`psubw mm3, mm2 ; c1 = 1 - 2`
			`psubw mm5, mm4 ; d1 = 0 - 3`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`pxor mm6, mm6 ; zero out for compare`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`pcmpeqw mm6, mm5 ; d1 != 0`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,`
			`; and keep bit 0 of lower`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`; output 0 and 2`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm2, mm0 ; a1`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`paddw mm0, mm1 ; a1 + b1`
			`psubw mm2, mm1 ; a1 - b1`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`paddw mm0, MMWORD PTR[GLOBAL(_7w)]`
			`paddw mm2, MMWORD PTR[GLOBAL(_7w)]`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4`
			`psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq MMWORD PTR[rdi + 0 ], mm0`
			`movq MMWORD PTR[rdi + 16], mm2`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`; output 1 and 3`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`; interleave c1, d1`
			`movq mm1, mm5 ; d1`
			`punpcklwd mm1, mm3 ; c1 d1`
			`punpckhwd mm5, mm3 ; c1 d1`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq mm3, mm1`
			`movq mm4, mm5`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352`
			`pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352`
			`pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`paddd mm1, MMWORD PTR[GLOBAL(_12000)]`
			`paddd mm4, MMWORD PTR[GLOBAL(_12000)]`
			`paddd mm3, MMWORD PTR[GLOBAL(_51000)]`
			`paddd mm5, MMWORD PTR[GLOBAL(_51000)]`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16`
			`psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16`
			`psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16`
			`psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`packssdw mm1, mm4 ; op[4]`
			`packssdw mm3, mm5 ; op[12]`

			`paddw mm1, mm6 ; op[4] += (d1!=0)`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`movq MMWORD PTR[rdi + 8 ], mm1`
			`movq MMWORD PTR[rdi + 24], mm3`
Initial WebM release 2010-05-18 17:58:33 +02:00
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`; begin epilog`
			`pop rdi`
			`pop rsi`
Initial WebM release 2010-05-18 17:58:33 +02:00			`RESTORE_GOT`
			`UNSHADOW_ARGS`
			`pop rbp`
			`ret`

			`SECTION_RODATA`
FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match them most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 2010-10-21 19:53:15 +02:00			`align 8`
			`_5352_2217:`
			`dw 5352`
			`dw 2217`
			`dw 5352`
			`dw 2217`
			`align 8`
			`_2217_neg5352:`
			`dw 2217`
			`dw -5352`
			`dw 2217`
			`dw -5352`
			`align 8`
			`_cmp_mask:`
			`times 4 dw 1`
			`align 8`
			`_7w:`
			`times 4 dw 7`
			`align 8`
			`_14500:`
			`times 2 dd 14500`
			`align 8`
			`_7500:`
			`times 2 dd 7500`
			`align 8`
			`_12000:`
			`times 2 dd 12000`
			`align 8`
			`_51000:`
			`times 2 dd 51000`