FDCT optimizations.
Fixed up the fdct for mmx and 8x4 sse2 to match the most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719
This commit is contained in:
parent
647df00f30
commit
5f0e0617ba
@ -11,511 +11,231 @@
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
section .text
|
||||
global sym(vp8_short_fdct4x4_mmx)
|
||||
global sym(vp8_short_fdct8x4_wmt)
|
||||
|
||||
|
||||
%define DCTCONSTANTSBITS (16)
|
||||
%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
|
||||
%define x_c1 (60547) ; cos(pi /8) * (1<<15)
|
||||
%define x_c2 (46341) ; cos(pi*2/8) * (1<<15)
|
||||
%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
|
||||
|
||||
|
||||
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
|
||||
global sym(vp8_short_fdct4x4_mmx)
|
||||
sym(vp8_short_fdct4x4_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 3
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
mov rsi, arg(0) ;input
|
||||
mov rdi, arg(1) ;output
|
||||
|
||||
lea rdx, [GLOBAL(dct_const_mmx)]
|
||||
movsxd rax, dword ptr arg(2) ;pitch
|
||||
mov rsi, arg(0) ; input
|
||||
mov rdi, arg(1) ; output
|
||||
|
||||
lea rcx, [rsi + rax*2]
|
||||
movsxd rax, dword ptr arg(2) ;pitch
|
||||
|
||||
lea rcx, [rsi + rax*2]
|
||||
; read the input data
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rsi + rax ]
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rsi + rax]
|
||||
|
||||
movq mm2, [rcx]
|
||||
movq mm3, [rcx + rax]
|
||||
; get the constants
|
||||
;shift left by 3 for precision
|
||||
psllw mm0, 3
|
||||
psllw mm1, 3
|
||||
movq mm2, [rcx]
|
||||
movq mm4, [rcx + rax]
|
||||
|
||||
psllw mm2, 3
|
||||
psllw mm3, 3
|
||||
; transpose for the first stage
|
||||
movq mm3, mm0 ; 00 01 02 03
|
||||
movq mm5, mm2 ; 20 21 22 23
|
||||
|
||||
; transpose for the second stage
|
||||
movq mm4, mm0 ; 00 01 02 03
|
||||
movq mm5, mm2 ; 10 11 12 03
|
||||
punpcklwd mm0, mm1 ; 00 10 01 11
|
||||
punpckhwd mm3, mm1 ; 02 12 03 13
|
||||
|
||||
punpcklwd mm0, mm1 ; 00 10 01 11
|
||||
punpckhwd mm4, mm1 ; 02 12 03 13
|
||||
punpcklwd mm2, mm4 ; 20 30 21 31
|
||||
punpckhwd mm5, mm4 ; 22 32 23 33
|
||||
|
||||
punpcklwd mm2, mm3 ; 20 30 21 31
|
||||
punpckhwd mm5, mm3 ; 22 32 23 33
|
||||
movq mm1, mm0 ; 00 10 01 11
|
||||
punpckldq mm0, mm2 ; 00 10 20 30
|
||||
|
||||
punpckhdq mm1, mm2 ; 01 11 21 31
|
||||
|
||||
movq mm1, mm0 ; 00 10 01 11
|
||||
punpckldq mm0, mm2 ; 00 10 20 30
|
||||
movq mm2, mm3 ; 02 12 03 13
|
||||
punpckldq mm2, mm5 ; 02 12 22 32
|
||||
|
||||
punpckhdq mm1, mm2 ; 01 11 21 31
|
||||
|
||||
movq mm2, mm4 ; 02 12 03 13
|
||||
punpckldq mm2, mm5 ; 02 12 22 32
|
||||
|
||||
punpckhdq mm4, mm5 ; 03 13 23 33
|
||||
movq mm3, mm4
|
||||
punpckhdq mm3, mm5 ; 03 13 23 33
|
||||
|
||||
; mm0 0
|
||||
; mm1 1
|
||||
; mm2 2
|
||||
; mm3 3
|
||||
|
||||
; first stage
|
||||
movq mm5, mm0
|
||||
movq mm4, mm1
|
||||
movq mm5, mm0
|
||||
movq mm4, mm1
|
||||
|
||||
paddw mm0, mm3 ; a = 0 + 3
|
||||
paddw mm1, mm2 ; b = 1 + 2
|
||||
paddw mm0, mm3 ; a1 = 0 + 3
|
||||
paddw mm1, mm2 ; b1 = 1 + 2
|
||||
|
||||
psubw mm4, mm2 ; c = 1 - 2
|
||||
psubw mm5, mm3 ; d = 0 - 3
|
||||
psubw mm4, mm2 ; c1 = 1 - 2
|
||||
psubw mm5, mm3 ; d1 = 0 - 3
|
||||
|
||||
psllw mm5, 3
|
||||
psllw mm4, 3
|
||||
|
||||
psllw mm0, 3
|
||||
psllw mm1, 3
|
||||
|
||||
; output 0 and 2
|
||||
movq mm6, [rdx + 16] ; c2
|
||||
movq mm2, mm0 ; a
|
||||
movq mm2, mm0 ; a1
|
||||
|
||||
paddw mm0, mm1 ; a + b
|
||||
psubw mm2, mm1 ; a - b
|
||||
|
||||
movq mm1, mm0 ; a + b
|
||||
pmulhw mm0, mm6 ; 00 01 02 03
|
||||
|
||||
paddw mm0, mm1 ; output 00 01 02 03
|
||||
pmulhw mm6, mm2 ; 20 21 22 23
|
||||
|
||||
paddw mm2, mm6 ; output 20 21 22 23
|
||||
paddw mm0, mm1 ; op[0] = a1 + b1
|
||||
psubw mm2, mm1 ; op[2] = a1 - b1
|
||||
|
||||
; output 1 and 3
|
||||
movq mm6, [rdx + 8] ; c1
|
||||
movq mm7, [rdx + 24] ; c3
|
||||
; interleave c1, d1
|
||||
movq mm1, mm5 ; d1
|
||||
punpcklwd mm1, mm4 ; c1 d1
|
||||
punpckhwd mm5, mm4 ; c1 d1
|
||||
|
||||
movq mm1, mm4 ; c
|
||||
movq mm3, mm5 ; d
|
||||
movq mm3, mm1
|
||||
movq mm4, mm5
|
||||
|
||||
pmulhw mm1, mm7 ; c * c3
|
||||
pmulhw mm3, mm6 ; d * c1
|
||||
pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
|
||||
paddw mm3, mm5 ; d * c1 rounded
|
||||
paddw mm1, mm3 ; output 10 11 12 13
|
||||
pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
|
||||
movq mm3, mm4 ; c
|
||||
pmulhw mm5, mm7 ; d * c3
|
||||
paddd mm1, MMWORD PTR[GLOBAL(_14500)]
|
||||
paddd mm4, MMWORD PTR[GLOBAL(_14500)]
|
||||
paddd mm3, MMWORD PTR[GLOBAL(_7500)]
|
||||
paddd mm5, MMWORD PTR[GLOBAL(_7500)]
|
||||
|
||||
pmulhw mm4, mm6 ; c * c1
|
||||
paddw mm3, mm4 ; round c* c1
|
||||
|
||||
psubw mm5, mm3 ; output 30 31 32 33
|
||||
movq mm3, mm5
|
||||
psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
|
||||
psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
|
||||
psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
|
||||
psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
|
||||
|
||||
packssdw mm1, mm4 ; op[1]
|
||||
packssdw mm3, mm5 ; op[3]
|
||||
|
||||
; done with vertical
|
||||
; transpose for the second stage
|
||||
movq mm4, mm0 ; 00 01 02 03
|
||||
movq mm5, mm2 ; 10 11 12 03
|
||||
movq mm4, mm0 ; 00 10 20 30
|
||||
movq mm5, mm2 ; 02 12 22 32
|
||||
|
||||
punpcklwd mm0, mm1 ; 00 10 01 11
|
||||
punpckhwd mm4, mm1 ; 02 12 03 13
|
||||
punpcklwd mm0, mm1 ; 00 01 10 11
|
||||
punpckhwd mm4, mm1 ; 20 21 30 31
|
||||
|
||||
punpcklwd mm2, mm3 ; 20 30 21 31
|
||||
punpckhwd mm5, mm3 ; 22 32 23 33
|
||||
punpcklwd mm2, mm3 ; 02 03 12 13
|
||||
punpckhwd mm5, mm3 ; 22 23 32 33
|
||||
|
||||
movq mm1, mm0 ; 00 01 10 11
|
||||
punpckldq mm0, mm2 ; 00 01 02 03
|
||||
|
||||
movq mm1, mm0 ; 00 10 01 11
|
||||
punpckldq mm0, mm2 ; 00 10 20 30
|
||||
punpckhdq mm1, mm2 ; 01 22 12 13
|
||||
|
||||
punpckhdq mm1, mm2 ; 01 11 21 31
|
||||
movq mm2, mm4 ; 20 31 30 31
|
||||
punpckldq mm2, mm5 ; 20 21 22 23
|
||||
|
||||
movq mm2, mm4 ; 02 12 03 13
|
||||
punpckldq mm2, mm5 ; 02 12 22 32
|
||||
punpckhdq mm4, mm5 ; 30 31 32 33
|
||||
|
||||
punpckhdq mm4, mm5 ; 03 13 23 33
|
||||
movq mm3, mm4
|
||||
; mm0 0
|
||||
; mm1 1
|
||||
; mm2 2
|
||||
; mm3 4
|
||||
|
||||
movq mm5, mm0
|
||||
movq mm3, mm1
|
||||
|
||||
; first stage
|
||||
movq mm5, mm0
|
||||
movq mm4, mm1
|
||||
paddw mm0, mm4 ; a1 = 0 + 3
|
||||
paddw mm1, mm2 ; b1 = 1 + 2
|
||||
|
||||
paddw mm0, mm3 ; a = 0 + 3
|
||||
paddw mm1, mm2 ; b = 1 + 2
|
||||
psubw mm3, mm2 ; c1 = 1 - 2
|
||||
psubw mm5, mm4 ; d1 = 0 - 3
|
||||
|
||||
psubw mm4, mm2 ; c = 1 - 2
|
||||
psubw mm5, mm3 ; d = 0 - 3
|
||||
pxor mm6, mm6 ; zero out for compare
|
||||
|
||||
pcmpeqw mm6, mm5 ; d1 != 0
|
||||
|
||||
pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
|
||||
; and keep bit 0 of lower
|
||||
|
||||
; output 0 and 2
|
||||
movq mm6, [rdx + 16] ; c2
|
||||
movq mm2, mm0 ; a
|
||||
paddw mm0, mm1 ; a + b
|
||||
movq mm2, mm0 ; a1
|
||||
|
||||
psubw mm2, mm1 ; a - b
|
||||
paddw mm0, mm1 ; a1 + b1
|
||||
psubw mm2, mm1 ; a1 - b1
|
||||
|
||||
movq mm1, mm0 ; a + b
|
||||
pmulhw mm0, mm6 ; 00 01 02 03
|
||||
paddw mm0, MMWORD PTR[GLOBAL(_7w)]
|
||||
paddw mm2, MMWORD PTR[GLOBAL(_7w)]
|
||||
|
||||
paddw mm0, mm1 ; output 00 01 02 03
|
||||
pmulhw mm6, mm2 ; 20 21 22 23
|
||||
|
||||
paddw mm2, mm6 ; output 20 21 22 23
|
||||
psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
|
||||
psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
|
||||
|
||||
movq MMWORD PTR[rdi + 0 ], mm0
|
||||
movq MMWORD PTR[rdi + 16], mm2
|
||||
|
||||
; output 1 and 3
|
||||
movq mm6, [rdx + 8] ; c1
|
||||
movq mm7, [rdx + 24] ; c3
|
||||
; interleave c1, d1
|
||||
movq mm1, mm5 ; d1
|
||||
punpcklwd mm1, mm3 ; c1 d1
|
||||
punpckhwd mm5, mm3 ; c1 d1
|
||||
|
||||
movq mm1, mm4 ; c
|
||||
movq mm3, mm5 ; d
|
||||
movq mm3, mm1
|
||||
movq mm4, mm5
|
||||
|
||||
pmulhw mm1, mm7 ; c * c3
|
||||
pmulhw mm3, mm6 ; d * c1
|
||||
pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
|
||||
paddw mm3, mm5 ; d * c1 rounded
|
||||
paddw mm1, mm3 ; output 10 11 12 13
|
||||
pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
|
||||
movq mm3, mm4 ; c
|
||||
pmulhw mm5, mm7 ; d * c3
|
||||
paddd mm1, MMWORD PTR[GLOBAL(_12000)]
|
||||
paddd mm4, MMWORD PTR[GLOBAL(_12000)]
|
||||
paddd mm3, MMWORD PTR[GLOBAL(_51000)]
|
||||
paddd mm5, MMWORD PTR[GLOBAL(_51000)]
|
||||
|
||||
pmulhw mm4, mm6 ; c * c1
|
||||
paddw mm3, mm4 ; round c* c1
|
||||
psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
|
||||
psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
|
||||
psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
|
||||
psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
|
||||
|
||||
psubw mm5, mm3 ; output 30 31 32 33
|
||||
movq mm3, mm5
|
||||
; done with vertical
|
||||
packssdw mm1, mm4 ; op[4]
|
||||
packssdw mm3, mm5 ; op[12]
|
||||
|
||||
pcmpeqw mm4, mm4
|
||||
pcmpeqw mm5, mm5
|
||||
psrlw mm4, 15
|
||||
psrlw mm5, 15
|
||||
paddw mm1, mm6 ; op[4] += (d1!=0)
|
||||
|
||||
psllw mm4, 2
|
||||
psllw mm5, 2
|
||||
movq MMWORD PTR[rdi + 8 ], mm1
|
||||
movq MMWORD PTR[rdi + 24], mm3
|
||||
|
||||
paddw mm0, mm4
|
||||
paddw mm1, mm5
|
||||
paddw mm2, mm4
|
||||
paddw mm3, mm5
|
||||
|
||||
psraw mm0, 3
|
||||
psraw mm1, 3
|
||||
psraw mm2, 3
|
||||
psraw mm3, 3
|
||||
|
||||
movq [rdi ], mm0
|
||||
movq [rdi+ 8], mm1
|
||||
movq [rdi+16], mm2
|
||||
movq [rdi+24], mm3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
|
||||
sym(vp8_short_fdct8x4_wmt):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 3
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
mov rsi, arg(0) ;input
|
||||
mov rdi, arg(1) ;output
|
||||
|
||||
lea rdx, [GLOBAL(dct_const_xmm)]
|
||||
movsxd rax, dword ptr arg(2) ;pitch
|
||||
|
||||
lea rcx, [rsi + rax*2]
|
||||
; read the input data
|
||||
movdqa xmm0, [rsi]
|
||||
movdqa xmm2, [rsi + rax]
|
||||
|
||||
movdqa xmm4, [rcx]
|
||||
movdqa xmm3, [rcx + rax]
|
||||
; get the constants
|
||||
;shift left by 3 for precision
|
||||
psllw xmm0, 3
|
||||
psllw xmm2, 3
|
||||
|
||||
psllw xmm4, 3
|
||||
psllw xmm3, 3
|
||||
|
||||
; transpose for the second stage
|
||||
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
|
||||
movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
|
||||
|
||||
punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
|
||||
punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
|
||||
|
||||
punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
|
||||
punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
|
||||
|
||||
movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
|
||||
punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
|
||||
|
||||
punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
|
||||
|
||||
|
||||
movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
|
||||
punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
|
||||
|
||||
punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
|
||||
movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
|
||||
|
||||
punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
|
||||
punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
|
||||
|
||||
movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
|
||||
punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
|
||||
|
||||
punpckhqdq xmm1, xmm4 ; 01 11 21 31 05 15 25 35
|
||||
|
||||
; xmm0 0
|
||||
; xmm1 1
|
||||
; xmm2 2
|
||||
; xmm3 3
|
||||
|
||||
; first stage
|
||||
movdqa xmm5, xmm0
|
||||
movdqa xmm4, xmm1
|
||||
|
||||
paddw xmm0, xmm3 ; a = 0 + 3
|
||||
paddw xmm1, xmm2 ; b = 1 + 2
|
||||
|
||||
psubw xmm4, xmm2 ; c = 1 - 2
|
||||
psubw xmm5, xmm3 ; d = 0 - 3
|
||||
|
||||
|
||||
; output 0 and 2
|
||||
movdqa xmm6, [rdx + 32] ; c2
|
||||
movdqa xmm2, xmm0 ; a
|
||||
|
||||
paddw xmm0, xmm1 ; a + b
|
||||
psubw xmm2, xmm1 ; a - b
|
||||
|
||||
movdqa xmm1, xmm0 ; a + b
|
||||
pmulhw xmm0, xmm6 ; 00 01 02 03
|
||||
|
||||
paddw xmm0, xmm1 ; output 00 01 02 03
|
||||
pmulhw xmm6, xmm2 ; 20 21 22 23
|
||||
|
||||
paddw xmm2, xmm6 ; output 20 21 22 23
|
||||
|
||||
; output 1 and 3
|
||||
movdqa xmm6, [rdx + 16] ; c1
|
||||
movdqa xmm7, [rdx + 48] ; c3
|
||||
|
||||
movdqa xmm1, xmm4 ; c
|
||||
movdqa xmm3, xmm5 ; d
|
||||
|
||||
pmulhw xmm1, xmm7 ; c * c3
|
||||
pmulhw xmm3, xmm6 ; d * c1
|
||||
|
||||
paddw xmm3, xmm5 ; d * c1 rounded
|
||||
paddw xmm1, xmm3 ; output 10 11 12 13
|
||||
|
||||
movdqa xmm3, xmm4 ; c
|
||||
pmulhw xmm5, xmm7 ; d * c3
|
||||
|
||||
pmulhw xmm4, xmm6 ; c * c1
|
||||
paddw xmm3, xmm4 ; round c* c1
|
||||
|
||||
psubw xmm5, xmm3 ; output 30 31 32 33
|
||||
movdqa xmm3, xmm5
|
||||
|
||||
|
||||
; done with vertical
|
||||
; transpose for the second stage
|
||||
movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36
|
||||
movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35
|
||||
|
||||
movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34
|
||||
movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36
|
||||
|
||||
punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31
|
||||
punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35
|
||||
|
||||
punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33
|
||||
punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
|
||||
|
||||
movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31
|
||||
punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13
|
||||
|
||||
punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33
|
||||
|
||||
|
||||
movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35
|
||||
punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17
|
||||
|
||||
punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37
|
||||
movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33
|
||||
|
||||
punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37
|
||||
punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27
|
||||
|
||||
movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13
|
||||
punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07
|
||||
|
||||
punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17
|
||||
|
||||
; first stage
|
||||
movdqa xmm5, xmm0
|
||||
movdqa xmm4, xmm1
|
||||
|
||||
paddw xmm0, xmm3 ; a = 0 + 3
|
||||
paddw xmm1, xmm2 ; b = 1 + 2
|
||||
|
||||
psubw xmm4, xmm2 ; c = 1 - 2
|
||||
psubw xmm5, xmm3 ; d = 0 - 3
|
||||
|
||||
|
||||
; output 0 and 2
|
||||
movdqa xmm6, [rdx + 32] ; c2
|
||||
movdqa xmm2, xmm0 ; a
|
||||
|
||||
paddw xmm0, xmm1 ; a + b
|
||||
psubw xmm2, xmm1 ; a - b
|
||||
|
||||
movdqa xmm1, xmm0 ; a + b
|
||||
pmulhw xmm0, xmm6 ; 00 01 02 03
|
||||
|
||||
paddw xmm0, xmm1 ; output 00 01 02 03
|
||||
pmulhw xmm6, xmm2 ; 20 21 22 23
|
||||
|
||||
paddw xmm2, xmm6 ; output 20 21 22 23
|
||||
|
||||
; output 1 and 3
|
||||
movdqa xmm6, [rdx + 16] ; c1
|
||||
movdqa xmm7, [rdx + 48] ; c3
|
||||
|
||||
movdqa xmm1, xmm4 ; c
|
||||
movdqa xmm3, xmm5 ; d
|
||||
|
||||
pmulhw xmm1, xmm7 ; c * c3
|
||||
pmulhw xmm3, xmm6 ; d * c1
|
||||
|
||||
paddw xmm3, xmm5 ; d * c1 rounded
|
||||
paddw xmm1, xmm3 ; output 10 11 12 13
|
||||
|
||||
movdqa xmm3, xmm4 ; c
|
||||
pmulhw xmm5, xmm7 ; d * c3
|
||||
|
||||
pmulhw xmm4, xmm6 ; c * c1
|
||||
paddw xmm3, xmm4 ; round c* c1
|
||||
|
||||
psubw xmm5, xmm3 ; output 30 31 32 33
|
||||
movdqa xmm3, xmm5
|
||||
; done with vertical
|
||||
|
||||
|
||||
pcmpeqw xmm4, xmm4
|
||||
pcmpeqw xmm5, xmm5;
|
||||
psrlw xmm4, 15
|
||||
psrlw xmm5, 15
|
||||
|
||||
psllw xmm4, 2
|
||||
psllw xmm5, 2
|
||||
|
||||
paddw xmm0, xmm4
|
||||
paddw xmm1, xmm5
|
||||
paddw xmm2, xmm4
|
||||
paddw xmm3, xmm5
|
||||
|
||||
psraw xmm0, 3
|
||||
psraw xmm1, 3
|
||||
psraw xmm2, 3
|
||||
psraw xmm3, 3
|
||||
|
||||
movq QWORD PTR[rdi ], xmm0
|
||||
movq QWORD PTR[rdi+ 8], xmm1
|
||||
movq QWORD PTR[rdi+16], xmm2
|
||||
movq QWORD PTR[rdi+24], xmm3
|
||||
|
||||
psrldq xmm0, 8
|
||||
psrldq xmm1, 8
|
||||
psrldq xmm2, 8
|
||||
psrldq xmm3, 8
|
||||
|
||||
movq QWORD PTR[rdi+32], xmm0
|
||||
movq QWORD PTR[rdi+40], xmm1
|
||||
movq QWORD PTR[rdi+48], xmm2
|
||||
movq QWORD PTR[rdi+56], xmm3
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
SECTION_RODATA
|
||||
;static const unsigned int dct1st_stage_rounding_mmx[2] =
|
||||
align 16
|
||||
dct1st_stage_rounding_mmx:
|
||||
times 2 dd 8192
|
||||
|
||||
|
||||
;static const unsigned int dct2nd_stage_rounding_mmx[2] =
|
||||
align 16
|
||||
dct2nd_stage_rounding_mmx:
|
||||
times 2 dd 32768
|
||||
|
||||
|
||||
;static const short dct_matrix[4][4]=
|
||||
align 16
|
||||
dct_matrix:
|
||||
times 4 dw 23170
|
||||
|
||||
dw 30274
|
||||
dw 12540
|
||||
dw -12540
|
||||
dw -30274
|
||||
|
||||
dw 23170
|
||||
times 2 dw -23170
|
||||
dw 23170
|
||||
|
||||
dw 12540
|
||||
dw -30274
|
||||
dw 30274
|
||||
dw -12540
|
||||
|
||||
|
||||
;static const unsigned short dct_const_mmx[4 * 4]=
|
||||
align 16
|
||||
dct_const_mmx:
|
||||
times 4 dw 0
|
||||
times 4 dw 60547
|
||||
times 4 dw 46341
|
||||
times 4 dw 25080
|
||||
|
||||
|
||||
;static const unsigned short dct_const_xmm[8 * 4]=
|
||||
align 16
|
||||
dct_const_xmm:
|
||||
times 8 dw 0
|
||||
times 8 dw 60547
|
||||
times 8 dw 46341
|
||||
times 8 dw 25080
|
||||
align 8
|
||||
_5352_2217:
|
||||
dw 5352
|
||||
dw 2217
|
||||
dw 5352
|
||||
dw 2217
|
||||
align 8
|
||||
_2217_neg5352:
|
||||
dw 2217
|
||||
dw -5352
|
||||
dw 2217
|
||||
dw -5352
|
||||
align 8
|
||||
_cmp_mask:
|
||||
times 4 dw 1
|
||||
align 8
|
||||
_7w:
|
||||
times 4 dw 7
|
||||
align 8
|
||||
_14500:
|
||||
times 2 dd 14500
|
||||
align 8
|
||||
_7500:
|
||||
times 2 dd 7500
|
||||
align 8
|
||||
_12000:
|
||||
times 2 dd 12000
|
||||
align 8
|
||||
_51000:
|
||||
times 2 dd 51000
|
||||
|
@ -11,32 +11,68 @@
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
|
||||
global sym(vp8_short_fdct4x4_sse2)
|
||||
sym(vp8_short_fdct4x4_sse2):
|
||||
%macro STACK_FRAME_CREATE 0
|
||||
%if ABI_IS_32BIT
|
||||
%define input rsi
|
||||
%define output rdi
|
||||
%define pitch rax
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 3
|
||||
;; SAVE_XMM
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0)
|
||||
movsxd rax, DWORD PTR arg(2)
|
||||
lea rdi, [rsi + rax*2]
|
||||
mov rdi, arg(1)
|
||||
|
||||
movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00
|
||||
movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10
|
||||
movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20
|
||||
movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30
|
||||
movsxd rax, dword ptr arg(2)
|
||||
lea rcx, [rsi + rax*2]
|
||||
%else
|
||||
%ifidn __OUTPUT_FORMAT__,x64
|
||||
%define input rcx
|
||||
%define output rdx
|
||||
%define pitch r8
|
||||
%else
|
||||
%define input rdi
|
||||
%define output rsi
|
||||
%define pitch rdx
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro STACK_FRAME_DESTROY 0
|
||||
%define input
|
||||
%define output
|
||||
%define pitch
|
||||
|
||||
%if ABI_IS_32BIT
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
pop rbp
|
||||
%else
|
||||
%ifidn __OUTPUT_FORMAT__,x64
|
||||
%endif
|
||||
%endif
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
|
||||
global sym(vp8_short_fdct4x4_sse2)
|
||||
sym(vp8_short_fdct4x4_sse2):
|
||||
|
||||
STACK_FRAME_CREATE
|
||||
|
||||
movq xmm0, MMWORD PTR[input ] ;03 02 01 00
|
||||
movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
|
||||
lea input, [input+2*pitch]
|
||||
movq xmm1, MMWORD PTR[input ] ;23 22 21 20
|
||||
movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
|
||||
|
||||
punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
|
||||
punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
|
||||
|
||||
mov rdi, arg(1)
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
|
||||
punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
|
||||
@ -51,6 +87,7 @@ sym(vp8_short_fdct4x4_sse2):
|
||||
psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
|
||||
psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
|
||||
psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
|
||||
|
||||
movdqa xmm1, xmm0
|
||||
pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
|
||||
pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
|
||||
@ -121,17 +158,216 @@ sym(vp8_short_fdct4x4_sse2):
|
||||
punpcklqdq xmm0, xmm3 ;op[4] op[0]
|
||||
punpckhqdq xmm1, xmm3 ;op[12] op[8]
|
||||
|
||||
movdqa XMMWORD PTR[rdi + 0], xmm0
|
||||
movdqa XMMWORD PTR[rdi + 16], xmm1
|
||||
movdqa XMMWORD PTR[output + 0], xmm0
|
||||
movdqa XMMWORD PTR[output + 16], xmm1
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
;; RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
STACK_FRAME_DESTROY
|
||||
|
||||
;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
|
||||
global sym(vp8_short_fdct8x4_sse2)
|
||||
sym(vp8_short_fdct8x4_sse2):
|
||||
|
||||
STACK_FRAME_CREATE
|
||||
|
||||
; read the input data
|
||||
movdqa xmm0, [input ]
|
||||
movdqa xmm2, [input+ pitch]
|
||||
lea input, [input+2*pitch]
|
||||
movdqa xmm4, [input ]
|
||||
movdqa xmm3, [input+ pitch]
|
||||
|
||||
; transpose for the first stage
|
||||
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
|
||||
movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
|
||||
|
||||
punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
|
||||
punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
|
||||
|
||||
punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
|
||||
punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
|
||||
|
||||
movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
|
||||
punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
|
||||
|
||||
punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
|
||||
|
||||
movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
|
||||
punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
|
||||
|
||||
punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
|
||||
movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
|
||||
|
||||
punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
|
||||
punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
|
||||
|
||||
movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
|
||||
punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
|
||||
|
||||
punpckhqdq xmm1, xmm4 ; 01 11 21 31 05 15 25 35
|
||||
|
||||
; xmm0 0
|
||||
; xmm1 1
|
||||
; xmm2 2
|
||||
; xmm3 3
|
||||
|
||||
; first stage
|
||||
movdqa xmm5, xmm0
|
||||
movdqa xmm4, xmm1
|
||||
|
||||
paddw xmm0, xmm3 ; a1 = 0 + 3
|
||||
paddw xmm1, xmm2 ; b1 = 1 + 2
|
||||
|
||||
psubw xmm4, xmm2 ; c1 = 1 - 2
|
||||
psubw xmm5, xmm3 ; d1 = 0 - 3
|
||||
|
||||
psllw xmm5, 3
|
||||
psllw xmm4, 3
|
||||
|
||||
psllw xmm0, 3
|
||||
psllw xmm1, 3
|
||||
|
||||
; output 0 and 2
|
||||
movdqa xmm2, xmm0 ; a1
|
||||
|
||||
paddw xmm0, xmm1 ; op[0] = a1 + b1
|
||||
psubw xmm2, xmm1 ; op[2] = a1 - b1
|
||||
|
||||
; output 1 and 3
|
||||
; interleave c1, d1
|
||||
movdqa xmm1, xmm5 ; d1
|
||||
punpcklwd xmm1, xmm4 ; c1 d1
|
||||
punpckhwd xmm5, xmm4 ; c1 d1
|
||||
|
||||
movdqa xmm3, xmm1
|
||||
movdqa xmm4, xmm5
|
||||
|
||||
pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
|
||||
pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
|
||||
paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
|
||||
paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
|
||||
paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
|
||||
paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
|
||||
|
||||
psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
|
||||
psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
|
||||
psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
|
||||
psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
|
||||
|
||||
packssdw xmm1, xmm4 ; op[1]
|
||||
packssdw xmm3, xmm5 ; op[3]
|
||||
|
||||
; done with vertical
|
||||
; transpose for the second stage
|
||||
movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
|
||||
movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
|
||||
|
||||
punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
|
||||
punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
|
||||
|
||||
punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
|
||||
punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
|
||||
|
||||
movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
|
||||
punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
|
||||
|
||||
punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
|
||||
|
||||
movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
|
||||
punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
|
||||
|
||||
punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
|
||||
movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
|
||||
|
||||
punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
|
||||
punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
|
||||
|
||||
movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
|
||||
punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
|
||||
|
||||
punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
|
||||
|
||||
; xmm0 0
|
||||
; xmm1 4
|
||||
; xmm2 1
|
||||
; xmm3 3
|
||||
|
||||
movdqa xmm5, xmm0
|
||||
movdqa xmm2, xmm1
|
||||
|
||||
paddw xmm0, xmm3 ; a1 = 0 + 3
|
||||
paddw xmm1, xmm4 ; b1 = 1 + 2
|
||||
|
||||
psubw xmm4, xmm2 ; c1 = 1 - 2
|
||||
psubw xmm5, xmm3 ; d1 = 0 - 3
|
||||
|
||||
pxor xmm6, xmm6 ; zero out for compare
|
||||
|
||||
pcmpeqw xmm6, xmm5 ; d1 != 0
|
||||
|
||||
pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper,
|
||||
; and keep bit 0 of lower
|
||||
|
||||
; output 0 and 2
|
||||
movdqa xmm2, xmm0 ; a1
|
||||
|
||||
paddw xmm0, xmm1 ; a1 + b1
|
||||
psubw xmm2, xmm1 ; a1 - b1
|
||||
|
||||
paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
|
||||
paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
|
||||
|
||||
psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
|
||||
psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
|
||||
|
||||
; output 1 and 3
|
||||
; interleave c1, d1
|
||||
movdqa xmm1, xmm5 ; d1
|
||||
punpcklwd xmm1, xmm4 ; c1 d1
|
||||
punpckhwd xmm5, xmm4 ; c1 d1
|
||||
|
||||
movdqa xmm3, xmm1
|
||||
movdqa xmm4, xmm5
|
||||
|
||||
pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
|
||||
|
||||
pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
|
||||
|
||||
paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
|
||||
paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
|
||||
paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
|
||||
paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
|
||||
|
||||
psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
|
||||
psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
|
||||
psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
|
||||
psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
|
||||
|
||||
packssdw xmm1, xmm4 ; op[4]
|
||||
packssdw xmm3, xmm5 ; op[12]
|
||||
|
||||
paddw xmm1, xmm6 ; op[4] += (d1!=0)
|
||||
|
||||
movdqa xmm4, xmm0
|
||||
movdqa xmm5, xmm2
|
||||
|
||||
punpcklqdq xmm0, xmm1
|
||||
punpckhqdq xmm4, xmm1
|
||||
|
||||
punpcklqdq xmm2, xmm3
|
||||
punpckhqdq xmm5, xmm3
|
||||
|
||||
movdqa XMMWORD PTR[output + 0 ], xmm0
|
||||
movdqa XMMWORD PTR[output + 16], xmm2
|
||||
movdqa XMMWORD PTR[output + 32], xmm4
|
||||
movdqa XMMWORD PTR[output + 48], xmm5
|
||||
|
||||
STACK_FRAME_DESTROY
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
@ -161,7 +397,9 @@ align 16
|
||||
_cmp_mask:
|
||||
times 4 dw 1
|
||||
times 4 dw 0
|
||||
|
||||
align 16
|
||||
_cmp_mask8x4:
|
||||
times 8 dw 1
|
||||
align 16
|
||||
_mult_sub:
|
||||
dw 1
|
||||
@ -176,6 +414,9 @@ align 16
|
||||
_7:
|
||||
times 4 dd 7
|
||||
align 16
|
||||
_7w:
|
||||
times 8 dw 7
|
||||
align 16
|
||||
_14500:
|
||||
times 4 dd 14500
|
||||
align 16
|
||||
|
@ -24,33 +24,31 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx);
|
||||
extern prototype_fdct(vp8_short_fdct8x4_mmx);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#if 0
|
||||
|
||||
#undef vp8_fdct_short4x4
|
||||
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
|
||||
|
||||
#undef vp8_fdct_short8x4
|
||||
#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if HAVE_SSE2
|
||||
extern prototype_fdct(vp8_short_fdct8x4_wmt);
|
||||
extern prototype_fdct(vp8_short_fdct8x4_sse2);
|
||||
extern prototype_fdct(vp8_short_walsh4x4_sse2);
|
||||
|
||||
extern prototype_fdct(vp8_short_fdct4x4_sse2);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#if 1
|
||||
/* short SSE2 DCT currently disabled, does not match the MMX version */
|
||||
|
||||
#undef vp8_fdct_short4x4
|
||||
#define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
|
||||
|
||||
#undef vp8_fdct_short8x4
|
||||
#define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
|
||||
#endif
|
||||
|
||||
#undef vp8_fdct_fast4x4
|
||||
#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
|
||||
@ -58,7 +56,7 @@ extern prototype_fdct(vp8_short_fdct4x4_sse2);
|
||||
#undef vp8_fdct_fast8x4
|
||||
#define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
|
||||
|
||||
#undef vp8_fdct_walsh_short4x4
|
||||
#undef vp8_fdct_walsh_short4x4
|
||||
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
|
||||
|
||||
#endif
|
||||
|
@ -18,11 +18,10 @@
|
||||
#if HAVE_MMX
|
||||
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
|
||||
{
|
||||
vp8_short_fdct4x4_c(input, output, pitch);
|
||||
vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
|
||||
vp8_short_fdct4x4_mmx(input, output, pitch);
|
||||
vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
|
||||
}
|
||||
|
||||
|
||||
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
|
||||
short *qcoeff_ptr, short *dequant_ptr,
|
||||
short *scan_mask, short *round_ptr,
|
||||
@ -82,12 +81,6 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
|
||||
{
|
||||
vp8_short_fdct4x4_sse2(input, output, pitch);
|
||||
vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
|
||||
}
|
||||
|
||||
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
|
||||
short *qcoeff_ptr, short *dequant_ptr,
|
||||
short *scan_mask, short *round_ptr,
|
||||
@ -249,18 +242,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||
cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
|
||||
cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
|
||||
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
|
||||
#if 0 // new fdct
|
||||
|
||||
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
|
||||
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
|
||||
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
|
||||
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
|
||||
#else
|
||||
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
|
||||
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
|
||||
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
|
||||
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
|
||||
|
||||
#endif
|
||||
|
||||
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user