vpx/vp8/encoder/x86/dct_mmx.asm
John Koleszar 09202d8071 LICENSE: update with latest text
Change-Id: Ieebea089095d9073b3a94932791099f614ce120c
2010-06-04 16:19:40 -04:00

848 lines
23 KiB
NASM

;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
section .text
global sym(vp8_short_fdct4x4_mmx)
global sym(vp8_fast_fdct4x4_mmx)
global sym(vp8_fast_fdct8x4_wmt)
; Fixed-point parameters for the forward DCT routines in this file.
; NOTE: DCTCONSTANTSBITS, DCTROUNDINGVALUE and x_c1..x_c3 are not referenced by
; any instruction visible in this file; the routines load their multipliers
; from the constant tables in the read-only data section instead.
%define DCTCONSTANTSBITS (16)
%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
; The three cosine constants are scaled by 1<<16 (not 1<<15):
; e.g. cos(pi/8) * 65536 = 60547.3.
%define x_c1 (60547) ; cos(pi /8) * (1<<16)
%define x_c2 (46341) ; cos(pi*2/8) * (1<<16)
%define x_c3 (25080) ; cos(pi*3/8) * (1<<16)
; Arithmetic right shifts applied (after rounding) at the end of the first and
; second matrix-multiply pass of vp8_short_fdct4x4_mmx.
%define _1STSTAGESHIFT 14
%define _2NDSTAGESHIFT 16
;-----------------------------------------------------------------------------
; SHORT_FDCT4X4_MMX_DOT4 coeff_offset, rounding_label, shift
;
; Multiplies every source row by one coefficient row of dct_matrix.
;   %1  byte offset of the 4-word coefficient row, relative to rdx
;   %2  label of the two-dword rounding constant added before the shift
;   %3  arithmetic right shift applied to each 32-bit sum
; In:   mm0..mm3 = four source rows (4 signed words each), rdx = dct_matrix
; Out:  mm4 = four signed words; word i is the rounded, shifted and
;       signed-saturated dot product of source row i with the coefficient row
; Clobbers mm5, mm6, mm7.  mm0..mm3 are left untouched.
;-----------------------------------------------------------------------------
%macro SHORT_FDCT4X4_MMX_DOT4 3
    ; rows 0 and 1: pmaddwd yields two partial sums per row, which the
    ; unpack/add pair folds into one dword per row
    movq        mm4, mm0
    pmaddwd     mm4, [rdx + %1]
    movq        mm5, mm1
    pmaddwd     mm5, [rdx + %1]
    movq        mm6, mm4
    punpckldq   mm4, mm5
    punpckhdq   mm6, mm5
    paddd       mm4, mm6                ; mm4 = { dot(row0), dot(row1) }

    ; rows 2 and 3, same scheme
    movq        mm5, mm2
    pmaddwd     mm5, [rdx + %1]
    movq        mm6, mm3
    pmaddwd     mm6, [rdx + %1]
    movq        mm7, mm5
    punpckldq   mm5, mm6
    punpckhdq   mm7, mm6
    paddd       mm5, mm7                ; mm5 = { dot(row2), dot(row3) }

    ; round, scale back down and pack the four dwords into four words
    movq        mm6, [%2 GLOBAL]
    paddd       mm4, mm6
    paddd       mm5, mm6
    psrad       mm4, %3
    psrad       mm5, %3
    packssdw    mm4, mm5
%endmacro

;-----------------------------------------------------------------------------
; void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
;
; 4x4 forward transform done as two matrix-multiply passes against dct_matrix.
;   input   four rows of four signed 16-bit values; consecutive rows are
;           `pitch` BYTES apart (the pitch is added to the pointer unscaled)
;   output  32 contiguous bytes (4 rows x 4 words)
;
; Each pass writes, as output row k, the dot products of every source row
; with coefficient row k, so a pass also transposes its result.  The first
; pass rounds with dct1st_stage_rounding_mmx and shifts by _1STSTAGESHIFT;
; the second pass reads the first-pass rows back and uses
; dct2nd_stage_rounding_mmx / _2NDSTAGESHIFT.
;
; Uses mm0..mm7, rax and rdx as scratch.  No emms is executed here.
;-----------------------------------------------------------------------------
sym(vp8_short_fdct4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; input
    mov         rdi, arg(1)                     ; output
    movsxd      rax, dword ptr arg(2)           ; pitch
    lea         rdx, [dct_matrix GLOBAL]

    ; load the four source rows
    movq        mm0, [rsi]
    movq        mm1, [rsi + rax]
    movq        mm2, [rsi + rax*2]
    lea         rsi, [rsi + rax*2]
    movq        mm3, [rsi + rax]

    ; ---- first pass: one result row per coefficient row ----
    SHORT_FDCT4X4_MMX_DOT4  0, dct1st_stage_rounding_mmx, _1STSTAGESHIFT
    movq        [rdi], mm4

    SHORT_FDCT4X4_MMX_DOT4  8, dct1st_stage_rounding_mmx, _1STSTAGESHIFT
    movq        [rdi + 8], mm4

    SHORT_FDCT4X4_MMX_DOT4 16, dct1st_stage_rounding_mmx, _1STSTAGESHIFT
    movq        [rdi + 16], mm4

    ; the last first-pass row is not written to memory: it goes straight into
    ; mm3, where the second pass expects its fourth source row
    SHORT_FDCT4X4_MMX_DOT4 24, dct1st_stage_rounding_mmx, _1STSTAGESHIFT
    movq        mm3, mm4

    ; ---- second pass: reload first-pass rows 0..2 as the new source ----
    movq        mm0, [rdi]
    movq        mm1, [rdi + 8]
    movq        mm2, [rdi + 16]

    SHORT_FDCT4X4_MMX_DOT4  0, dct2nd_stage_rounding_mmx, _2NDSTAGESHIFT
    movq        [rdi], mm4

    SHORT_FDCT4X4_MMX_DOT4  8, dct2nd_stage_rounding_mmx, _2NDSTAGESHIFT
    movq        [rdi + 8], mm4

    SHORT_FDCT4X4_MMX_DOT4 16, dct2nd_stage_rounding_mmx, _2NDSTAGESHIFT
    movq        [rdi + 16], mm4

    SHORT_FDCT4X4_MMX_DOT4 24, dct2nd_stage_rounding_mmx, _2NDSTAGESHIFT
    movq        [rdi + 24], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; FAST_FDCT4X4_MMX_PASS
;
; One pass of the fast 4x4 transform: transpose the 4x4 word block held in
; mm0..mm3, then run the 4-point butterfly across the four registers.
; In:   mm0..mm3 = block rows 0..3, rdx = dct_const_mmx
; Out:  mm0..mm3 = transform outputs 0..3 (one output index per register)
; Clobbers mm4..mm7.
;
; The multipliers are read from the table at rdx: +8 is c1 (60547), +16 is
; c2 (46341) and +24 is c3 (25080).  pmulhw is a SIGNED multiply, and c1 and
; c2 do not fit in a signed word, so the CPU sees them as c - 65536.  The
; paddw that follows each of those products adds the multiplicand back, which
; restores (x * c) >> 16.  c3 is below 32768 and needs no such correction.
;-----------------------------------------------------------------------------
%macro FAST_FDCT4X4_MMX_PASS 0
    ; transpose (lane notation "rc" = row r, column c of the incoming block)
    movq        mm4, mm0                ; 00 01 02 03
    movq        mm5, mm2                ; 20 21 22 23
    punpcklwd   mm0, mm1                ; 00 10 01 11
    punpckhwd   mm4, mm1                ; 02 12 03 13
    punpcklwd   mm2, mm3                ; 20 30 21 31
    punpckhwd   mm5, mm3                ; 22 32 23 33
    movq        mm1, mm0                ; 00 10 01 11
    punpckldq   mm0, mm2                ; 00 10 20 30
    punpckhdq   mm1, mm2                ; 01 11 21 31
    movq        mm2, mm4                ; 02 12 03 13
    punpckldq   mm2, mm5                ; 02 12 22 32
    punpckhdq   mm4, mm5                ; 03 13 23 33
    movq        mm3, mm4

    ; butterfly inputs
    movq        mm5, mm0
    movq        mm4, mm1
    paddw       mm0, mm3                ; a = in0 + in3
    paddw       mm1, mm2                ; b = in1 + in2
    psubw       mm4, mm2                ; c = in1 - in2
    psubw       mm5, mm3                ; d = in0 - in3

    ; outputs 0 and 2
    movq        mm6, [rdx + 16]         ; c2
    movq        mm2, mm0                ; a
    paddw       mm0, mm1                ; a + b
    psubw       mm2, mm1                ; a - b
    movq        mm1, mm0                ; a + b
    pmulhw      mm0, mm6
    paddw       mm0, mm1                ; out0 = ((a + b) * c2) >> 16
    pmulhw      mm6, mm2
    paddw       mm2, mm6                ; out2 = ((a - b) * c2) >> 16

    ; outputs 1 and 3
    movq        mm6, [rdx + 8]          ; c1
    movq        mm7, [rdx + 24]         ; c3
    movq        mm1, mm4                ; c
    movq        mm3, mm5                ; d
    pmulhw      mm1, mm7                ; (c * c3) >> 16
    pmulhw      mm3, mm6
    paddw       mm3, mm5                ; (d * c1) >> 16 after sign correction
    paddw       mm1, mm3                ; out1 = c*c3 + d*c1
    movq        mm3, mm4                ; c
    pmulhw      mm5, mm7                ; (d * c3) >> 16
    pmulhw      mm4, mm6
    paddw       mm3, mm4                ; (c * c1) >> 16 after sign correction
    psubw       mm5, mm3                ; out3 = d*c3 - c*c1
    movq        mm3, mm5
%endmacro

;-----------------------------------------------------------------------------
; void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
;
; Fast 4x4 forward transform in 16-bit arithmetic.
;   input   four rows of four signed 16-bit values, `pitch` BYTES apart
;   output  32 contiguous bytes (4 rows x 4 words)
;
; Every input sample is doubled first to gain one bit of precision.  Two
; transpose-plus-butterfly passes follow, and the result is brought back
; down with (x + 1) >> 1 (arithmetic shift) before it is stored.
;
; Uses mm0..mm7, rax, rcx and rdx as scratch.  No emms is executed here.
;-----------------------------------------------------------------------------
sym(vp8_fast_fdct4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; input
    mov         rdi, arg(1)                     ; output
    lea         rdx, [dct_const_mmx GLOBAL]
    movsxd      rax, dword ptr arg(2)           ; pitch
    lea         rcx, [rsi + rax*2]              ; rcx -> input row 2

    ; fetch the four input rows
    movq        mm0, [rsi]
    movq        mm1, [rsi + rax]
    movq        mm2, [rcx]
    movq        mm3, [rcx + rax]

    ; scale up by 2 for precision
    psllw       mm0, 1
    psllw       mm1, 1
    psllw       mm2, 1
    psllw       mm3, 1

    ; the two passes are identical; each one begins with a transpose, so the
    ; second pass works along the other dimension of the block
    FAST_FDCT4X4_MMX_PASS
    FAST_FDCT4X4_MMX_PASS

    ; final rounding: build a vector of word 1s, then (x + 1) >> 1
    pcmpeqw     mm4, mm4
    pcmpeqw     mm5, mm5
    psrlw       mm4, 15
    psrlw       mm5, 15

    paddw       mm0, mm4
    paddw       mm1, mm5
    paddw       mm2, mm4
    paddw       mm3, mm5

    psraw       mm0, 1
    psraw       mm1, 1
    psraw       mm2, 1
    psraw       mm3, 1

    movq        [rdi     ], mm0
    movq        [rdi +  8], mm1
    movq        [rdi + 16], mm2
    movq        [rdi + 24], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
;
; SSE2 version of the fast 4x4 forward transform that processes two
; horizontally adjacent 4x4 blocks (an 8-wide, 4-high strip) at once.
;   input   four rows of eight signed 16-bit values, `pitch` BYTES apart.
;           Rows are loaded with movdqa, so every row address must be
;           16-byte aligned.
;   output  64 contiguous bytes: the 4x4 result for columns 0..3 at
;           output+0..31, followed by the 4x4 result for columns 4..7 at
;           output+32..63.
;
; Every input sample is doubled first to gain one bit of precision.  Two
; transpose-plus-butterfly passes follow, and the result is brought back
; down with (x + 1) >> 1 (arithmetic shift) before it is stored.
;
; The multipliers come from dct_const_xmm: +16 is c1 (60547), +32 is
; c2 (46341) and +48 is c3 (25080).  pmulhw is a SIGNED multiply; c1 and c2
; do not fit in a signed word and are seen as c - 65536, so the paddw after
; each of those products adds the multiplicand back to get (x * c) >> 16.
;
; ABI: xmm6 and xmm7 are used as scratch.  The Microsoft x64 calling
; convention makes them callee-saved, so they are spilled to the stack and
; restored when assembling for that target; other targets are unaffected.
; Also uses xmm0..xmm5, rax, rcx and rdx as scratch.
;-----------------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__,x64
%define FDCT8X4_WMT_SAVE_XMM 1
%elifidn __OUTPUT_FORMAT__,win64
%define FDCT8X4_WMT_SAVE_XMM 1
%else
%define FDCT8X4_WMT_SAVE_XMM 0
%endif

sym(vp8_fast_fdct8x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
%if FDCT8X4_WMT_SAVE_XMM
    ; preserve the callee-saved vector registers this routine overwrites;
    ; movdqu is used because rsp is not known to be 16-byte aligned here
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp + 16], xmm7
%endif
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; input
    mov         rdi, arg(1)                     ; output
    lea         rdx, [dct_const_xmm GLOBAL]
    movsxd      rax, dword ptr arg(2)           ; pitch
    lea         rcx, [rsi + rax*2]              ; rcx -> input row 2

    ; read the input data: rows 0..3 land in xmm0, xmm2, xmm4, xmm3
    movdqa      xmm0, [rsi]
    movdqa      xmm2, [rsi + rax]
    movdqa      xmm4, [rcx]
    movdqa      xmm3, [rcx + rax]

    ; shift left by 1 for precision
    psllw       xmm0, 1
    psllw       xmm2, 1
    psllw       xmm4, 1
    psllw       xmm3, 1

    ; transpose both 4x4 halves (lane notation "rc" = row r, column c)
    movdqa      xmm1, xmm0              ; 00 01 02 03 04 05 06 07
    movdqa      xmm5, xmm4              ; 20 21 22 23 24 25 26 27
    punpcklwd   xmm0, xmm2              ; 00 10 01 11 02 12 03 13
    punpckhwd   xmm1, xmm2              ; 04 14 05 15 06 16 07 17
    punpcklwd   xmm4, xmm3              ; 20 30 21 31 22 32 23 33
    punpckhwd   xmm5, xmm3              ; 24 34 25 35 26 36 27 37
    movdqa      xmm2, xmm0              ; 00 10 01 11 02 12 03 13
    punpckldq   xmm0, xmm4              ; 00 10 20 30 01 11 21 31
    punpckhdq   xmm2, xmm4              ; 02 12 22 32 03 13 23 33
    movdqa      xmm4, xmm1              ; 04 14 05 15 06 16 07 17
    punpckldq   xmm4, xmm5              ; 04 14 24 34 05 15 25 35
    punpckhdq   xmm1, xmm5              ; 06 16 26 36 07 17 27 37
    movdqa      xmm3, xmm2              ; 02 12 22 32 03 13 23 33
    punpckhqdq  xmm3, xmm1              ; 03 13 23 33 07 17 27 37
    punpcklqdq  xmm2, xmm1              ; 02 12 22 32 06 16 26 36
    movdqa      xmm1, xmm0              ; 00 10 20 30 01 11 21 31
    punpcklqdq  xmm0, xmm4              ; 00 10 20 30 04 14 24 34
    punpckhqdq  xmm1, xmm4              ; 01 11 21 31 05 15 25 35

    ; xmm0..xmm3 now hold columns 0..3 of the left block in their low halves
    ; and columns 0..3 of the right block (strip columns 4..7) in their high
    ; halves, so the butterfly below combines samples within each input row

    ; first pass butterfly
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm1
    paddw       xmm0, xmm3              ; a = 0 + 3
    paddw       xmm1, xmm2              ; b = 1 + 2
    psubw       xmm4, xmm2              ; c = 1 - 2
    psubw       xmm5, xmm3              ; d = 0 - 3

    ; output 0 and 2
    movdqa      xmm6, [rdx + 32]        ; c2
    movdqa      xmm2, xmm0              ; a
    paddw       xmm0, xmm1              ; a + b
    psubw       xmm2, xmm1              ; a - b
    movdqa      xmm1, xmm0              ; a + b
    pmulhw      xmm0, xmm6
    paddw       xmm0, xmm1              ; output 0 = ((a + b) * c2) >> 16
    pmulhw      xmm6, xmm2
    paddw       xmm2, xmm6              ; output 2 = ((a - b) * c2) >> 16

    ; output 1 and 3
    movdqa      xmm6, [rdx + 16]        ; c1
    movdqa      xmm7, [rdx + 48]        ; c3
    movdqa      xmm1, xmm4              ; c
    movdqa      xmm3, xmm5              ; d
    pmulhw      xmm1, xmm7              ; c * c3
    pmulhw      xmm3, xmm6
    paddw       xmm3, xmm5              ; d * c1 (sign-corrected)
    paddw       xmm1, xmm3              ; output 1 = c*c3 + d*c1
    movdqa      xmm3, xmm4              ; c
    pmulhw      xmm5, xmm7              ; d * c3
    pmulhw      xmm4, xmm6
    paddw       xmm3, xmm4              ; c * c1 (sign-corrected)
    psubw       xmm5, xmm3              ; output 3 = d*c3 - c*c1
    movdqa      xmm3, xmm5
    ; done with the first pass

    ; transpose back for the second pass; the two extra moves put the rows
    ; into xmm0, xmm2, xmm4, xmm3, the layout the transpose sequence expects
    movdqa      xmm4, xmm2              ; 02 12 22 32 06 16 26 36
    movdqa      xmm2, xmm1              ; 01 11 21 31 05 15 25 35
    movdqa      xmm1, xmm0              ; 00 10 20 30 04 14 24 34
    movdqa      xmm5, xmm4              ; 02 12 22 32 06 16 26 36
    punpcklwd   xmm0, xmm2              ; 00 01 10 11 20 21 30 31
    punpckhwd   xmm1, xmm2              ; 04 05 14 15 24 25 34 35
    punpcklwd   xmm4, xmm3              ; 02 03 12 13 22 23 32 33
    punpckhwd   xmm5, xmm3              ; 06 07 16 17 26 27 36 37
    movdqa      xmm2, xmm0              ; 00 01 10 11 20 21 30 31
    punpckldq   xmm0, xmm4              ; 00 01 02 03 10 11 12 13
    punpckhdq   xmm2, xmm4              ; 20 21 22 23 30 31 32 33
    movdqa      xmm4, xmm1              ; 04 05 14 15 24 25 34 35
    punpckldq   xmm4, xmm5              ; 04 05 06 07 14 15 16 17
    punpckhdq   xmm1, xmm5              ; 24 25 26 27 34 35 36 37
    movdqa      xmm3, xmm2              ; 20 21 22 23 30 31 32 33
    punpckhqdq  xmm3, xmm1              ; 30 31 32 33 34 35 36 37
    punpcklqdq  xmm2, xmm1              ; 20 21 22 23 24 25 26 27
    movdqa      xmm1, xmm0              ; 00 01 02 03 10 11 12 13
    punpcklqdq  xmm0, xmm4              ; 00 01 02 03 04 05 06 07
    punpckhqdq  xmm1, xmm4              ; 10 11 12 13 14 15 16 17

    ; second pass butterfly: xmm0..xmm3 are rows again, so this pass
    ; combines samples down each column
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm1
    paddw       xmm0, xmm3              ; a = 0 + 3
    paddw       xmm1, xmm2              ; b = 1 + 2
    psubw       xmm4, xmm2              ; c = 1 - 2
    psubw       xmm5, xmm3              ; d = 0 - 3

    ; output 0 and 2
    movdqa      xmm6, [rdx + 32]        ; c2
    movdqa      xmm2, xmm0              ; a
    paddw       xmm0, xmm1              ; a + b
    psubw       xmm2, xmm1              ; a - b
    movdqa      xmm1, xmm0              ; a + b
    pmulhw      xmm0, xmm6
    paddw       xmm0, xmm1              ; output row 0
    pmulhw      xmm6, xmm2
    paddw       xmm2, xmm6              ; output row 2

    ; output 1 and 3
    movdqa      xmm6, [rdx + 16]        ; c1
    movdqa      xmm7, [rdx + 48]        ; c3
    movdqa      xmm1, xmm4              ; c
    movdqa      xmm3, xmm5              ; d
    pmulhw      xmm1, xmm7              ; c * c3
    pmulhw      xmm3, xmm6
    paddw       xmm3, xmm5              ; d * c1 (sign-corrected)
    paddw       xmm1, xmm3              ; output row 1
    movdqa      xmm3, xmm4              ; c
    pmulhw      xmm5, xmm7              ; d * c3
    pmulhw      xmm4, xmm6
    paddw       xmm3, xmm4              ; c * c1 (sign-corrected)
    psubw       xmm5, xmm3              ; output row 3
    movdqa      xmm3, xmm5
    ; done with the second pass

    ; final rounding: build vectors of word 1s, then (x + 1) >> 1
    pcmpeqw     xmm4, xmm4
    pcmpeqw     xmm5, xmm5
    psrlw       xmm4, 15
    psrlw       xmm5, 15
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    psraw       xmm0, 1
    psraw       xmm1, 1
    psraw       xmm2, 1
    psraw       xmm3, 1

    ; low halves = 4x4 result for strip columns 0..3
    movq        QWORD PTR[rdi   ], xmm0
    movq        QWORD PTR[rdi+ 8], xmm1
    movq        QWORD PTR[rdi+16], xmm2
    movq        QWORD PTR[rdi+24], xmm3

    ; high halves = 4x4 result for strip columns 4..7
    psrldq      xmm0, 8
    psrldq      xmm1, 8
    psrldq      xmm2, 8
    psrldq      xmm3, 8
    movq        QWORD PTR[rdi+32], xmm0
    movq        QWORD PTR[rdi+40], xmm1
    movq        QWORD PTR[rdi+48], xmm2
    movq        QWORD PTR[rdi+56], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
%if FDCT8X4_WMT_SAVE_XMM
    ; rsp is back at the spill area now that rdi/rsi have been popped
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp + 16]
    add         rsp, 32
%endif
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA

; static const unsigned int dct1st_stage_rounding_mmx[2]
; Added to each 32-bit sum before the first-pass shift of
; vp8_short_fdct4x4_mmx; 8192 is half of 1 << _1STSTAGESHIFT.
align 16
dct1st_stage_rounding_mmx:
    dd 8192, 8192

; static const unsigned int dct2nd_stage_rounding_mmx[2]
; Same role for the second pass; 32768 is half of 1 << _2NDSTAGESHIFT.
align 16
dct2nd_stage_rounding_mmx:
    dd 32768, 32768

; static const short dct_matrix[4][4]
; Coefficient rows used by vp8_short_fdct4x4_mmx, 8 bytes per row.
align 16
dct_matrix:
    dw  23170,  23170,  23170,  23170
    dw  30274,  12540, -12540, -30274
    dw  23170, -23170, -23170,  23170
    dw  12540, -30274,  30274, -12540

; static const unsigned short dct_const_mmx[4 * 4]
; One multiplier replicated across four word lanes per 8-byte row.
; vp8_fast_fdct4x4_mmx reads the rows at +8, +16 and +24; the zero row at +0
; is not read by the code in this file.
align 16
dct_const_mmx:
    dw     0,     0,     0,     0
    dw 60547, 60547, 60547, 60547
    dw 46341, 46341, 46341, 46341
    dw 25080, 25080, 25080, 25080

; static const unsigned short dct_const_xmm[8 * 4]
; Eight-lane copy of the table above (16 bytes per row) for
; vp8_fast_fdct8x4_wmt, which reads the rows at +16, +32 and +48.
align 16
dct_const_xmm:
    dw     0,     0,     0,     0,     0,     0,     0,     0
    dw 60547, 60547, 60547, 60547, 60547, 60547, 60547, 60547
    dw 46341, 46341, 46341, 46341, 46341, 46341, 46341, 46341
    dw 25080, 25080, 25080, 25080, 25080, 25080, 25080, 25080