;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

section .text
    global sym(vp8_short_fdct4x4_mmx)
    global sym(vp8_fast_fdct4x4_mmx)
    global sym(vp8_fast_fdct8x4_wmt)

%define DCTCONSTANTSBITS   (16)
%define DCTROUNDINGVALUE   (1 << (DCTCONSTANTSBITS - 1))
%define x_c1               (60547)          ; cos(pi  /8) * (1<<16)
%define x_c2               (46341)          ; cos(pi*2/8) * (1<<16)
%define x_c3               (25080)          ; cos(pi*3/8) * (1<<16)

%define _1STSTAGESHIFT 14
%define _2NDSTAGESHIFT 16

; using matrix multiply; the source and dest buffers have a pitch
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
sym(vp8_short_fdct4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output
    movsxd      rax, dword ptr arg(2)       ; pitch

    lea         rdx, [dct_matrix GLOBAL]

    movq        mm0, [rsi]
    movq        mm1, [rsi + rax]
    movq        mm2, [rsi + rax*2]
    lea         rsi, [rsi + rax*2]
    movq        mm3, [rsi + rax]

    ; first column
    movq        mm4, mm0
    movq        mm7, [rdx]
    pmaddwd     mm4, mm7
    movq        mm5, mm1
    pmaddwd     mm5, mm7
    movq        mm6, mm4
    punpckldq   mm4, mm5
    punpckhdq   mm6, mm5
    paddd       mm4, mm6

    movq        mm5, mm2
    pmaddwd     mm5, mm7
    movq        mm6, mm3
    pmaddwd     mm6, mm7
    movq        mm7, mm5
    punpckldq   mm5, mm6
    punpckhdq   mm7, mm6
    paddd       mm5, mm7

    movq        mm6, [dct1st_stage_rounding_mmx GLOBAL]
    paddd       mm4, mm6
    paddd       mm5, mm6
    psrad       mm4, _1STSTAGESHIFT
    psrad       mm5, _1STSTAGESHIFT
    packssdw    mm4, mm5
    movq        [rdi], mm4

    ; second column
    movq        mm4, mm0
    pmaddwd     mm4, [rdx+8]
    movq        mm5, mm1
    pmaddwd     mm5, [rdx+8]
    movq        mm6, mm4
    punpckldq   mm4, mm5
    punpckhdq   mm6, mm5
    paddd       mm4, mm6

    movq        mm5, mm2
    pmaddwd     mm5, [rdx+8]
    movq        mm6, mm3
    pmaddwd     mm6, [rdx+8]
    movq        mm7, mm5
    punpckldq   mm5, mm6
    punpckhdq   mm7, mm6
    paddd       mm5, mm7

    movq        mm6, [dct1st_stage_rounding_mmx GLOBAL]
    paddd       mm4, mm6
    paddd       mm5, mm6
    psrad       mm4, _1STSTAGESHIFT
    psrad       mm5, _1STSTAGESHIFT
    packssdw    mm4, mm5
    movq        [rdi+8], mm4

    ; third column
    movq        mm4, mm0
    pmaddwd     mm4, [rdx+16]
    movq        mm5, mm1
    pmaddwd     mm5, [rdx+16]
    movq        mm6, mm4
    punpckldq   mm4, mm5
    punpckhdq   mm6, mm5
    paddd       mm4, mm6

    movq        mm5, mm2
    pmaddwd     mm5, [rdx+16]
    movq        mm6, mm3
    pmaddwd     mm6, [rdx+16]
    movq        mm7, mm5
    punpckldq   mm5, mm6
    punpckhdq   mm7, mm6
    paddd       mm5, mm7

    movq        mm6, [dct1st_stage_rounding_mmx GLOBAL]
    paddd       mm4, mm6
    paddd       mm5, mm6
    psrad       mm4, _1STSTAGESHIFT
    psrad       mm5, _1STSTAGESHIFT
    packssdw    mm4, mm5
    movq        [rdi+16], mm4

    ; fourth column (the last column, so we do not have to save the source any more)
    pmaddwd     mm0, [rdx+24]
    pmaddwd     mm1, [rdx+24]
    movq        mm6, mm0
    punpckldq   mm0, mm1
    punpckhdq   mm6, mm1
    paddd       mm0, mm6

    pmaddwd     mm2, [rdx+24]
    pmaddwd     mm3, [rdx+24]
    movq        mm7, mm2
    punpckldq   mm2, mm3
    punpckhdq   mm7, mm3
    paddd       mm2, mm7

    movq        mm6, [dct1st_stage_rounding_mmx GLOBAL]
    paddd       mm0, mm6
    paddd       mm2, mm6
    psrad       mm0, _1STSTAGESHIFT
    psrad       mm2, _1STSTAGESHIFT
    packssdw    mm0, mm2

    movq        mm3, mm0
    ; done with one pass
    ; now start the second pass
    movq        mm0, [rdi]
    movq        mm1, [rdi+ 8]
    movq        mm2, [rdi+16]

    ; first column
    movq        mm4, mm0
    pmaddwd     mm4, [rdx]
    movq        mm5, mm1
    pmaddwd     mm5, [rdx]
    movq        mm6, mm4
    punpckldq   mm4, mm5
    punpckhdq   mm6, mm5
    paddd       mm4, mm6

    movq        mm5, mm2
    pmaddwd     mm5, [rdx]
    movq        mm6, mm3
    pmaddwd     mm6, [rdx]
    movq        mm7, mm5
    punpckldq   mm5, mm6
    punpckhdq   mm7, mm6
    paddd       mm5, mm7

    movq        mm6, [dct2nd_stage_rounding_mmx GLOBAL]
    paddd       mm4, mm6
    paddd       mm5, mm6
    psrad       mm4, _2NDSTAGESHIFT
    psrad       mm5, _2NDSTAGESHIFT
    packssdw    mm4, mm5
    movq        [rdi], mm4

    ; second column
    movq        mm4, mm0
    pmaddwd     mm4, [rdx+8]
    movq        mm5, mm1
    pmaddwd     mm5, [rdx+8]
    movq        mm6, mm4
    punpckldq   mm4, mm5
    punpckhdq   mm6, mm5
    paddd       mm4, mm6

    movq        mm5, mm2
    pmaddwd     mm5, [rdx+8]
    movq        mm6, mm3
    pmaddwd     mm6, [rdx+8]
    movq        mm7, mm5
    punpckldq   mm5, mm6
    punpckhdq   mm7, mm6
    paddd       mm5, mm7

    movq        mm6, [dct2nd_stage_rounding_mmx GLOBAL]
    paddd       mm4, mm6
    paddd       mm5, mm6
    psrad       mm4, _2NDSTAGESHIFT
    psrad       mm5, _2NDSTAGESHIFT
    packssdw    mm4, mm5
    movq        [rdi+8], mm4

    ; third column
    movq        mm4, mm0
    pmaddwd     mm4, [rdx+16]
    movq        mm5, mm1
    pmaddwd     mm5, [rdx+16]
    movq        mm6, mm4
    punpckldq   mm4, mm5
    punpckhdq   mm6, mm5
    paddd       mm4, mm6

    movq        mm5, mm2
    pmaddwd     mm5, [rdx+16]
    movq        mm6, mm3
    pmaddwd     mm6, [rdx+16]
    movq        mm7, mm5
    punpckldq   mm5, mm6
    punpckhdq   mm7, mm6
    paddd       mm5, mm7

    movq        mm6, [dct2nd_stage_rounding_mmx GLOBAL]
    paddd       mm4, mm6
    paddd       mm5, mm6
    psrad       mm4, _2NDSTAGESHIFT
    psrad       mm5, _2NDSTAGESHIFT
    packssdw    mm4, mm5
    movq        [rdi+16], mm4

    ; fourth column
    movq        mm4, mm0
    pmaddwd     mm4, [rdx+24]
    movq        mm5, mm1
    pmaddwd     mm5, [rdx+24]
    movq        mm6, mm4
    punpckldq   mm4, mm5
    punpckhdq   mm6, mm5
    paddd       mm4, mm6

    movq        mm5, mm2
    pmaddwd     mm5, [rdx+24]
    movq        mm6, mm3
    pmaddwd     mm6, [rdx+24]
    movq        mm7, mm5
    punpckldq   mm5, mm6
    punpckhdq   mm7, mm6
    paddd       mm5, mm7

    movq        mm6, [dct2nd_stage_rounding_mmx GLOBAL]
    paddd       mm4, mm6
    paddd       mm5, mm6
    psrad       mm4, _2NDSTAGESHIFT
    psrad       mm5, _2NDSTAGESHIFT
    packssdw    mm4, mm5
    movq        [rdi+24], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
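; For reference, a hedged C sketch of the two matrix-multiply passes above
; (illustrative only, not part of the build; dot() is a 4-term dot product,
; M[j] is row j of dct_matrix below, and the input pitch is ignored):
;
;     for (j = 0; j < 4; j++)
;         for (i = 0; i < 4; i++)             /* first pass  */
;             tmp[j][i] = (dot(input[i], M[j]) + 8192) >> _1STSTAGESHIFT;
;     for (j = 0; j < 4; j++)
;         for (i = 0; i < 4; i++)             /* second pass */
;             output[j][i] = (dot(tmp[i], M[j]) + 32768) >> _2NDSTAGESHIFT;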
;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
sym(vp8_fast_fdct4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output

    lea         rdx, [dct_const_mmx GLOBAL] ; get the constants
    movsxd      rax, dword ptr arg(2)       ; pitch

    lea         rcx, [rsi + rax*2]

    ; read the input data
    movq        mm0, [rsi]
    movq        mm1, [rsi + rax]
    movq        mm2, [rcx]
    movq        mm3, [rcx + rax]

    ; shift left by 1 bit for added precision
    paddw       mm0, mm0
    paddw       mm1, mm1
    psllw       mm2, 1
    psllw       mm3, 1

    ; transpose the input
    movq        mm4, mm0        ; 00 01 02 03
    movq        mm5, mm2        ; 20 21 22 23

    punpcklwd   mm0, mm1        ; 00 10 01 11
    punpckhwd   mm4, mm1        ; 02 12 03 13

    punpcklwd   mm2, mm3        ; 20 30 21 31
    punpckhwd   mm5, mm3        ; 22 32 23 33

    movq        mm1, mm0        ; 00 10 01 11
    punpckldq   mm0, mm2        ; 00 10 20 30
    punpckhdq   mm1, mm2        ; 01 11 21 31

    movq        mm2, mm4        ; 02 12 03 13
    punpckldq   mm2, mm5        ; 02 12 22 32
    punpckhdq   mm4, mm5        ; 03 13 23 33

    movq        mm3, mm4

    ; first stage
    movq        mm5, mm0
    movq        mm4, mm1

    paddw       mm0, mm3        ; a = 0 + 3
    paddw       mm1, mm2        ; b = 1 + 2
    psubw       mm4, mm2        ; c = 1 - 2
    psubw       mm5, mm3        ; d = 0 - 3

    ; output 0 and 2
    movq        mm6, [rdx + 16] ; c2
    movq        mm2, mm0        ; a

    paddw       mm0, mm1        ; a + b
    psubw       mm2, mm1        ; a - b

    movq        mm1, mm0        ; a + b
    pmulhw      mm0, mm6        ; 00 01 02 03
    paddw       mm0, mm1        ; output 00 01 02 03

    pmulhw      mm6, mm2        ; 20 21 22 23
    paddw       mm2, mm6        ; output 20 21 22 23

    ; output 1 and 3
    movq        mm6, [rdx +  8] ; c1
    movq        mm7, [rdx + 24] ; c3

    movq        mm1, mm4        ; c
    movq        mm3, mm5        ; d

    pmulhw      mm1, mm7        ; c * c3
    pmulhw      mm3, mm6        ; d * c1

    paddw       mm3, mm5        ; d * c1 rounded
    paddw       mm1, mm3        ; output 10 11 12 13

    movq        mm3, mm4        ; c
    pmulhw      mm5, mm7        ; d * c3
    pmulhw      mm4, mm6        ; c * c1

    paddw       mm3, mm4        ; c * c1 rounded
    psubw       mm5, mm3        ; output 30 31 32 33

    movq        mm3, mm5
    ; done with the vertical pass

    ; transpose for the second pass
    movq        mm4, mm0        ; 00 01 02 03
    movq        mm5, mm2        ; 20 21 22 23

    punpcklwd   mm0, mm1        ; 00 10 01 11
    punpckhwd   mm4, mm1        ; 02 12 03 13

    punpcklwd   mm2, mm3        ; 20 30 21 31
    punpckhwd   mm5, mm3        ; 22 32 23 33

    movq        mm1, mm0        ; 00 10 01 11
    punpckldq   mm0, mm2        ; 00 10 20 30
    punpckhdq   mm1, mm2        ; 01 11 21 31

    movq        mm2, mm4        ; 02 12 03 13
    punpckldq   mm2, mm5        ; 02 12 22 32
    punpckhdq   mm4, mm5        ; 03 13 23 33

    movq        mm3, mm4

    ; first stage
    movq        mm5, mm0
    movq        mm4, mm1

    paddw       mm0, mm3        ; a = 0 + 3
    paddw       mm1, mm2        ; b = 1 + 2
    psubw       mm4, mm2        ; c = 1 - 2
    psubw       mm5, mm3        ; d = 0 - 3

    ; output 0 and 2
    movq        mm6, [rdx + 16] ; c2
    movq        mm2, mm0        ; a

    paddw       mm0, mm1        ; a + b
    psubw       mm2, mm1        ; a - b

    movq        mm1, mm0        ; a + b
    pmulhw      mm0, mm6        ; 00 01 02 03
    paddw       mm0, mm1        ; output 00 01 02 03

    pmulhw      mm6, mm2        ; 20 21 22 23
    paddw       mm2, mm6        ; output 20 21 22 23

    ; output 1 and 3
    movq        mm6, [rdx +  8] ; c1
    movq        mm7, [rdx + 24] ; c3

    movq        mm1, mm4        ; c
    movq        mm3, mm5        ; d

    pmulhw      mm1, mm7        ; c * c3
    pmulhw      mm3, mm6        ; d * c1

    paddw       mm3, mm5        ; d * c1 rounded
    paddw       mm1, mm3        ; output 10 11 12 13

    movq        mm3, mm4        ; c
    pmulhw      mm5, mm7        ; d * c3
    pmulhw      mm4, mm6        ; c * c1

    paddw       mm3, mm4        ; c * c1 rounded
    psubw       mm5, mm3        ; output 30 31 32 33

    movq        mm3, mm5
    ; done with the horizontal pass

    ; round (add one) and halve to undo the initial doubling
    pcmpeqw     mm4, mm4
    pcmpeqw     mm5, mm5
    psrlw       mm4, 15
    psrlw       mm5, 15

    paddw       mm0, mm4
    paddw       mm1, mm5
    paddw       mm2, mm4
    paddw       mm3, mm5

    psraw       mm0, 1
    psraw       mm1, 1
    psraw       mm2, 1
    psraw       mm3, 1

    movq        [rdi   ], mm0
    movq        [rdi+ 8], mm1
    movq        [rdi+16], mm2
    movq        [rdi+24], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
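; A hedged scalar sketch of the 1-D pass used by vp8_fast_fdct4x4_mmx above
; and vp8_fast_fdct8x4_wmt below (illustrative only; pmulhw computes
; (x * k) >> 16, with k taken from dct_const_mmx / dct_const_xmm):
;
;     a = x0 + x3;  b = x1 + x2;  c = x1 - x2;  d = x0 - x3;
;     y0 = (a + b) + (((a + b) * 46341) >> 16);
;     y2 = (a - b) + (((a - b) * 46341) >> 16);
;     y1 = ((c * 25080) >> 16) + d + ((d * 60547) >> 16);
;     y3 = ((d * 25080) >> 16) - c - ((c * 60547) >> 16);
;
; The pass runs on the columns, then on the rows of the transposed result;
; the final add-one-and-shift halves the output to undo the input doubling.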
;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
sym(vp8_fast_fdct8x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output

    lea         rdx, [dct_const_xmm GLOBAL] ; get the constants
    movsxd      rax, dword ptr arg(2)       ; pitch

    lea         rcx, [rsi + rax*2]

    ; read the input data
    movdqa      xmm0, [rsi]
    movdqa      xmm2, [rsi + rax]
    movdqa      xmm4, [rcx]
    movdqa      xmm3, [rcx + rax]

    ; shift left by 1 bit for added precision
    psllw       xmm0, 1
    psllw       xmm2, 1
    psllw       xmm4, 1
    psllw       xmm3, 1

    ; transpose the input
    movdqa      xmm1, xmm0      ; 00 01 02 03 04 05 06 07
    movdqa      xmm5, xmm4      ; 20 21 22 23 24 25 26 27

    punpcklwd   xmm0, xmm2      ; 00 10 01 11 02 12 03 13
    punpckhwd   xmm1, xmm2      ; 04 14 05 15 06 16 07 17

    punpcklwd   xmm4, xmm3      ; 20 30 21 31 22 32 23 33
    punpckhwd   xmm5, xmm3      ; 24 34 25 35 26 36 27 37

    movdqa      xmm2, xmm0      ; 00 10 01 11 02 12 03 13
    punpckldq   xmm0, xmm4      ; 00 10 20 30 01 11 21 31
    punpckhdq   xmm2, xmm4      ; 02 12 22 32 03 13 23 33

    movdqa      xmm4, xmm1      ; 04 14 05 15 06 16 07 17
    punpckldq   xmm4, xmm5      ; 04 14 24 34 05 15 25 35
    punpckhdq   xmm1, xmm5      ; 06 16 26 36 07 17 27 37

    movdqa      xmm3, xmm2      ; 02 12 22 32 03 13 23 33
    punpckhqdq  xmm3, xmm1      ; 03 13 23 33 07 17 27 37
    punpcklqdq  xmm2, xmm1      ; 02 12 22 32 06 16 26 36

    movdqa      xmm1, xmm0      ; 00 10 20 30 01 11 21 31
    punpcklqdq  xmm0, xmm4      ; 00 10 20 30 04 14 24 34
    punpckhqdq  xmm1, xmm4      ; 01 11 21 31 05 15 25 35

    ; xmm0 0
    ; xmm1 1
    ; xmm2 2
    ; xmm3 3

    ; first stage
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm1

    paddw       xmm0, xmm3      ; a = 0 + 3
    paddw       xmm1, xmm2      ; b = 1 + 2
    psubw       xmm4, xmm2      ; c = 1 - 2
    psubw       xmm5, xmm3      ; d = 0 - 3

    ; output 0 and 2
    movdqa      xmm6, [rdx + 32] ; c2
    movdqa      xmm2, xmm0       ; a

    paddw       xmm0, xmm1       ; a + b
    psubw       xmm2, xmm1       ; a - b

    movdqa      xmm1, xmm0       ; a + b
    pmulhw      xmm0, xmm6       ; 00 01 02 03
    paddw       xmm0, xmm1       ; output 00 01 02 03

    pmulhw      xmm6, xmm2       ; 20 21 22 23
    paddw       xmm2, xmm6       ; output 20 21 22 23

    ; output 1 and 3
    movdqa      xmm6, [rdx + 16] ; c1
    movdqa      xmm7, [rdx + 48] ; c3

    movdqa      xmm1, xmm4       ; c
    movdqa      xmm3, xmm5       ; d

    pmulhw      xmm1, xmm7       ; c * c3
    pmulhw      xmm3, xmm6       ; d * c1

    paddw       xmm3, xmm5       ; d * c1 rounded
    paddw       xmm1, xmm3       ; output 10 11 12 13

    movdqa      xmm3, xmm4       ; c
    pmulhw      xmm5, xmm7       ; d * c3
    pmulhw      xmm4, xmm6       ; c * c1

    paddw       xmm3, xmm4       ; c * c1 rounded
    psubw       xmm5, xmm3       ; output 30 31 32 33

    movdqa      xmm3, xmm5
    ; done with the vertical pass

    ; transpose for the second pass
    movdqa      xmm4, xmm2      ; 02 12 22 32 06 16 26 36
    movdqa      xmm2, xmm1      ; 01 11 21 31 05 15 25 35

    movdqa      xmm1, xmm0      ; 00 10 20 30 04 14 24 34
    movdqa      xmm5, xmm4      ; 02 12 22 32 06 16 26 36

    punpcklwd   xmm0, xmm2      ; 00 01 10 11 20 21 30 31
    punpckhwd   xmm1, xmm2      ; 04 05 14 15 24 25 34 35

    punpcklwd   xmm4, xmm3      ; 02 03 12 13 22 23 32 33
    punpckhwd   xmm5, xmm3      ; 06 07 16 17 26 27 36 37

    movdqa      xmm2, xmm0      ; 00 01 10 11 20 21 30 31
    punpckldq   xmm0, xmm4      ; 00 01 02 03 10 11 12 13
    punpckhdq   xmm2, xmm4      ; 20 21 22 23 30 31 32 33

    movdqa      xmm4, xmm1      ; 04 05 14 15 24 25 34 35
    punpckldq   xmm4, xmm5      ; 04 05 06 07 14 15 16 17
    punpckhdq   xmm1, xmm5      ; 24 25 26 27 34 35 36 37

    movdqa      xmm3, xmm2      ; 20 21 22 23 30 31 32 33
    punpckhqdq  xmm3, xmm1      ; 30 31 32 33 34 35 36 37
    punpcklqdq  xmm2, xmm1      ; 20 21 22 23 24 25 26 27

    movdqa      xmm1, xmm0      ; 00 01 02 03 10 11 12 13
    punpcklqdq  xmm0, xmm4      ; 00 01 02 03 04 05 06 07
    punpckhqdq  xmm1, xmm4      ; 10 11 12 13 14 15 16 17

    ; first stage
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm1

    paddw       xmm0, xmm3      ; a = 0 + 3
    paddw       xmm1, xmm2      ; b = 1 + 2
    psubw       xmm4, xmm2      ; c = 1 - 2
    psubw       xmm5, xmm3      ; d = 0 - 3

    ; output 0 and 2
    movdqa      xmm6, [rdx + 32] ; c2
    movdqa      xmm2, xmm0       ; a

    paddw       xmm0, xmm1       ; a + b
    psubw       xmm2, xmm1       ; a - b

    movdqa      xmm1, xmm0       ; a + b
    pmulhw      xmm0, xmm6       ; 00 01 02 03
    paddw       xmm0, xmm1       ; output 00 01 02 03

    pmulhw      xmm6, xmm2       ; 20 21 22 23
    paddw       xmm2, xmm6       ; output 20 21 22 23

    ; output 1 and 3
    movdqa      xmm6, [rdx + 16] ; c1
    movdqa      xmm7, [rdx + 48] ; c3

    movdqa      xmm1, xmm4       ; c
    movdqa      xmm3, xmm5       ; d

    pmulhw      xmm1, xmm7       ; c * c3
    pmulhw      xmm3, xmm6       ; d * c1

    paddw       xmm3, xmm5       ; d * c1 rounded
    paddw       xmm1, xmm3       ; output 10 11 12 13

    movdqa      xmm3, xmm4       ; c
    pmulhw      xmm5, xmm7       ; d * c3
    pmulhw      xmm4, xmm6       ; c * c1

    paddw       xmm3, xmm4       ; c * c1 rounded
    psubw       xmm5, xmm3       ; output 30 31 32 33

    movdqa      xmm3, xmm5
    ; done with the horizontal pass

    ; round (add one) and halve to undo the initial doubling
    pcmpeqw     xmm4, xmm4
    pcmpeqw     xmm5, xmm5
    psrlw       xmm4, 15
    psrlw       xmm5, 15

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    psraw       xmm0, 1
    psraw       xmm1, 1
    psraw       xmm2, 1
    psraw       xmm3, 1

    ; store the low qwords (left 4x4 block) ...
    movq        QWORD PTR [rdi   ], xmm0
    movq        QWORD PTR [rdi+ 8], xmm1
    movq        QWORD PTR [rdi+16], xmm2
    movq        QWORD PTR [rdi+24], xmm3

    psrldq      xmm0, 8
    psrldq      xmm1, 8
    psrldq      xmm2, 8
    psrldq      xmm3, 8

    ; ... then the high qwords (right 4x4 block)
    movq        QWORD PTR [rdi+32], xmm0
    movq        QWORD PTR [rdi+40], xmm1
    movq        QWORD PTR [rdi+48], xmm2
    movq        QWORD PTR [rdi+56], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
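; Note on vp8_fast_fdct8x4_wmt's output layout: each xmm register above holds
; one transformed row of two adjacent 4x4 blocks, so the stores split the
; result into the left block at output[0..15] and the right block at
; output[16..31] (in shorts).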
SECTION_RODATA
;static const unsigned int dct1st_stage_rounding_mmx[2] =
align 16
dct1st_stage_rounding_mmx:
    times 2 dd 8192

;static const unsigned int dct2nd_stage_rounding_mmx[2] =
align 16
dct2nd_stage_rounding_mmx:
    times 2 dd 32768

;static const short dct_matrix[4][4] =
align 16
dct_matrix:
    times 4 dw 23170
    dw  30274
    dw  12540
    dw -12540
    dw -30274
    dw  23170
    times 2 dw -23170
    dw  23170
    dw  12540
    dw -30274
    dw  30274
    dw -12540

;static const unsigned short dct_const_mmx[4 * 4] =
align 16
dct_const_mmx:
    times 4 dw 0
    times 4 dw 60547
    times 4 dw 46341
    times 4 dw 25080

;static const unsigned short dct_const_xmm[8 * 4] =
align 16
dct_const_xmm:
    times 8 dw 0
    times 8 dw 60547
    times 8 dw 46341
    times 8 dw 25080