vpx/vp8/common/x86/recon_sse2.asm
Johann 4a2b684ef4 modify SAVE_XMM for potential 64bit use
The Win64 ABI requires saving and restoring xmm6:xmm15. Currently
SAVE_XMM and RESTORE_XMM only allow for saving xmm6:xmm7. Allow
specifying the highest register used and whether the stack is unaligned.

Change-Id: Ica5699622ffe3346d3a486f48eef0206c51cf867
2011-04-19 10:42:45 -04:00

232 lines
6.8 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d,
;                       int stride)
;
; For each of 4 rows: widen 8 bytes from s to words, add 8 words from q
; with signed saturation, clamp the sums to 0..255 and store 8 bytes to d.
;
;   s      - read at byte offsets 0, 8, 16, 24
;   q      - read at byte offsets 0, 16, 32, 48, directly as the memory
;            operand of paddsw
;   d      - rows written at d, d+stride, d+2*stride, d+3*stride
;   stride - 32-bit value, sign-extended before use
;
; Register roles: rsi = s, rdx = q, rdi = d, rax = stride, xmm0 = zero.
; Writes rax, rdx, xmm0-xmm4 and flags; rsi and rdi are pushed in the
; prolog and popped in the epilog.
;-----------------------------------------------------------------------
global sym(vp8_recon2b_sse2)
sym(vp8_recon2b_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm0, xmm0                      ; zero: unpack partner and pack filler
    mov         rsi, arg(0)                     ; s
    mov         rdx, arg(1)                     ; q
    mov         rdi, arg(2)                     ; d
    movsxd      rax, dword ptr arg(3)           ; stride

    ; row 0 -> [d]
    movq        xmm1, MMWORD PTR [rsi]
    punpcklbw   xmm1, xmm0                      ; 8 bytes -> 8 words
    paddsw      xmm1, XMMWORD PTR [rdx]
    packuswb    xmm1, xmm0                      ; clamp each word to 0..255
    movq        MMWORD PTR [rdi], xmm1

    ; row 1 -> [d + stride]
    movq        xmm2, MMWORD PTR [rsi+8]
    punpcklbw   xmm2, xmm0
    paddsw      xmm2, XMMWORD PTR [rdx+16]
    packuswb    xmm2, xmm0
    movq        MMWORD PTR [rdi+rax], xmm2

    ; row 2 -> [d + stride*2]
    movq        xmm3, MMWORD PTR [rsi+16]
    punpcklbw   xmm3, xmm0
    paddsw      xmm3, XMMWORD PTR [rdx+32]
    packuswb    xmm3, xmm0
    movq        MMWORD PTR [rdi+rax*2], xmm3

    ; row 3 -> [d + stride*3], addressed as (d + stride) + stride*2
    movq        xmm4, MMWORD PTR [rsi+24]
    punpcklbw   xmm4, xmm0
    paddsw      xmm4, XMMWORD PTR [rdx+48]
    packuswb    xmm4, xmm0
    add         rdi, rax
    movq        MMWORD PTR [rdi+rax*2], xmm4

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d,
;                       int stride)
;
; For each of 4 rows: widen 16 bytes from s to two vectors of 8 words,
; add 16 words from q with signed saturation, clamp the sums to 0..255
; and store 16 bytes to d.
;
;   s      - read with movdqa at byte offsets 0, 16, 32, 48
;   q      - read at byte offsets 0, 16, ..., 112, directly as the memory
;            operand of paddsw
;   d      - rows written with movdqa at d, d+stride, d+2*stride,
;            d+3*stride
;   stride - 32-bit value, sign-extended before use
;
; movdqa is used for every access to s and d, so those addresses have to
; be 16-byte aligned.
;
; Register roles: rsi = s, rdx = q, rdi = d, rax = stride, xmm0 = zero.
; xmm6 and xmm7 are used as scratch, which is why the body is bracketed
; by SAVE_XMM 7 / RESTORE_XMM.
;-----------------------------------------------------------------------
global sym(vp8_recon4b_sse2)
sym(vp8_recon4b_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm0, xmm0                      ; zero: unpack partner
    mov         rsi, arg(0)                     ; s
    mov         rdx, arg(1)                     ; q
    mov         rdi, arg(2)                     ; d
    movsxd      rax, dword ptr arg(3)           ; stride

    ; row 0 -> [d]
    movdqa      xmm1, XMMWORD PTR [rsi]
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm0                      ; low 8 bytes -> words
    paddsw      xmm1, XMMWORD PTR [rdx]
    punpckhbw   xmm5, xmm0                      ; high 8 bytes -> words
    paddsw      xmm5, XMMWORD PTR [rdx+16]
    packuswb    xmm1, xmm5                      ; clamp each word to 0..255
    movdqa      XMMWORD PTR [rdi], xmm1

    ; row 1 -> [d + stride]
    movdqa      xmm2, XMMWORD PTR [rsi+16]
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm0
    paddsw      xmm2, XMMWORD PTR [rdx+32]
    punpckhbw   xmm6, xmm0
    paddsw      xmm6, XMMWORD PTR [rdx+48]
    packuswb    xmm2, xmm6
    movdqa      XMMWORD PTR [rdi+rax], xmm2

    ; row 2 -> [d + stride*2]
    movdqa      xmm3, XMMWORD PTR [rsi+32]
    movdqa      xmm7, xmm3
    punpcklbw   xmm3, xmm0
    paddsw      xmm3, XMMWORD PTR [rdx+64]
    punpckhbw   xmm7, xmm0
    paddsw      xmm7, XMMWORD PTR [rdx+80]
    packuswb    xmm3, xmm7
    movdqa      XMMWORD PTR [rdi+rax*2], xmm3

    ; row 3 -> [d + stride*3], addressed as (d + stride) + stride*2
    movdqa      xmm4, XMMWORD PTR [rsi+48]
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm0
    paddsw      xmm4, XMMWORD PTR [rdx+96]
    punpckhbw   xmm5, xmm0
    paddsw      xmm5, XMMWORD PTR [rdx+112]
    packuswb    xmm4, xmm5
    add         rdi, rax
    movdqa      XMMWORD PTR [rdi+rax*2], xmm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_copy_mem16x16_sse2(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies 16 rows of 16 bytes from src to dst. Row n is read from
; src + n*src_stride and written to dst + n*dst_stride; both strides are
; 32-bit values, sign-extended before use.
;
; Source rows are loaded with movdqu (no alignment requirement).
; Destination rows are stored with movdqa, so every destination row
; address has to be 16-byte aligned.
;
; Rows are moved in batches of three (3 loads, then 3 stores); rows 0-14
; take five batches and row 15 is copied on its own at the end.
;
; Register roles: rsi = current src row, rax = src_stride,
;                 rdi = current dst row, rcx = dst_stride,
;                 xmm0-xmm2 / xmm3-xmm5 = alternating row buffers.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2)
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src
    movsxd      rax, dword ptr arg(1)           ; src_stride
    mov         rdi, arg(2)                     ; dst
    movsxd      rcx, dword ptr arg(3)           ; dst_stride

    ; rows 0-2
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi+rax]
    movdqu      xmm2, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax                        ; src row += 3
    movdqa      [rdi], xmm0
    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx                        ; dst row += 3

    ; rows 3-5
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi+rax]
    movdqu      xmm5, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax
    movdqa      [rdi], xmm3
    movdqa      [rdi+rcx], xmm4
    movdqa      [rdi+rcx*2], xmm5
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 6-8
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi+rax]
    movdqu      xmm2, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax
    movdqa      [rdi], xmm0
    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 9-11
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi+rax]
    movdqu      xmm5, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    add         rsi, rax
    movdqa      [rdi], xmm3
    movdqa      [rdi+rcx], xmm4
    movdqa      [rdi+rcx*2], xmm5
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 12-14; the pointers are only advanced by two rows here, so
    ; afterwards rsi and rdi sit on row 14
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi+rax]
    movdqu      xmm2, [rsi+rax*2]
    lea         rsi, [rsi+rax*2]
    movdqa      [rdi], xmm0
    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2

    ; row 15 = row 14 + one stride
    movdqu      xmm3, [rsi+rax]
    lea         rdi, [rdi+rcx*2]
    movdqa      [rdi+rcx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret