vpx/vp8/encoder/x86/subtract_sse2.asm
Yunqing Wang 6614563b8f Save XMM registers in asm functions
XMM6/7 are used in these functions, and need to be saved.

Change-Id: I3dfaddaf2a69cd4bf8e8735c7064b17bac5a14e5
2010-10-28 16:59:03 -04:00

357 lines
13 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------------
;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; Computes a 4x4 block of 16-bit residuals:
;     diff[row][col] = z[row][col] - Predictor[row][col]
;
;   z          source pixels, rows src_stride bytes apart
;   Predictor  predicted pixels, rows pitch bytes apart
;   diff       output, rows pitch shorts (pitch*2 bytes) apart
;
; Each row reads 4 bytes from z and from Predictor (movd) and writes 4 shorts
; to diff (movq); none of these accesses requires alignment.
;
; XMM registers are used instead of MMX registers, so the routine leaves the
; x87/MMX state untouched and needs no EMMS. Only xmm0-xmm2 are written.
; Every instruction used is SSE2, like the other routines in this file.
;
; Register roles: rsi = z, rax = Predictor, rdi = diff,
;                 rdx = src_stride, rcx = pitch (later 3*pitch),
;                 xmm2 = zero (for byte->word widening).
;-----------------------------------------------------------------------------
global sym(vp8_subtract_b_sse2_impl)
sym(vp8_subtract_b_sse2_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi, arg(2)                     ; diff
    mov         rax, arg(3)                     ; Predictor
    mov         rsi, arg(0)                     ; z
    movsxd      rdx, dword ptr arg(1)           ; src_stride
    movsxd      rcx, dword ptr arg(4)           ; pitch

    pxor        xmm2, xmm2                      ; zero: high byte for unsigned widening

    ; row 0
    movd        xmm0, [rsi]
    movd        xmm1, [rax]
    punpcklbw   xmm0, xmm2                      ; 4 source bytes -> 4 words
    punpcklbw   xmm1, xmm2                      ; 4 predictor bytes -> 4 words
    psubw       xmm0, xmm1
    movq        MMWORD PTR [rdi], xmm0          ; store the low 4 words only

    ; row 1
    movd        xmm0, [rsi+rdx]
    movd        xmm1, [rax+rcx]
    punpcklbw   xmm0, xmm2
    punpcklbw   xmm1, xmm2
    psubw       xmm0, xmm1
    movq        MMWORD PTR [rdi+rcx*2], xmm0    ; diff + 1*pitch shorts

    ; row 2
    movd        xmm0, [rsi+rdx*2]
    movd        xmm1, [rax+rcx*2]
    punpcklbw   xmm0, xmm2
    punpcklbw   xmm1, xmm2
    psubw       xmm0, xmm1
    movq        MMWORD PTR [rdi+rcx*4], xmm0    ; diff + 2*pitch shorts

    ; row 3: x86 addressing has no *3 scale, so advance z by two rows and
    ; turn rcx into 3*pitch.
    lea         rsi, [rsi+rdx*2]
    lea         rcx, [rcx+rcx*2]

    movd        xmm0, [rsi+rdx]                 ; z + 3*src_stride
    movd        xmm1, [rax+rcx]                 ; Predictor + 3*pitch
    punpcklbw   xmm0, xmm2
    punpcklbw   xmm1, xmm2
    psubw       xmm0, xmm1
    movq        MMWORD PTR [rdi+rcx*2], xmm0    ; diff + 3*pitch shorts

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
;
; Writes the 16x16 residual diff = src - pred as signed 16-bit values.
;
;   src   16 rows of 16 bytes, rows `stride` bytes apart
;   pred  16 rows of 16 bytes, stored back to back (16 bytes per row)
;   diff  16 rows of 16 shorts, stored back to back (32 bytes per row)
;
; Every memory access is a movdqa, so src, pred and diff must be 16-byte
; aligned and stride must be a multiple of 16.
;
; Method: psubb gives the low byte of each difference. The high byte must be
; 0xFF where pred > src (unsigned) and 0x00 elsewhere. pcmpgtb is a signed
; compare, so both operands are first XORed with 0x80 to make it order the
; unsigned values correctly. Interleaving low bytes with that mask yields the
; sign-extended words.
;
; Register roles: rsi = src row, rax = pred row, rdi = diff row,
;                 rdx = stride, rcx = row pairs remaining,
;                 xmm3 = 0x80 bias constant (loaded once),
;                 xmm0-xmm2 = scratch.
; Only xmm0-xmm3 are written, so no XMM6/XMM7 save/restore is needed.
;-----------------------------------------------------------------------------
global sym(vp8_subtract_mby_sse2)
sym(vp8_subtract_mby_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(1)                     ; src
    mov         rdi, arg(0)                     ; diff
    mov         rax, arg(2)                     ; pred
    movsxd      rdx, dword ptr arg(3)           ; stride

    movdqa      xmm3, XMMWORD PTR [GLOBAL(t80)] ; loop-invariant bias constant
    mov         rcx, 8                          ; 16 rows, two per iteration

.next_row_pair:
    ; ---- first row of the pair ----
    movdqa      xmm0, XMMWORD PTR [rsi]         ; src
    movdqa      xmm1, XMMWORD PTR [rax]         ; pred
    movdqa      xmm2, xmm0
    psubb       xmm0, xmm1                      ; low byte of src - pred
    pxor        xmm1, xmm3                      ; bias both operands so the signed
    pxor        xmm2, xmm3                      ;   compare behaves as unsigned
    pcmpgtb     xmm1, xmm2                      ; 0xFF where pred > src
    movdqa      xmm2, xmm0
    punpcklbw   xmm0, xmm1                      ; words 0-7  (xmm1 is not modified)
    punpckhbw   xmm2, xmm1                      ; words 8-15
    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm2

    ; ---- second row of the pair ----
    movdqa      xmm0, XMMWORD PTR [rsi + rdx]   ; src, next row
    movdqa      xmm1, XMMWORD PTR [rax + 16]    ; pred, next row
    movdqa      xmm2, xmm0
    psubb       xmm0, xmm1
    pxor        xmm1, xmm3
    pxor        xmm2, xmm3
    pcmpgtb     xmm1, xmm2
    movdqa      xmm2, xmm0
    punpcklbw   xmm0, xmm1
    punpckhbw   xmm2, xmm1
    movdqa      XMMWORD PTR [rdi + 32], xmm0
    movdqa      XMMWORD PTR [rdi + 48], xmm2

    add         rdi, 64                         ; two diff rows  (2 * 16 shorts)
    add         rax, 32                         ; two pred rows  (2 * 16 bytes)
    lea         rsi, [rsi + rdx*2]              ; two src rows

    sub         rcx, 1
    jnz         .next_row_pair

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; SUBTRACT_MBUV_ROW_PAIR row_a, row_b, n
;
; Private helper for vp8_subtract_mbuv_sse2: subtracts two 8-pixel rows.
;   %1  address expression of the first source row  (8 bytes, movq)
;   %2  address expression of the second source row (8 bytes, movq)
;   %3  row-pair index n: predictor is read at [rax + 16*n] and the 16 output
;       shorts are written at [rdi + 32*n]
;
; The two source rows are packed into one register and processed together:
; psubb gives the low byte of each difference, and a compare on the 0x80-biased
; operands gives 0xFF where pred > src, which becomes the high byte.
; Writes xmm0-xmm3. Needs GLOBAL() to be usable at the point of expansion.
;-----------------------------------------------------------------------------
%macro SUBTRACT_MBUV_ROW_PAIR 3
    movq        xmm0, MMWORD PTR [%1]               ; src, first row
    movq        xmm2, MMWORD PTR [%2]               ; src, second row
    movdqa      xmm1, XMMWORD PTR [rax + 16*%3]     ; pred, both rows
    punpcklqdq  xmm0, xmm2                          ; xmm0 = second row : first row
    movdqa      xmm2, xmm0
    psubb       xmm0, xmm1                          ; low byte of src - pred
    pxor        xmm1, [GLOBAL(t80)]                 ; bias so the signed compare
    pxor        xmm2, [GLOBAL(t80)]                 ;   orders unsigned values
    pcmpgtb     xmm1, xmm2                          ; 0xFF where pred > src
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1
    punpcklbw   xmm0, xmm1                          ; first row as 8 signed words
    punpckhbw   xmm2, xmm3                          ; second row as 8 signed words
    movdqa      XMMWORD PTR [rdi + 32*%3], xmm0
    movdqa      XMMWORD PTR [rdi + 32*%3 + 16], xmm2
%endmacro

;-----------------------------------------------------------------------------
;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
;
; Writes the residuals of the two 8x8 chroma planes as signed 16-bit values.
;   U plane: usrc rows minus pred[256..319] -> diff[256..319]
;   V plane: vsrc rows minus pred[320..383] -> diff[320..383]
;
; usrc/vsrc rows are `stride` bytes apart and are read 8 bytes at a time with
; movq. pred and diff are accessed with movdqa, so pred+256 and diff+256
; shorts must be 16-byte aligned.
;
; Register roles: rsi = current source row, rax = predictor, rdi = diff,
;                 rdx = stride, rcx = 3*stride.
;-----------------------------------------------------------------------------
global sym(vp8_subtract_mbuv_sse2)
sym(vp8_subtract_mbuv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi, arg(0)                     ; diff
    mov         rax, arg(3)                     ; pred
    mov         rsi, arg(1)                     ; usrc
    add         rdi, 256*2                      ; skip 256 shorts of diff
    add         rax, 256                        ; skip 256 bytes of pred
    movsxd      rdx, dword ptr arg(4)           ; stride
    lea         rcx, [rdx + rdx*2]              ; 3*stride

    ; ---- U plane ----
    SUBTRACT_MBUV_ROW_PAIR rsi,       rsi+rdx, 0    ; rows 0,1
    SUBTRACT_MBUV_ROW_PAIR rsi+rdx*2, rsi+rcx, 1    ; rows 2,3
    lea         rsi, [rsi + rdx*4]                  ; advance to row 4
    SUBTRACT_MBUV_ROW_PAIR rsi,       rsi+rdx, 2    ; rows 4,5
    SUBTRACT_MBUV_ROW_PAIR rsi+rdx*2, rsi+rcx, 3    ; rows 6,7

    ; ---- V plane ----
    mov         rsi, arg(2)                     ; vsrc
    add         rdi, 64*2                       ; diff is now 320 shorts past the start
    add         rax, 64                         ; pred is now 320 bytes past the start

    SUBTRACT_MBUV_ROW_PAIR rsi,       rsi+rdx, 0    ; rows 0,1
    SUBTRACT_MBUV_ROW_PAIR rsi+rdx*2, rsi+rcx, 1    ; rows 2,3
    lea         rsi, [rsi + rdx*4]                  ; advance to row 4
    SUBTRACT_MBUV_ROW_PAIR rsi,       rsi+rdx, 2    ; rows 4,5
    SUBTRACT_MBUV_ROW_PAIR rsi+rdx*2, rsi+rcx, 3    ; rows 6,7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
; Read-only constants used by the SSE2 subtract routines in this file.
SECTION_RODATA
; Aligned to 16 bytes because t80 is used as a 128-bit memory operand.
align 16
; t80: sixteen bytes of 0x80. XORing a byte vector with it flips the top bit of
; every byte, so that a following signed pcmpgtb compares the original values
; as if they were unsigned.
t80:
times 16 db 0x80