; vpx/vp8/common/x86/mask_sse3.asm
;
; 485 lines
; 12 KiB
; NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------------
;void vp8_makemask_sse3(
;    unsigned char *y,      arg(0)  y source rows
;    unsigned char *u,      arg(1)  u source rows
;    unsigned char *v,      arg(2)  v source rows
;    unsigned char *ym,     arg(3)  output mask (written with movdqa, so it
;                                   must be 16-byte aligned)
;    int yp,                arg(4)  added to the y pointer after every y row
;    int uvp,               arg(5)  added to the u and v pointers per u/v row
;    int ys,                arg(6)  center y value
;    int us,                arg(7)  center u value
;    int vs,                arg(8)  center v value
;    int yt,                arg(9)  y tolerance
;    int ut,                arg(10) u tolerance
;    int vt)                arg(11) v tolerance
;
; Writes 16 rows x 16 bytes to ym. Each loop iteration handles two y rows
; and one u/v row. A mask byte is 0xFF when
;     |y - ys| < yt  and  |u - us| < ut  and  |v - vs| < vt
; for the y sample and the u/v sample that covers it (every u/v result is
; doubled horizontally and reused for both y rows), else 0x00.
;
; No return value is set up; rax holds the advanced ym pointer on exit.
;
; NOTE(review): the previous prototype comment listed an extra
;   `unsigned char *uvm` between ym and yp (13 parameters). The code reads
;   yp from arg(4) and never reads a uvm pointer, so the list above follows
;   the code. Confirm against the C declaration.
; NOTE(review): pcmpgtb is a *signed* byte compare, so an absolute
;   difference >= 128 compares as negative and passes the tolerance test.
;   Confirm whether that is intended.
; NOTE(review): pshufb is an SSSE3 instruction despite the _sse3 suffix.
; NOTE(review): xmm6-xmm11 are used and not saved; they are callee-saved
;   in the Win64 ABI.
; NOTE(review): the u and v loads read 16 bytes although only the low 8
;   bytes of each are used.
;-----------------------------------------------------------------------------
global sym(vp8_makemask_sse3)
sym(vp8_makemask_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 14
    push        rsi
    push        rdi
    push        rbx                         ; callee-saved; used below for yp
    ; end prolog

    mov         rsi, arg(0)                 ;y
    mov         rdi, arg(1)                 ;u
    mov         rcx, arg(2)                 ;v
    mov         rax, arg(3)                 ;ym
    movsxd      rbx, dword arg(4)           ;yp
    movsxd      rdx, dword arg(5)           ;uvp

    pxor        xmm0, xmm0                  ; all-zero pshufb control = broadcast byte 0

    ;make 16 copies of the center y value
    movd        xmm1, arg(6)
    pshufb      xmm1, xmm0

    ; make 16 copies of the center u value
    movd        xmm2, arg(7)
    pshufb      xmm2, xmm0

    ; make 16 copies of the center v value
    movd        xmm3, arg(8)
    pshufb      xmm3, xmm0
    unpcklpd    xmm2, xmm3                  ; xmm2 = u center x8 (low) | v center x8 (high)

    ;make 16 copies of the y tolerance
    movd        xmm3, arg(9)
    pshufb      xmm3, xmm0

    ;make 16 copies of the u tolerance
    movd        xmm4, arg(10)
    pshufb      xmm4, xmm0

    ;make 16 copies of the v tolerance
    movd        xmm5, arg(11)
    pshufb      xmm5, xmm0
    unpckhpd    xmm4, xmm5                  ; xmm4 = u tol x8 (low) | v tol x8 (high)

    mov         r8, 8                       ; 8 iterations, two y rows each

NextPairOfRows:
    ;grab the y source values (first row of the pair)
    movdqu      xmm0, [rsi]

    ;compute abs difference between source and y target
    ; (saturating subtract both ways, then OR the two results)
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm0
    psubusb     xmm0, xmm1
    psubusb     xmm6, xmm7
    por         xmm0, xmm6

    ;check whether the difference is < y tolerance -> row 0 mask in xmm6
    movdqa      xmm6, xmm3
    pcmpgtb     xmm6, xmm0

    ;grab the y source values (second row of the pair)
    add         rsi, rbx
    movdqu      xmm0, [rsi]

    ;compute abs difference between source and y target
    movdqa      xmm11, xmm1
    movdqa      xmm7, xmm0
    psubusb     xmm0, xmm1
    psubusb     xmm11, xmm7
    por         xmm0, xmm11

    ;check whether the difference is < y tolerance -> row 1 mask in xmm11
    movdqa      xmm11, xmm3
    pcmpgtb     xmm11, xmm0

    ;grab the u and v source values
    movdqu      xmm7, [rdi]
    movdqu      xmm8, [rcx]
    unpcklpd    xmm7, xmm8                  ; xmm7 = 8 u bytes (low) | 8 v bytes (high)

    ;compute abs difference between source and uv targets
    movdqa      xmm9, xmm2
    movdqa      xmm10, xmm7
    psubusb     xmm7, xmm2
    psubusb     xmm9, xmm10
    por         xmm7, xmm9

    ;check whether the number is < tolerance
    movdqa      xmm0, xmm4
    pcmpgtb     xmm0, xmm7

    ;double u and v masks
    movdqa      xmm8, xmm0
    punpckhbw   xmm0, xmm0                  ; v mask, each byte duplicated
    punpcklbw   xmm8, xmm8                  ; u mask, each byte duplicated

    ;mask row 0 and output
    pand        xmm6, xmm8
    pand        xmm6, xmm0
    movdqa      [rax], xmm6

    ;mask row 1 and output
    pand        xmm11, xmm8
    pand        xmm11, xmm0
    movdqa      [rax+16], xmm11

    ; to the next row or set of rows
    add         rsi, rbx
    add         rdi, rdx
    add         rcx, rdx
    add         rax, 32
    dec         r8
    jnz         NextPairOfRows

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;GROW_HORIZ (register for result, source register or mem local)
; Widens a 16-byte row by one byte in each direction:
;     %1 = %2 | (%2 shifted left one byte) | (%2 shifted right one byte)
; The shifts are whole-register byte shifts (pslldq/psrldq), so zeros are
; shifted in at the ends of the row.
;
; %1  xmm register receiving the result; must not be xmm4 or xmm5.
; %2  source; read with movdqa, so a memory operand must be 16-byte aligned.
;
; Clobbers xmm4 and xmm5 as scratch. These are volatile in both the
; System V AMD64 and Win64 calling conventions. The macro previously used
; xmm14/xmm15, which are callee-saved on Win64 and were never preserved by
; the code that expands this macro.
%macro GROW_HORIZ 2
    movdqa      %1, %2
    movdqa      xmm4, %1
    movdqa      xmm5, %1
    pslldq      xmm4, 1
    psrldq      xmm5, 1
    por         %1, xmm4
    por         %1, xmm5
%endmacro
;GROW_VERT (result, center row, above row, below row)
; result = center | above | below   (bitwise OR of three 16-byte rows)
; %1 is overwritten; %2, %3 and %4 are only read.
%macro GROW_VERT 4
    movdqa      %1, %2
    por         %1, %4
    por         %1, %3
%endmacro
;GROW_NEXTLINE (new line to grow, new source, line to write)
; 1. Horizontally grows the row at %2 into register %1 (via GROW_HORIZ).
; 2. ORs the three rows currently held in xmm0, xmm1 and xmm2 into xmm3.
; 3. Stores xmm3 to %3 with movdqa (%3 must be 16-byte aligned).
; %1 is expected to be one of xmm0/xmm1/xmm2 so the fresh row takes part
; in the OR. Clobbers xmm3 plus whatever GROW_HORIZ uses as scratch.
%macro GROW_NEXTLINE 3
    GROW_HORIZ  %1, %2

    movdqa      xmm3, xmm0
    por         xmm3, xmm1
    por         xmm3, xmm2

    movdqa      %3, xmm3
%endmacro
;-----------------------------------------------------------------------------
;void vp8_growmaskmb_sse3(
;    unsigned char *om,     arg(0)  source: 16 rows of 16 bytes
;    unsigned char *nm)     arg(1)  result: 16 rows of 16 bytes
;
; Each output row is the OR of the horizontally grown source row with the
; horizontally grown rows directly above and below it. The first and last
; output rows only have one vertical neighbour.
;
; xmm0/xmm1/xmm2 act as a rotating three-row window of grown rows; xmm3 is
; the combined row being written. Both buffers are accessed with movdqa
; and therefore must be 16-byte aligned.
; No return value is set up.
;-----------------------------------------------------------------------------
global sym(vp8_growmaskmb_sse3)
sym(vp8_growmaskmb_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src
    mov         rdi, arg(1)                 ;rst

    ; prime the window with grown rows 0, 1 and 2
    GROW_HORIZ  xmm0, [rsi + 16*0]
    GROW_HORIZ  xmm1, [rsi + 16*1]
    GROW_HORIZ  xmm2, [rsi + 16*2]

    ; output row 1 = rows 0|1|2, output row 0 = rows 0|1
    GROW_VERT   xmm3, xmm0, xmm1, xmm2
    movdqa      [rdi + 16*1], xmm3
    por         xmm0, xmm1
    movdqa      [rdi + 16*0], xmm0

    ; slide the window: grow source row N into the oldest slot,
    ; then write output row N-1
    GROW_NEXTLINE xmm0, [rsi + 16*3],  [rdi + 16*2]
    GROW_NEXTLINE xmm1, [rsi + 16*4],  [rdi + 16*3]
    GROW_NEXTLINE xmm2, [rsi + 16*5],  [rdi + 16*4]
    GROW_NEXTLINE xmm0, [rsi + 16*6],  [rdi + 16*5]
    GROW_NEXTLINE xmm1, [rsi + 16*7],  [rdi + 16*6]
    GROW_NEXTLINE xmm2, [rsi + 16*8],  [rdi + 16*7]
    GROW_NEXTLINE xmm0, [rsi + 16*9],  [rdi + 16*8]
    GROW_NEXTLINE xmm1, [rsi + 16*10], [rdi + 16*9]
    GROW_NEXTLINE xmm2, [rsi + 16*11], [rdi + 16*10]
    GROW_NEXTLINE xmm0, [rsi + 16*12], [rdi + 16*11]
    GROW_NEXTLINE xmm1, [rsi + 16*13], [rdi + 16*12]
    GROW_NEXTLINE xmm2, [rsi + 16*14], [rdi + 16*13]
    GROW_NEXTLINE xmm0, [rsi + 16*15], [rdi + 16*14]

    ; output row 15 = rows 14|15 (xmm2 holds row 14, xmm0 holds row 15)
    por         xmm0, xmm2
    movdqa      [rdi + 16*15], xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;unsigned int vp8_sad16x16_masked_wmt(
;    unsigned char *src_ptr,    arg(0)
;    int  src_stride,           arg(1)
;    unsigned char *ref_ptr,    arg(2)
;    int  ref_stride,           arg(3)
;    unsigned char *mask)       arg(4)  16 rows of 16 bytes, read contiguously
;
; Sum of absolute differences over a 16x16 block after ANDing both the
; source and the reference row with the matching mask row, so bytes whose
; mask is 0x00 contribute nothing.
;
; Returns the sum in rax.
; All loads are unaligned (movdqu); no alignment is required of any pointer.
;-----------------------------------------------------------------------------
global sym(vp8_sad16x16_masked_wmt)
sym(vp8_sad16x16_masked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                         ; callee-saved; used below for the mask pointer
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;ref_ptr
    mov         rbx, arg(4)                 ;mask
    movsxd      rax, dword ptr arg(1)       ;src_stride
    movsxd      rdx, dword ptr arg(3)       ;ref_stride

    mov         rcx, 16                     ; row counter
    pxor        xmm3, xmm3                  ; running SAD (one partial sum per 64-bit half)

NextSadRow:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [rbx]

    pand        xmm0, xmm2
    pand        xmm1, xmm2

    psadbw      xmm0, xmm1
    paddw       xmm3, xmm0

    add         rsi, rax
    add         rdi, rdx
    add         rbx, 16

    dec         rcx
    jnz         NextSadRow

    ; fold the high-half partial sum into the low half
    movdqa      xmm4, xmm3
    psrldq      xmm4, 8
    paddw       xmm3, xmm4
    movq        rax, xmm3

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;unsigned int vp8_sad16x16_unmasked_wmt(
;    unsigned char *src_ptr,    arg(0)
;    int  src_stride,           arg(1)
;    unsigned char *ref_ptr,    arg(2)
;    int  ref_stride,           arg(3)
;    unsigned char *mask)       arg(4)  16 rows of 16 bytes, read contiguously
;
; Sum of absolute differences over a 16x16 block after ORing both the
; source and the reference row with the matching mask row. A mask byte of
; 0xFF forces both operands to 0xFF, so that position contributes nothing;
; a mask byte of 0x00 leaves the pixel difference untouched.
;
; Returns the sum in rax.
; All loads are unaligned (movdqu); no alignment is required of any pointer.
;-----------------------------------------------------------------------------
global sym(vp8_sad16x16_unmasked_wmt)
sym(vp8_sad16x16_unmasked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                         ; callee-saved; used below for the mask pointer
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;ref_ptr
    mov         rbx, arg(4)                 ;mask
    movsxd      rax, dword ptr arg(1)       ;src_stride
    movsxd      rdx, dword ptr arg(3)       ;ref_stride

    mov         rcx, 16                     ; row counter
    pxor        xmm3, xmm3                  ; running SAD (one partial sum per 64-bit half)

next_vp8_sad16x16_unmasked_wmt:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [rbx]

    por         xmm0, xmm2
    por         xmm1, xmm2

    psadbw      xmm0, xmm1
    paddw       xmm3, xmm0

    add         rsi, rax
    add         rdi, rdx
    add         rbx, 16

    dec         rcx
    jnz         next_vp8_sad16x16_unmasked_wmt

    ; fold the high-half partial sum into the low half
    movdqa      xmm4, xmm3
    psrldq      xmm4, 8
    paddw       xmm3, xmm4
    movq        rax, xmm3

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;unsigned int vp8_masked_predictor_wmt(
;    unsigned char *masked,     arg(0)
;    unsigned char *unmasked,   arg(1)
;    int  src_stride,           arg(2)
;    unsigned char *dst_ptr,    arg(3)
;    int  dst_stride,           arg(4)
;    unsigned char *mask)       arg(5)  16 rows of 16 bytes, read contiguously
;
; Blends two 16x16 blocks through a mask, one 16-byte row per iteration:
;     dst = (masked & mask) | (unmasked & ~mask)
;
; rax is not set to a result; it still holds src_stride on exit.
;
; NOTE(review): the `unmasked` pointer advances by dst_stride in this
;   function, while vp8_masked_predictor_uv_wmt advances it by src_stride.
;   One of the two is probably unintended; confirm against the callers
;   before changing either.
;-----------------------------------------------------------------------------
global sym(vp8_masked_predictor_wmt)
sym(vp8_masked_predictor_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx                         ; callee-saved; used below for the mask pointer
    ; end prolog

    mov         rsi, arg(0)                 ;masked
    mov         rdi, arg(1)                 ;unmasked
    mov         rbx, arg(5)                 ;mask
    movsxd      rax, dword ptr arg(2)       ;src_stride
    mov         r11, arg(3)                 ; destination
    movsxd      rdx, dword ptr arg(4)       ;dst_stride

    mov         rcx, 16                     ; row counter

next_vp8_masked_predictor_wmt:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [rbx]

    pand        xmm0, xmm2                  ; masked & mask
    pandn       xmm2, xmm1                  ; ~mask & unmasked
    por         xmm0, xmm2
    movdqu      [r11], xmm0

    add         r11, rdx
    add         rsi, rax
    add         rdi, rdx                    ; see NOTE(review) in the header
    add         rbx, 16

    dec         rcx
    jnz         next_vp8_masked_predictor_wmt

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;unsigned int vp8_masked_predictor_uv_wmt(
;    unsigned char *masked,     arg(0)
;    unsigned char *unmasked,   arg(1)
;    int  src_stride,           arg(2)
;    unsigned char *dst_ptr,    arg(3)
;    int  dst_stride,           arg(4)
;    unsigned char *mask)       arg(5)  8 rows of 8 bytes, read contiguously
;
; Blends two 8x8 blocks through a mask, one 8-byte row per iteration:
;     dst = (masked & mask) | (unmasked & ~mask)
; Both source pointers advance by src_stride; the destination advances by
; dst_stride.
;
; rax is not set to a result; it still holds src_stride on exit.
;-----------------------------------------------------------------------------
global sym(vp8_masked_predictor_uv_wmt)
sym(vp8_masked_predictor_uv_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx                         ; callee-saved; used below for the mask pointer
    ; end prolog

    mov         rsi, arg(0)                 ;masked
    mov         rdi, arg(1)                 ;unmasked
    mov         rbx, arg(5)                 ;mask
    movsxd      rax, dword ptr arg(2)       ;src_stride
    mov         r11, arg(3)                 ; destination
    movsxd      rdx, dword ptr arg(4)       ;dst_stride

    mov         rcx, 8                      ; row counter

next_vp8_masked_predictor_uv_wmt:
    movq        xmm0, [rsi]
    movq        xmm1, [rdi]
    movq        xmm2, [rbx]

    pand        xmm0, xmm2                  ; masked & mask
    pandn       xmm2, xmm1                  ; ~mask & unmasked
    por         xmm0, xmm2
    movq        [r11], xmm0

    add         r11, rdx
    add         rsi, rax
    add         rdi, rax
    add         rbx, 8

    dec         rcx
    jnz         next_vp8_masked_predictor_uv_wmt

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
;unsigned int vp8_uv_from_y_mask(
;    unsigned char *ymask,      arg(0)  source, read 16 bytes at a time
;    unsigned char *uvmask)     arg(1)  result, 8 rows of 8 bytes
;
; Builds an 8x8 mask from a 16-byte-wide mask by keeping every other byte
; of every other row: each iteration loads 16 bytes, keeps bytes
; 0,2,4,...,14, stores those 8 bytes, then skips ahead 32 source bytes
; (two 16-byte rows).
;
; rax is not set to a result.
; pshufb is an SSSE3 instruction. Its control table is addressed
; RIP-relative so the code stays position independent in 64-bit builds.
;-----------------------------------------------------------------------------
global sym(vp8_uv_from_y_mask)
sym(vp8_uv_from_y_mask):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(1)                 ;dst_ptr

    mov         rcx, 8                      ; output row counter

next_p8_uv_from_y_mask:
    movdqu      xmm0, [rsi]
    pshufb      xmm0, [rel shuf1b]          ; gather even bytes into the low 8 lanes
    movq        [rdi], xmm0

    add         rdi, 8
    add         rsi, 32                     ; skip one source row

    dec         rcx
    jnz         next_p8_uv_from_y_mask

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16
; pshufb control used by vp8_uv_from_y_mask. The table is 16-byte aligned
; because it is used directly as a pshufb memory operand.
shuf1b:
    db 0, 2, 4, 6, 8, 10, 12, 14            ; low 8 lanes: even source bytes
    times 8 db 0                            ; high 8 lanes: source byte 0 (not stored by the user)