02ff360b33
Also port the 4x4, 16x16, 8x16 and 16x8 versions to x86inc.asm; this makes them all slightly faster, particularly on x86-64. Remove SSE3 sad16x16 version, since the SSE2 version is now faster. About 1.5% overall encoding speedup. Change-Id: Id4011a78cce7839f554b301d0800d5ca021af797
93 lines
2.9 KiB
NASM
93 lines
2.9 KiB
NASM
;
|
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license
|
|
; that can be found in the LICENSE file in the root of the source
|
|
; tree. An additional intellectual property rights grant can be found
|
|
; in the file PATENTS. All contributing project authors may
|
|
; be found in the AUTHORS file in the root of the source tree.
|
|
;
|
|
|
|
|
|
%include "vpx_ports/x86_abi_support.asm"
|
|
|
|
;-----------------------------------------------------------------------
;void vp9_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr, stepping
; src by src_stride and dst by dst_stride after every row. Rows are moved
; four at a time while at least four remain, then one at a time.
; A height of zero or less copies nothing.
;
; Alignment: source rows are read with movdqu (no alignment required);
;            destination rows are written with movdqa, so every
;            destination row address (dst_ptr + k*dst_stride) must be
;            16-byte aligned.
;
; Register roles after the prolog:
;   rsi = current source row        rax = src_stride (sign-extended)
;   rdi = current destination row   rdx = dst_stride (sign-extended)
;   rcx = rows remaining            xmm0-xmm7 = row data in flight
;
; rsi/rdi are saved and restored here with push/pop. xmm6/xmm7 handling is
; delegated to SAVE_XMM/RESTORE_XMM from x86_abi_support.asm (presumably
; preserves them where the ABI requires it -- confirm against that file).
;-----------------------------------------------------------------------
global sym(vp9_copy32xn_sse2) PRIVATE
sym(vp9_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(2)                     ;dst_ptr

    movsxd      rax, dword ptr arg(1)           ;src_stride
    movsxd      rdx, dword ptr arg(3)           ;dst_stride
    movsxd      rcx, dword ptr arg(4)           ;height

    ; The unrolled loop always moves four rows, so it must only be entered
    ; when at least four rows remain; otherwise it would read and write
    ; past the requested area. Signed compare: height is a signed int.
    cmp         rcx, 4
    jl          .block_copy_sse2_tail

.block_copy_sse2_loopx4:
    ; invariant: rcx >= 4 here
    ; load rows 0 and 1
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + 16]
    movdqu      xmm2, XMMWORD PTR [rsi + rax]
    movdqu      xmm3, XMMWORD PTR [rsi + rax + 16]

    lea         rsi, [rsi+rax*2]

    ; load rows 2 and 3
    movdqu      xmm4, XMMWORD PTR [rsi]
    movdqu      xmm5, XMMWORD PTR [rsi + 16]
    movdqu      xmm6, XMMWORD PTR [rsi + rax]
    movdqu      xmm7, XMMWORD PTR [rsi + rax + 16]

    lea         rsi, [rsi+rax*2]

    ; store rows 0 and 1 (aligned stores)
    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm1
    movdqa      XMMWORD PTR [rdi + rdx], xmm2
    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm3

    lea         rdi, [rdi+rdx*2]

    ; store rows 2 and 3 (aligned stores)
    movdqa      XMMWORD PTR [rdi], xmm4
    movdqa      XMMWORD PTR [rdi + 16], xmm5
    movdqa      XMMWORD PTR [rdi + rdx], xmm6
    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm7

    lea         rdi, [rdi+rdx*2]

    sub         rcx, 4
    cmp         rcx, 4
    jge         .block_copy_sse2_loopx4

.block_copy_sse2_tail:
    ; Fewer than four rows remain. Skip the single-row loop when nothing
    ; is left; a non-positive count must never enter it, because the loop
    ; below only terminates when rcx reaches exactly zero.
    test        rcx, rcx
    jle         .copy_is_done

.block_copy_sse2_loop:
    ; invariant: rcx > 0 here
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + 16]
    lea         rsi, [rsi+rax]

    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm1
    lea         rdi, [rdi+rdx]

    sub         rcx, 1
    jne         .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|