vpx/vp8/encoder/x86/sad_sse2.asm
commit 20bd1446c0 by Yunqing Wang, 2011-07-22 09:28:06 -04:00

Preload reference area to an intermediate buffer in sub-pixel motion search

In sub-pixel motion search, the search range is small (+/- 3 pixels).
Preload the whole search area from the reference buffer into a 32-byte
aligned buffer, then load reference data from this buffer during the
search. This keeps the data in cache and reduces the penalty for
crossing cache lines (a C sketch of the idea follows the numbers
below). For the tulip clip, tests on an Intel Core2 Quad machine
(Linux) showed the following encoder speed improvements:
  3.4%   at --rt --cpu-used=-4
  2.8%   at --rt --cpu-used=-3
  2.3%   at --rt --cpu-used=-2
  2.2%   at --rt --cpu-used=-1
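
As an illustration, here is a minimal C sketch of the preload step; the
buffer stride, the function name, and the 3-pixel border are assumptions
made for illustration, not the actual libvpx code:

    #include <string.h>

    #define BUF_STRIDE 32  /* 32-byte rows keep every row 32-byte aligned */

    /* Copy the reference area around the best full-pel match into an
     * aligned scratch buffer; the sub-pixel probes then read from the
     * copy, so the data stays in cache for the whole refinement. */
    static void preload_ref_area(unsigned char *dst,       /* 32-byte aligned */
                                 const unsigned char *ref, /* best full-pel */
                                 int ref_stride, int rows)
    {
        int r;
        ref -= 3 * ref_stride + 3;  /* border for the +/-3 pixel range */
        for (r = 0; r < rows; r++)
            memcpy(dst + r * BUF_STRIDE, ref + r * ref_stride, BUF_STRIDE);
    }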

A test on an Atom notebook showed only a 1.1% speed improvement
(speed=-4). A test on a Xeon machine also showed a smaller improvement,
since the latency of unaligned data access is greatly reduced in newer
cores.

Next, I will apply a similar idea to the other two sub-pixel search
functions used for encoding speeds > 4.

This change is made exclusively for x86 platforms.

Change-Id: Ia7bb9f56169eac0f01009fe2b2f2ab5b61d2eb2f

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
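
; All of the SAD routines below follow the same pattern: psadbw sums the
; absolute differences of the packed bytes, leaving a 16-bit partial sum
; in the low word of each 64-bit lane; the partials are accumulated over
; the rows and folded into a single scalar at the end. In scalar C terms,
; each routine computes (an illustrative reference, not the project's
; actual C code):
;
;   sad = 0;
;   for (r = 0; r < h; r++)
;     for (c = 0; c < w; c++)
;       sad += abs(src_ptr[r * src_stride + c] - ref_ptr[r * ref_stride + c]);
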
;unsigned int vp8_sad16x16_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
global sym(vp8_sad16x16_wmt)
sym(vp8_sad16x16_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
SAVE_XMM 6
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
lea rcx, [rsi+rax*8]
lea rcx, [rcx+rax*8] ; rcx = src_ptr + 16 * src_stride, the loop bound
pxor xmm6, xmm6 ; clear the SAD accumulator
x16x16sad_wmt_loop:
movq xmm0, QWORD PTR [rsi]
movq xmm2, QWORD PTR [rsi+8]
movq xmm1, QWORD PTR [rdi]
movq xmm3, QWORD PTR [rdi+8]
movq xmm4, QWORD PTR [rsi+rax]
movq xmm5, QWORD PTR [rdi+rdx]
punpcklbw xmm0, xmm2 ; interleave the two 8-byte halves of the src row
punpcklbw xmm1, xmm3 ; interleave the ref halves identically
psadbw xmm0, xmm1 ; one psadbw then covers all 16 bytes of the row
movq xmm2, QWORD PTR [rsi+rax+8]
movq xmm3, QWORD PTR [rdi+rdx+8]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
punpcklbw xmm4, xmm2
punpcklbw xmm5, xmm3
psadbw xmm4, xmm5
paddw xmm6, xmm0
paddw xmm6, xmm4
cmp rsi, rcx
jne x16x16sad_wmt_loop
movq xmm0, xmm6
psrldq xmm6, 8 ; bring the upper 64-bit partial sum down
paddw xmm0, xmm6 ; fold the two partial sums together
movq rax, xmm0 ; return the final SAD
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

;unsigned int vp8_sad8x16_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int max_err)
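;
; Note: the MMX variants below (8x16, 8x8, 16x8) take a max_err
; threshold and return early once the accumulated SAD exceeds it, so
; clearly inferior candidates are rejected without scanning every row.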
global sym(vp8_sad8x16_wmt)
sym(vp8_sad8x16_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rbx, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
lea rcx, [rsi+rbx*8]
lea rcx, [rcx+rbx*8]
pxor mm7, mm7
x8x16sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4) ; early exit once the running SAD exceeds max_err
jg x8x16sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
movq mm2, QWORD PTR [rsi+rbx]
movq mm3, QWORD PTR [rdi+rdx]
psadbw mm0, mm1
psadbw mm2, mm3
lea rsi, [rsi+rbx*2]
lea rdi, [rdi+rdx*2]
paddw mm7, mm0
paddw mm7, mm2
cmp rsi, rcx
jne x8x16sad_wmt_loop
movq rax, mm7
x8x16sad_wmt_early_exit:
; begin epilog
pop rdi
pop rsi
pop rbx
UNSHADOW_ARGS
pop rbp
ret

;unsigned int vp8_sad8x8_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int max_err)
global sym(vp8_sad8x8_wmt)
sym(vp8_sad8x8_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rbx, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
x8x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
jg x8x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm1, QWORD PTR [rdi]
psadbw mm0, mm1
lea rsi, [rsi+rbx]
add rdi, rdx
paddw mm7, mm0
cmp rsi, rcx
jne x8x8sad_wmt_loop
movq rax, mm7
x8x8sad_wmt_early_exit:
; begin epilog
pop rdi
pop rsi
pop rbx
UNSHADOW_ARGS
pop rbp
ret

;unsigned int vp8_sad4x4_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
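;
; The 4x4 case is fully unrolled: each punpcklbw pair below packs two
; 4-pixel rows into a single 8-byte quantity, so two psadbw instructions
; cover the whole block.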
global sym(vp8_sad4x4_wmt)
sym(vp8_sad4x4_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
movd mm0, DWORD PTR [rsi]
movd mm1, DWORD PTR [rdi]
movd mm2, DWORD PTR [rsi+rax]
movd mm3, DWORD PTR [rdi+rdx]
punpcklbw mm0, mm2
punpcklbw mm1, mm3
psadbw mm0, mm1
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movd mm4, DWORD PTR [rsi]
movd mm5, DWORD PTR [rdi]
movd mm6, DWORD PTR [rsi+rax]
movd mm7, DWORD PTR [rdi+rdx]
punpcklbw mm4, mm6
punpcklbw mm5, mm7
psadbw mm4, mm5
paddw mm0, mm4
movq rax, mm0
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

;unsigned int vp8_sad16x8_wmt(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int max_err)
global sym(vp8_sad16x8_wmt)
sym(vp8_sad16x8_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rbx, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
lea rcx, [rsi+rbx*8]
pxor mm7, mm7
x16x8sad_wmt_loop:
movq rax, mm7
cmp eax, arg(4)
jg x16x8sad_wmt_early_exit
movq mm0, QWORD PTR [rsi]
movq mm2, QWORD PTR [rsi+8]
movq mm1, QWORD PTR [rdi]
movq mm3, QWORD PTR [rdi+8]
movq mm4, QWORD PTR [rsi+rbx]
movq mm5, QWORD PTR [rdi+rdx]
psadbw mm0, mm1
psadbw mm2, mm3
movq mm1, QWORD PTR [rsi+rbx+8]
movq mm3, QWORD PTR [rdi+rdx+8]
psadbw mm4, mm5
psadbw mm1, mm3
lea rsi, [rsi+rbx*2]
lea rdi, [rdi+rdx*2]
paddw mm0, mm2
paddw mm4, mm1
paddw mm7, mm0
paddw mm7, mm4
cmp rsi, rcx
jne x16x8sad_wmt_loop
movq rax, mm7
x16x8sad_wmt_early_exit:
; begin epilog
pop rdi
pop rsi
pop rbx
UNSHADOW_ARGS
pop rbp
ret

;void vp8_copy32xn_sse2(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
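;
; This routine stages reference data into the aligned intermediate
; buffer described in the commit message: the source may be at any
; alignment (movdqu loads) while the destination rows are 32-byte
; aligned (movdqa stores). Four rows are copied per iteration, with a
; one-row tail loop for any remainder.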
global sym(vp8_copy32xn_sse2)
sym(vp8_copy32xn_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;dst_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;dst_stride
movsxd rcx, dword ptr arg(4) ;height
block_copy_sse2_loopx4:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
movdqu xmm2, XMMWORD PTR [rsi + rax]
movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqu xmm4, XMMWORD PTR [rsi]
movdqu xmm5, XMMWORD PTR [rsi + 16]
movdqu xmm6, XMMWORD PTR [rsi + rax]
movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
movdqa XMMWORD PTR [rdi + rdx], xmm2
movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
lea rdi, [rdi+rdx*2]
movdqa XMMWORD PTR [rdi], xmm4
movdqa XMMWORD PTR [rdi + 16], xmm5
movdqa XMMWORD PTR [rdi + rdx], xmm6
movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
lea rdi, [rdi+rdx*2]
sub rcx, 4
cmp rcx, 4
jge block_copy_sse2_loopx4
cmp rcx, 0
je copy_is_done
block_copy_sse2_loop:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
lea rsi, [rsi+rax]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
lea rdi, [rdi+rdx]
sub rcx, 1
jne block_copy_sse2_loop
copy_is_done:
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret