20bd1446c0
In sub-pixel motion search, the search range is small (+/- 3 pixels). Preload the whole search area from the reference buffer into a 32-byte-aligned buffer; then, during the search, load reference data from this buffer instead. This keeps the data in cache and reduces the penalty for accesses that cross a cache line. For the tulip clip, tests on an Intel Core2 Quad machine (Linux) showed the following encoder speed improvements: 3.4% at --rt --cpu-used=-4, 2.8% at --rt --cpu-used=-3, 2.3% at --rt --cpu-used=-2, 2.2% at --rt --cpu-used=-1. A test on an Atom notebook showed only a 1.1% speed improvement (speed=-4). A test on a Xeon machine also showed less improvement, since unaligned data access latency is greatly reduced in newer cores. Next, I will apply a similar idea to the other 2 sub-pixel search functions for encoding speed > 4. Make this change exclusively for x86 platforms. Change-Id: Ia7bb9f56169eac0f01009fe2b2f2ab5b61d2eb2f
411 lines
11 KiB
NASM
411 lines
11 KiB
NASM
;
|
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license
|
|
; that can be found in the LICENSE file in the root of the source
|
|
; tree. An additional intellectual property rights grant can be found
|
|
; in the file PATENTS. All contributing project authors may
|
|
; be found in the AUTHORS file in the root of the source tree.
|
|
;
|
|
|
|
|
|
%include "vpx_ports/x86_abi_support.asm"
|
|
|
|
;unsigned int vp8_sad16x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute byte differences between a 16-byte-wide, 16-row block at
; src_ptr and the corresponding block at ref_ptr. The result is returned
; in rax.
;
; Register roles after the prolog:
;   rsi  = current source row          rax = src_stride (sign-extended)
;   rdi  = current reference row       rdx = ref_stride (sign-extended)
;   rcx  = src_ptr + 16 * src_stride   (loop end marker)
;   xmm6 = running totals, one partial sum per 64-bit half
global sym(vp8_sad16x16_wmt)
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                      ;src_ptr
    movsxd      rax,        dword ptr arg(1)            ;src_stride
    mov         rdi,        arg(2)                      ;ref_ptr
    movsxd      rdx,        dword ptr arg(3)            ;ref_stride

    ; rcx = src_ptr + 16 * src_stride: the row pointer value that ends the loop
    mov         rcx,        rax
    shl         rcx,        4
    add         rcx,        rsi

    pxor        xmm6,       xmm6                        ; totals start at zero

.sad16x16_row_pair:
    ; --- first row of the pair ---
    ; Each 16-byte row is read as two 8-byte halves and byte-interleaved.
    ; Source and reference are interleaved the same way, so the byte
    ; pairing seen by psadbw is unchanged.
    movq        xmm0,       QWORD PTR [rsi]
    movq        xmm2,       QWORD PTR [rsi+8]
    movq        xmm1,       QWORD PTR [rdi]
    movq        xmm3,       QWORD PTR [rdi+8]
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm3
    psadbw      xmm0,       xmm1

    ; --- second row of the pair ---
    movq        xmm4,       QWORD PTR [rsi+rax]
    movq        xmm2,       QWORD PTR [rsi+rax+8]
    movq        xmm5,       QWORD PTR [rdi+rdx]
    movq        xmm3,       QWORD PTR [rdi+rdx+8]
    punpcklbw   xmm4,       xmm2
    punpcklbw   xmm5,       xmm3
    psadbw      xmm4,       xmm5

    ; fold both rows into the running totals
    paddw       xmm6,       xmm0
    paddw       xmm6,       xmm4

    ; step both pointers down two rows and test for the end of the block
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rdx*2]
    cmp         rsi,        rcx
    jne         .sad16x16_row_pair

    ; add the upper-half total to the lower-half total and return it
    movdqa      xmm0,       xmm6
    psrldq      xmm0,       8
    paddw       xmm0,       xmm6
    movq        rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
;unsigned int vp8_sad8x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute byte differences over an 8-byte-wide, 16-row block.
; Before every pair of rows the running total is compared (signed) with
; max_err; once the total is greater, the routine stops and returns the
; partial total. Otherwise the full total is returned. Result is in rax.
;
; Register roles after the prolog:
;   rsi = current source row          rbx = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16 * src_stride   (loop end marker)
;   mm7 = running total
global sym(vp8_sad8x16_wmt)
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                      ;src_ptr
    movsxd      rbx,        dword ptr arg(1)            ;src_stride
    mov         rdi,        arg(2)                      ;ref_ptr
    movsxd      rdx,        dword ptr arg(3)            ;ref_stride

    ; rcx = src_ptr + 16 * src_stride: the row pointer value that ends the loop
    mov         rcx,        rbx
    shl         rcx,        4
    add         rcx,        rsi

    pxor        mm7,        mm7                         ; total starts at zero

.sad8x16_row_pair:
    ; leave early, with the partial total already in rax, once it exceeds max_err
    movq        rax,        mm7
    cmp         eax,        arg(4)
    jg          .sad8x16_done

    ; two source rows, then the two matching reference rows
    movq        mm0,        QWORD PTR [rsi]
    movq        mm2,        QWORD PTR [rsi+rbx]
    movq        mm1,        QWORD PTR [rdi]
    movq        mm3,        QWORD PTR [rdi+rdx]

    psadbw      mm0,        mm1
    psadbw      mm2,        mm3

    paddw       mm7,        mm0
    paddw       mm7,        mm2

    ; step both pointers down two rows and test for the end of the block
    lea         rsi,        [rsi+rbx*2]
    lea         rdi,        [rdi+rdx*2]
    cmp         rsi,        rcx
    jne         .sad8x16_row_pair

    movq        rax,        mm7                         ; full-block total

.sad8x16_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
|
|
;unsigned int vp8_sad8x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute byte differences over an 8-byte-wide, 8-row block.
;
; max_err is a fifth argument that the body reads through arg(4): before
; every row the running total is compared (signed) with max_err, and once
; the total is greater the routine stops and returns the partial total.
; Otherwise the full total is returned. Result is in rax.
;
; Register roles after the prolog:
;   rsi = current source row          rbx = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8 * src_stride    (loop end marker)
;   mm7 = running total
global sym(vp8_sad8x8_wmt)
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                      ;src_ptr
    mov         rdi,        arg(2)                      ;ref_ptr

    movsxd      rbx,        dword ptr arg(1)            ;src_stride
    movsxd      rdx,        dword ptr arg(3)            ;ref_stride

    lea         rcx,        [rsi+rbx*8]                 ; end = src_ptr + 8 rows
    pxor        mm7,        mm7                         ; total starts at zero

.sad8x8_row:
    ; leave early, with the partial total already in rax, once it exceeds max_err
    movq        rax,        mm7
    cmp         eax,        arg(4)
    jg          .sad8x8_done

    movq        mm0,        QWORD PTR [rsi]
    movq        mm1,        QWORD PTR [rdi]
    psadbw      mm0,        mm1

    add         rsi,        rbx                         ; next source row
    add         rdi,        rdx                         ; next reference row
    paddw       mm7,        mm0

    cmp         rsi,        rcx
    jne         .sad8x8_row

    movq        rax,        mm7                         ; full-block total

.sad8x8_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
;unsigned int vp8_sad4x4_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute byte differences over a 4-byte-wide, 4-row block.
; Straight-line code: rows are handled two at a time by byte-interleaving
; a pair of 4-byte rows into one 8-byte operand for psadbw. The result is
; returned in rax.
;
; Register roles after the prolog:
;   rsi = source row pointer          rax = src_stride (sign-extended)
;   rdi = reference row pointer       rdx = ref_stride (sign-extended)
global sym(vp8_sad4x4_wmt)
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                      ;src_ptr
    movsxd      rax,        dword ptr arg(1)            ;src_stride
    mov         rdi,        arg(2)                      ;ref_ptr
    movsxd      rdx,        dword ptr arg(3)            ;ref_stride

    ; rows 0 and 1
    movd        mm0,        DWORD PTR [rsi]
    movd        mm2,        DWORD PTR [rsi+rax]
    movd        mm1,        DWORD PTR [rdi]
    movd        mm3,        DWORD PTR [rdi+rdx]

    ; move down to rows 2 and 3
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rdx*2]

    movd        mm4,        DWORD PTR [rsi]
    movd        mm6,        DWORD PTR [rsi+rax]
    movd        mm5,        DWORD PTR [rdi]
    movd        mm7,        DWORD PTR [rdi+rdx]

    ; Source and reference pairs are interleaved the same way, so the
    ; byte pairing seen by psadbw is unchanged.
    punpcklbw   mm0,        mm2                         ; source rows 0/1
    punpcklbw   mm1,        mm3                         ; reference rows 0/1
    punpcklbw   mm4,        mm6                         ; source rows 2/3
    punpcklbw   mm5,        mm7                         ; reference rows 2/3

    psadbw      mm0,        mm1
    psadbw      mm4,        mm5

    paddw       mm0,        mm4
    movq        rax,        mm0                         ; return value

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
|
|
;unsigned int vp8_sad16x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute byte differences over a 16-byte-wide, 8-row block.
;
; max_err is a fifth argument that the body reads through arg(4): before
; every pair of rows the running total is compared (signed) with max_err,
; and once the total is greater the routine stops and returns the partial
; total. Otherwise the full total is returned. Result is in rax.
;
; Register roles after the prolog:
;   rsi = current source row          rbx = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8 * src_stride    (loop end marker)
;   mm7 = running total
global sym(vp8_sad16x8_wmt)
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                      ;src_ptr
    mov         rdi,        arg(2)                      ;ref_ptr

    movsxd      rbx,        dword ptr arg(1)            ;src_stride
    movsxd      rdx,        dword ptr arg(3)            ;ref_stride

    lea         rcx,        [rsi+rbx*8]                 ; end = src_ptr + 8 rows
    pxor        mm7,        mm7                         ; total starts at zero

.sad16x8_row_pair:
    ; leave early, with the partial total already in rax, once it exceeds max_err
    movq        rax,        mm7
    cmp         eax,        arg(4)
    jg          .sad16x8_done

    ; first row: left and right 8-byte halves
    movq        mm0,        QWORD PTR [rsi]
    movq        mm2,        QWORD PTR [rsi+8]

    movq        mm1,        QWORD PTR [rdi]
    movq        mm3,        QWORD PTR [rdi+8]

    ; second row, left half
    movq        mm4,        QWORD PTR [rsi+rbx]
    movq        mm5,        QWORD PTR [rdi+rdx]

    psadbw      mm0,        mm1
    psadbw      mm2,        mm3

    ; second row, right half (mm1/mm3 are free again after the psadbw above)
    movq        mm1,        QWORD PTR [rsi+rbx+8]
    movq        mm3,        QWORD PTR [rdi+rdx+8]

    psadbw      mm4,        mm5
    psadbw      mm1,        mm3

    ; step both pointers down two rows
    lea         rsi,        [rsi+rbx*2]
    lea         rdi,        [rdi+rdx*2]

    paddw       mm0,        mm2                         ; first-row total
    paddw       mm4,        mm1                         ; second-row total

    paddw       mm7,        mm0
    paddw       mm7,        mm4

    cmp         rsi,        rcx
    jne         .sad16x8_row_pair

    movq        rax,        mm7                         ; full-block total

.sad16x8_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
;void vp8_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr.
;
; Source rows are read with movdqu and may be unaligned. Destination rows
; are written with movdqa, so every destination row address (dst_ptr plus
; any multiple of dst_stride) must be 16-byte aligned.
;
; Rows are copied four at a time while at least four remain, then one at a
; time. A height below 4 skips the unrolled loop, and a height of zero or
; less copies nothing.
;
; Register roles after the prolog:
;   rsi = current source row          rax = src_stride (sign-extended)
;   rdi = current destination row     rdx = dst_stride (sign-extended)
;   rcx = rows still to copy
global sym(vp8_copy32xn_sse2)
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                      ;src_ptr
    mov         rdi,        arg(2)                      ;dst_ptr

    movsxd      rax,        dword ptr arg(1)            ;src_stride
    movsxd      rdx,        dword ptr arg(3)            ;dst_stride
    movsxd      rcx,        dword ptr arg(4)            ;height

    ; The unrolled loop always moves four rows; entering it with fewer
    ; would read and write past the requested area.
    cmp         rcx,        4
    jl          .copy32xn_tail

.copy32xn_loopx4:
    ; load four source rows (eight 16-byte halves) ...
    movdqu      xmm0,       XMMWORD PTR [rsi]
    movdqu      xmm1,       XMMWORD PTR [rsi + 16]
    movdqu      xmm2,       XMMWORD PTR [rsi + rax]
    movdqu      xmm3,       XMMWORD PTR [rsi + rax + 16]

    lea         rsi,        [rsi+rax*2]

    movdqu      xmm4,       XMMWORD PTR [rsi]
    movdqu      xmm5,       XMMWORD PTR [rsi + 16]
    movdqu      xmm6,       XMMWORD PTR [rsi + rax]
    movdqu      xmm7,       XMMWORD PTR [rsi + rax + 16]

    lea         rsi,        [rsi+rax*2]

    ; ... then store them to four destination rows
    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm1
    movdqa      XMMWORD PTR [rdi + rdx], xmm2
    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm3

    lea         rdi,        [rdi+rdx*2]

    movdqa      XMMWORD PTR [rdi], xmm4
    movdqa      XMMWORD PTR [rdi + 16], xmm5
    movdqa      XMMWORD PTR [rdi + rdx], xmm6
    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm7

    lea         rdi,        [rdi+rdx*2]

    sub         rcx,        4
    cmp         rcx,        4
    jge         .copy32xn_loopx4

.copy32xn_tail:
    ; rcx is below 4 here; zero or negative means nothing is left to copy
    test        rcx,        rcx
    jle         .copy32xn_done

.copy32xn_loop:
    movdqu      xmm0,       XMMWORD PTR [rsi]
    movdqu      xmm1,       XMMWORD PTR [rsi + 16]
    lea         rsi,        [rsi+rax]

    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm1
    lea         rdi,        [rdi+rdx]

    sub         rcx,        1
    jne         .copy32xn_loop

.copy32xn_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|