; vpx/vp8/common/x86/recon_mmx.asm

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
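
; x86_abi_support.asm provides the helper macros used below: sym() for
; platform-specific symbol naming, arg(n) for reading the nth function
; argument, and SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS for the portable
; prolog/epilog that makes register arguments accessible on 64-bit ABIs.
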
;void vp8_copy_mem8x8_mmx(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride
;    )
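;
; For reference, a plain C sketch of what this routine does (illustrative
; only, not part of the build):
;
;     int r;
;     for (r = 0; r < 8; r++)
;         memcpy(dst + r * dst_stride, src + r * src_stride, 8);
;
; The MMX code below does the same copy with one movq load and one movq
; store (8 bytes) per row, stepping src/dst by their strides.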
global sym(vp8_copy_mem8x8_mmx)
sym(vp8_copy_mem8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;src;
    movq        mm0, [rsi]

    movsxd      rax, dword ptr arg(1) ;src_stride;
    mov         rdi, arg(2) ;dst;

    movq        mm1, [rsi+rax]
    movq        mm2, [rsi+rax*2]

    movsxd      rcx, dword ptr arg(3) ;dst_stride
    lea         rsi, [rsi+rax*2]

    movq        [rdi], mm0
    add         rsi, rax

    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm2

    lea         rdi, [rdi+rcx*2]
    movq        mm3, [rsi]

    add         rdi, rcx
    movq        mm4, [rsi+rax]
    movq        mm5, [rsi+rax*2]

    movq        [rdi], mm3
    lea         rsi, [rsi+rax*2]

    movq        [rdi+rcx], mm4
    movq        [rdi+rcx*2], mm5

    lea         rdi, [rdi+rcx*2]
    movq        mm0, [rsi+rax]
    movq        mm1, [rsi+rax*2]

    movq        [rdi+rcx], mm0
    movq        [rdi+rcx*2], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_copy_mem8x4_mmx(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride
;    )
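;
; Same pattern as vp8_copy_mem8x8_mmx above, but only four rows of
; 8 bytes are copied (the illustrative C loop with r < 4).
;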
global sym(vp8_copy_mem8x4_mmx)
sym(vp8_copy_mem8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;src;
    movq        mm0, [rsi]

    movsxd      rax, dword ptr arg(1) ;src_stride;
    mov         rdi, arg(2) ;dst;

    movq        mm1, [rsi+rax]
    movq        mm2, [rsi+rax*2]

    movsxd      rcx, dword ptr arg(3) ;dst_stride
    lea         rsi, [rsi+rax*2]

    movq        [rdi], mm0
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm2

    lea         rdi, [rdi+rcx*2]
    movq        mm3, [rsi+rax]
    movq        [rdi+rcx], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_copy_mem16x16_mmx(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride
;    )
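;
; Same pattern for a 16x16 block: sixteen rows, copied with two movq
; load/store pairs per row (bytes 0-7 and 8-15). Illustrative C sketch,
; not part of the build:
;
;     int r;
;     for (r = 0; r < 16; r++)
;         memcpy(dst + r * dst_stride, src + r * src_stride, 16);
;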
global sym(vp8_copy_mem16x16_mmx)
sym(vp8_copy_mem16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;src;
    movsxd      rax, dword ptr arg(1) ;src_stride;
    mov         rdi, arg(2) ;dst;
    movsxd      rcx, dword ptr arg(3) ;dst_stride

    ; rows 0-2
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]
    movq        mm1, [rsi+rax]
    movq        mm4, [rsi+rax+8]
    movq        mm2, [rsi+rax*2]
    movq        mm5, [rsi+rax*2+8]

    lea         rsi, [rsi+rax*2]
    add         rsi, rax

    movq        [rdi], mm0
    movq        [rdi+8], mm3
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx+8], mm4
    movq        [rdi+rcx*2], mm2
    movq        [rdi+rcx*2+8], mm5

    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 3-5
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]
    movq        mm1, [rsi+rax]
    movq        mm4, [rsi+rax+8]
    movq        mm2, [rsi+rax*2]
    movq        mm5, [rsi+rax*2+8]

    lea         rsi, [rsi+rax*2]
    add         rsi, rax

    movq        [rdi], mm0
    movq        [rdi+8], mm3
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx+8], mm4
    movq        [rdi+rcx*2], mm2
    movq        [rdi+rcx*2+8], mm5

    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 6-8
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]
    movq        mm1, [rsi+rax]
    movq        mm4, [rsi+rax+8]
    movq        mm2, [rsi+rax*2]
    movq        mm5, [rsi+rax*2+8]

    lea         rsi, [rsi+rax*2]
    add         rsi, rax

    movq        [rdi], mm0
    movq        [rdi+8], mm3
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx+8], mm4
    movq        [rdi+rcx*2], mm2
    movq        [rdi+rcx*2+8], mm5

    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 9-11
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]
    movq        mm1, [rsi+rax]
    movq        mm4, [rsi+rax+8]
    movq        mm2, [rsi+rax*2]
    movq        mm5, [rsi+rax*2+8]

    lea         rsi, [rsi+rax*2]
    add         rsi, rax

    movq        [rdi], mm0
    movq        [rdi+8], mm3
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx+8], mm4
    movq        [rdi+rcx*2], mm2
    movq        [rdi+rcx*2+8], mm5

    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; rows 12-14
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]
    movq        mm1, [rsi+rax]
    movq        mm4, [rsi+rax+8]
    movq        mm2, [rsi+rax*2]
    movq        mm5, [rsi+rax*2+8]

    lea         rsi, [rsi+rax*2]
    add         rsi, rax

    movq        [rdi], mm0
    movq        [rdi+8], mm3
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx+8], mm4
    movq        [rdi+rcx*2], mm2
    movq        [rdi+rcx*2+8], mm5

    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; row 15
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]

    movq        [rdi], mm0
    movq        [rdi+8], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret