c5f890af2c
Prepend . to local labels in assembly code. This allows non unique labels within a file. Also makes profiling information more informative by keeping the function name with the loop name. Change-Id: I7a983cb3a5ba2413d5dafd0a37936b268fb9e37f
433 lines
13 KiB
NASM
433 lines
13 KiB
NASM
;
|
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license
|
|
; that can be found in the LICENSE file in the root of the source
|
|
; tree. An additional intellectual property rights grant can be found
|
|
; in the file PATENTS. All contributing project authors may
|
|
; be found in the AUTHORS file in the root of the source tree.
|
|
;
|
|
|
|
|
|
%include "vpx_ports/x86_abi_support.asm"
|
|
|
|
;-----------------------------------------------------------------------
; void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; Subtracts a 4x4 block of predictor bytes from a 4x4 block of source
; bytes and stores the results as 16-bit values:
;
;   for (r = 0; r < 4; r++)
;       for (c = 0; c < 4; c++)
;           diff[r*pitch + c] = z[r*src_stride + c] - Predictor[r*pitch + c];
;
; Note that `pitch` is used both as the byte stride of Predictor and as
; the element (short) stride of diff.
;
; Register roles after the prolog:
;   rsi = z           rdx = src_stride (sign-extended)
;   rax = Predictor   rcx = pitch      (sign-extended)
;   rdi = diff        mm7 = 0 (used to widen bytes to words)
; Clobbers rax, rcx, rdx, mm0, mm1, mm7.  No emms is issued here.
;-----------------------------------------------------------------------

; SUBTRACT_B_ROW src, pred, dst
;   Loads 4 bytes from each of src/pred, zero-extends them to words and
;   writes the 4 word differences (src - pred) to dst.  Expects mm7 == 0.
%macro SUBTRACT_B_ROW 3
    movd        mm0,        %1
    movd        mm1,        %2
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        %3,         mm0
%endmacro

global sym(vp8_subtract_b_mmx_impl)
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(2)              ; diff
    mov         rax,        arg(3)              ; Predictor
    mov         rsi,        arg(0)              ; z
    movsxd      rdx,        dword ptr arg(1)    ; src_stride
    movsxd      rcx,        dword ptr arg(4)    ; pitch
    pxor        mm7,        mm7                 ; zero for byte -> word unpack

    ; rows 0..2 are reachable with the x1/x2/x4 address scales
    SUBTRACT_B_ROW [rsi],       [rax],       [rdi]
    SUBTRACT_B_ROW [rsi+rdx],   [rax+rcx],   [rdi+rcx*2]
    SUBTRACT_B_ROW [rsi+rdx*2], [rax+rcx*2], [rdi+rcx*4]

    ; row 3: there is no x3 scale, so rebase the source by two rows and
    ; turn rcx into 3*pitch (diff then needs 3*pitch shorts = rcx*2 bytes)
    lea         rsi,        [rsi+rdx*2]
    lea         rcx,        [rcx+rcx*2]
    SUBTRACT_B_ROW [rsi+rdx],   [rax+rcx],   [rdi+rcx*2]

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
;-----------------------------------------------------------------------
; void vp8_subtract_mby_mmx(short *diff, unsigned char *src,
;                           unsigned char *pred, int stride)
;
; Computes a 16x16 block of 16-bit differences:
;
;   for (r = 0; r < 16; r++)
;       for (c = 0; c < 16; c++)
;           diff[r*16 + c] = src[r*stride + c] - pred[r*16 + c];
;
; src rows are `stride` bytes apart; pred and diff are walked as packed
; buffers (16 bytes / 16 shorts per row).
;
; Register roles inside the loop:
;   rsi = src row     rax = pred row     rdi = diff row
;   rdx = stride      rcx = rows left    mm0 = 0
; Clobbers rax, rcx, rdx, mm0-mm4.  No emms is issued here.
;-----------------------------------------------------------------------

; SUBTRACT_MBY_8PX src, pred, dst_lo, dst_hi
;   Loads 8 bytes from each of src/pred, widens them to words and stores
;   src - pred: pixels 0-3 to dst_lo, pixels 4-7 to dst_hi.
;   Expects mm0 == 0.
%macro SUBTRACT_MBY_8PX 4
    movq        mm1,        %1
    movq        mm3,        %2

    movq        mm2,        mm1                 ; keep copies for the high halves
    movq        mm4,        mm3

    punpcklbw   mm1,        mm0
    punpcklbw   mm3,        mm0

    punpckhbw   mm2,        mm0
    punpckhbw   mm4,        mm0

    psubw       mm1,        mm3
    psubw       mm2,        mm4

    movq        %3,         mm1
    movq        %4,         mm2
%endmacro

global sym(vp8_subtract_mby_mmx)
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(1)              ; src
    mov         rdi,        arg(0)              ; diff

    mov         rax,        arg(2)              ; pred
    movsxd      rdx,        dword ptr arg(3)    ; stride

    mov         rcx,        16                  ; row counter
    pxor        mm0,        mm0                 ; zero for byte -> word unpack

.submby_loop:
    SUBTRACT_MBY_8PX [rsi],   [rax],   [rdi],    [rdi+8]    ; pixels 0-7
    SUBTRACT_MBY_8PX [rsi+8], [rax+8], [rdi+16], [rdi+24]   ; pixels 8-15

    add         rdi,        32                  ; next diff row (16 shorts)
    add         rax,        16                  ; next pred row (16 bytes)

    lea         rsi,        [rsi+rdx]           ; next src row

    sub         rcx,        1
    jnz         .submby_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
|
|
;-----------------------------------------------------------------------
; void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc,
;                            unsigned char *vsrc, unsigned char *pred,
;                            int stride)
;
; Computes two 8x8 blocks of 16-bit differences, first for usrc and then
; for vsrc:
;
;   short         *udiff = diff + 256,  *vdiff = diff + 320;
;   unsigned char *upred = pred + 256,  *vpred = pred + 320;
;
;   for (r = 0; r < 8; r++)
;       for (c = 0; c < 8; c++) {
;           udiff[r*8 + c] = usrc[r*stride + c] - upred[r*8 + c];
;           vdiff[r*8 + c] = vsrc[r*stride + c] - vpred[r*8 + c];
;       }
;
; Source rows are `stride` bytes apart; the pred and diff blocks are
; packed (8 bytes / 8 shorts per row).
;
; Register roles while a plane is processed:
;   rsi = source row   rax = pred row   rdi = diff row
;   rdx = stride       mm7 = 0
; Clobbers rax, rdx, mm0, mm1, mm3, mm4, mm7.  No emms is issued here.
;-----------------------------------------------------------------------

; Element offsets of the U and V blocks inside both diff[] and pred[].
%define MBUV_U_OFFSET 256
%define MBUV_V_OFFSET 320

; SUBTRACT_MBUV_ROW src, pred, dst_lo, dst_hi
;   Loads 8 bytes from each of src/pred, widens them to words and stores
;   src - pred: pixels 0-3 to dst_lo, pixels 4-7 to dst_hi.
;   Expects mm7 == 0.
%macro SUBTRACT_MBUV_ROW 4
    movq        mm0,        %1
    movq        mm1,        %2
    movq        mm3,        mm0                 ; keep copies for the high halves
    movq        mm4,        mm1
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    punpckhbw   mm3,        mm7
    punpckhbw   mm4,        mm7
    psubw       mm0,        mm1
    psubw       mm3,        mm4
    movq        %3,         mm0
    movq        %4,         mm3
%endmacro

; SUBTRACT_MBUV_4ROWS
;   Processes four consecutive rows starting at rsi / rax / rdi.
;   On exit rsi has advanced by 2*stride; rax and rdi are unchanged.
%macro SUBTRACT_MBUV_4ROWS 0
    SUBTRACT_MBUV_ROW [rsi],       [rax],    [rdi],    [rdi+8]
    SUBTRACT_MBUV_ROW [rsi+rdx],   [rax+8],  [rdi+16], [rdi+24]
    SUBTRACT_MBUV_ROW [rsi+rdx*2], [rax+16], [rdi+32], [rdi+40]
    lea         rsi,        [rsi+rdx*2]         ; no x3 scale: rebase for row 3
    SUBTRACT_MBUV_ROW [rsi+rdx],   [rax+24], [rdi+48], [rdi+56]
%endmacro

; SUBTRACT_MBUV_PLANE
;   Processes one full 8x8 block: rows 0-3, advance all three pointers
;   to row 4, rows 4-7.  Pointer registers are left mid-block on exit.
%macro SUBTRACT_MBUV_PLANE 0
    SUBTRACT_MBUV_4ROWS

    add         rdi,        64                  ; 4 rows * 8 shorts
    add         rax,        32                  ; 4 rows * 8 bytes
    lea         rsi,        [rsi+rdx*2]         ; completes the 4-row source advance

    SUBTRACT_MBUV_4ROWS
%endmacro

global sym(vp8_subtract_mbuv_mmx)
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; Loop-invariant state, set up once and shared by both planes.
    movsxd      rdx,        dword ptr arg(4)    ; stride
    pxor        mm7,        mm7                 ; zero for byte -> word unpack

    ; ---- U plane: z = usrc, diff = udiff, Predictor = upred ----
    mov         rdi,        arg(0)              ; diff
    mov         rax,        arg(3)              ; pred
    mov         rsi,        arg(1)              ; z = usrc
    add         rdi,        MBUV_U_OFFSET*2     ; diff + 256 shorts
    add         rax,        MBUV_U_OFFSET       ; pred + 256 bytes

    SUBTRACT_MBUV_PLANE

    ; ---- V plane: z = vsrc, diff = vdiff, Predictor = vpred ----
    ; rdx and mm7 are untouched by the plane macro, so they are still valid.
    mov         rdi,        arg(0)              ; diff
    mov         rax,        arg(3)              ; pred
    mov         rsi,        arg(2)              ; z = vsrc
    add         rdi,        MBUV_V_OFFSET*2     ; diff + 320 shorts
    add         rax,        MBUV_V_OFFSET       ; pred + 320 bytes

    SUBTRACT_MBUV_PLANE

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|