981 lines
36 KiB
NASM
981 lines
36 KiB
NASM
|
;
|
||
|
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||
|
;
|
||
|
; Use of this source code is governed by a BSD-style license and patent
|
||
|
; grant that can be found in the LICENSE file in the root of the source
|
||
|
; tree. All contributing project authors may be found in the AUTHORS
|
||
|
; file in the root of the source tree.
|
||
|
;
|
||
|
|
||
|
|
||
|
%include "vpx_ports/x86_abi_support.asm"
|
||
|
|
||
|
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of the 256 signed 16-bit values starting at
; src_ptr (16 loop iterations, 16 shorts = 32 bytes per iteration).
;
; mm4 holds two 32-bit partial sums; they are spilled to the stack and added
; with integer instructions to form the return value in rax.
global sym(vp8_get_mb_ss_mmx)
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): the shadow count is 7 although the prototype above takes a
    ; single argument - confirm whether that is intentional.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                      ; 8 bytes of scratch for the mm4 spill
    ; end prolog

    mov         rax, arg(0)                 ;src_ptr
    mov         rcx, 16                     ; iterations remaining
    pxor        mm4, mm4                    ; mm4 = { 0, 0 } partial sums

.next_row:
    movq        mm0, [rax]                  ; 4 shorts per register
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]
    pmaddwd     mm0, mm0                    ; square each word, add adjacent pairs
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32                     ; advance by 16 shorts
    dec         rcx
    ; dec leaves CF untouched, so the loop must branch on ZF alone; the
    ; previous "ja" also tested the carry left behind by the add above.
    jnz         .next_row
    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||
|
|
||
|
|
||
|
;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks eight rows of eight bytes from src_ptr and ref_ptr. For the
; differences (src - ref) it stores
;   *SSE = sum of squared differences
;   *Sum = sum of differences
; and returns 0.
;
; Register roles inside the row loop:
;   rax = src row, rcx = source_stride, rbx = ref row, rdx = recon_stride
;   mm5 = four 16-bit running sums of differences
;   mm6 = zero (used to widen bytes to words)
;   mm7 = two 32-bit running sums of squared differences
global sym(vp8_get8x8var_mmx)
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; [rsp] = SSE spill, [rsp+8] = sum spill
    ; end prolog

    pxor        mm7, mm7                    ; squared-difference accumulator
    pxor        mm6, mm6                    ; constant zero for unpacking
    pxor        mm5, mm5                    ; difference accumulator

    movsxd      rdx, dword ptr arg(3)       ;[recon_stride]
    movsxd      rcx, dword ptr arg(1)       ;[source_stride]
    mov         rbx, arg(2)                 ;[ref_ptr]
    mov         rax, arg(0)                 ;[src_ptr]

    ; One expansion per row; all eight rows are emitted inline.
%rep 8
    movq        mm1, [rbx]                  ; eight reference bytes
    movq        mm0, [rax]                  ; eight source bytes
    movq        mm3, mm1
    movq        mm2, mm0

    punpcklbw   mm1, mm6                    ; low four bytes -> words
    punpcklbw   mm0, mm6
    punpckhbw   mm3, mm6                    ; high four bytes -> words
    punpckhbw   mm2, mm6

    psubsw      mm0, mm1                    ; src - ref, low half
    psubsw      mm2, mm3                    ; src - ref, high half

    paddw       mm5, mm0                    ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                    ; square and pair-add
    pmaddwd     mm2, mm2
    paddd       mm7, mm0                    ; accumulate squares
    paddd       mm7, mm2

    add         rax, rcx                    ; next source row
    add         rbx, rdx                    ; next reference row
%endrep

    ; Spill both accumulators and finish the reduction in integer registers.
    movq        QWORD PTR [rsp], mm7
    movq        QWORD PTR [rsp+8], mm5

    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                    ;XXSum

    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    add         rdx, rcx
    movsx       rcx, WORD PTR [rsp+12]
    add         rdx, rcx
    movsx       rcx, WORD PTR [rsp+14]
    add         rdx, rcx                    ;XSum

    mov         rsi, arg(4)                 ;SSE
    mov         rdi, arg(5)                 ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                    ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||
|
|
||
|
|
||
|
|
||
|
;unsigned int
;vp8_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks four rows of four bytes from src_ptr and ref_ptr. For the
; differences (src - ref) it stores
;   *SSE = sum of squared differences
;   *Sum = sum of differences
; and returns 0.
;
; Each row is fetched with a 4-byte movd. Only the low four bytes of a row
; are consumed by punpcklbw, so the earlier 8-byte movq loads read four bytes
; past every row without affecting the result.
;
; Register roles inside the row loop:
;   rax = src row, rcx = source_stride, rbx = ref row, rdx = recon_stride
;   mm5 = four 16-bit running sums of differences
;   mm6 = zero (used to widen bytes to words)
;   mm7 = two 32-bit running sums of squared differences
global sym(vp8_get4x4var_mmx)
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; [rsp] = SSE spill, [rsp+8] = sum spill
    ; end prolog

    pxor        mm5, mm5                    ; difference accumulator
    pxor        mm6, mm6                    ; constant zero for unpacking
    pxor        mm7, mm7                    ; squared-difference accumulator

    mov         rax, arg(0)                 ;[src_ptr]
    mov         rbx, arg(2)                 ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)       ;[source_stride]
    movsxd      rdx, dword ptr arg(3)       ;[recon_stride]

    ; One expansion per row; all four rows are emitted inline.
%rep 4
    movd        mm0, [rax]                  ; four source bytes
    movd        mm1, [rbx]                  ; four reference bytes
    punpcklbw   mm0, mm6                    ; widen to words
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1                    ; src - ref
    paddw       mm5, mm0                    ; accumulate differences
    pmaddwd     mm0, mm0                    ; square and pair-add
    paddd       mm7, mm0                    ; accumulate squares
    add         rbx, rdx                    ; next reference row
    add         rax, rcx                    ; next source row
%endrep

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5      ; spill difference sums
    movq        QWORD PTR [rsp], mm7        ; spill squared sums
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                    ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                    ;XXSum
    mov         rsi, arg(4)                 ;SSE
    mov         rdi, arg(5)                 ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                    ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||
|
|
||
|
|
||
|
|
||
|
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns the sum of squared differences (src - ref) over four rows of four
; bytes each.
;
; Register roles:
;   rax = src row, rcx = source_stride, rbx = ref row, rdx = recon_stride
;   mm6 = zero (used to widen bytes to words)
;   mm7 = two 32-bit running sums of squared differences
global sym(vp8_get4x4sse_cs_mmx)
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        mm7, mm7                    ; squared-difference accumulator
    pxor        mm6, mm6                    ; constant zero for unpacking

    movsxd      rdx, dword ptr arg(3)       ;[recon_stride]
    movsxd      rcx, dword ptr arg(1)       ;[source_stride]
    mov         rbx, arg(2)                 ;[ref_ptr]
    mov         rax, arg(0)                 ;[src_ptr]

    ; Rows 1-3: process a row, then step both pointers to the next one.
%rep 3
    movd        mm1, [rbx]                  ; four reference bytes
    movd        mm0, [rax]                  ; four source bytes
    punpcklbw   mm1, mm6                    ; widen to words
    punpcklbw   mm0, mm6
    psubsw      mm0, mm1                    ; src - ref
    pmaddwd     mm0, mm0                    ; square and pair-add
    paddd       mm7, mm0                    ; accumulate squares
    add         rax, rcx                    ; next source row
    add         rbx, rdx                    ; next reference row
%endrep

    ; Row 4: last row, no pointer update needed afterwards.
    movd        mm1, [rbx]
    movd        mm0, [rax]
    punpcklbw   mm1, mm6
    punpcklbw   mm0, mm6
    psubsw      mm0, mm1
    pmaddwd     mm0, mm0
    paddd       mm7, mm0

    ; Fold the high partial sum onto the low one; the total ends up in the
    ; low dword, which is what the 32-bit return value exposes.
    movq        mm1, mm7
    psrlq       mm1, 32
    paddd       mm7, mm1
    movd        rax, mm7

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||
|
|
||
|
%define mmx_filter_shift 7
|
||
|
|
||
|
;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal filter (HFilter) and then a two-tap vertical
; filter (VFilter) to a 4-pixel-wide strip of ref_ptr, producing four output
; rows. Each pass adds mmx_bi_rd and shifts right by mmx_filter_shift.
; The filtered rows are compared against four rows of src_ptr:
;   *sum        = sum of (filtered - src)
;   *sumsquared = sum of (filtered - src)^2
;
; Five rows of ref_ptr are read (one before the loop, one per iteration), and
; each row load touches bytes [0..4] because of the [rsi+1] tap.
; Each filter table is read as two groups of four words: [p] and [p+8].
;
; Register roles:
;   rax = HFilter, rdx = VFilter, rsi = ref row, rdi = src row, rcx = rows left
;   r8/r9 = ref/src strides (64-bit builds only, loaded once before the loop)
;   mm0 = zero, mm5 = previous horizontally filtered row
;   mm6 = four 16-bit difference sums, mm7 = two 32-bit squared sums
global sym(vp8_filter_block2d_bil4x4_var_mmx)
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                     ; reserved scratch; not referenced below
    ; end prolog

    pxor        mm6, mm6                    ; difference accumulator
    pxor        mm7, mm7                    ; squared-difference accumulator

    mov         rax, arg(4)                 ;HFilter
    mov         rdx, arg(5)                 ;VFilter

    mov         rsi, arg(0)                 ;ref_ptr
    mov         rdi, arg(2)                 ;src_ptr

    mov         rcx, 4                      ; output rows
    pxor        mm0, mm0                    ; constant zero for unpacking

    ; Horizontal pass for the first reference row; kept in mm5 as the
    ; "previous row" for the first vertical pass.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]

    punpcklbw   mm1, mm0
    pmullw      mm1, [rax]

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]

    paddw       mm1, mm3
    paddw       mm1, [mmx_bi_rd GLOBAL]

    psraw       mm1, mmx_filter_shift
    movq        mm5, mm1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
%else
    ; Both strides are loop-invariant: load them once here.
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
    add         rsi, r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:

    ; Horizontal pass on the current reference row.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]

    punpcklbw   mm1, mm0
    pmullw      mm1, [rax]

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]

    paddw       mm1, mm3
    paddw       mm1, [mmx_bi_rd GLOBAL]

    psraw       mm1, mmx_filter_shift
    movq        mm3, mm5                    ; previous filtered row

    movq        mm5, mm1                    ; current row becomes "previous"

    ; Vertical pass: previous row * VFilter[0..3] + current row * VFilter[4..7].
    pmullw      mm3, [rdx]

    pmullw      mm1, [rdx+8]
    paddw       mm1, mm3

    paddw       mm1, [mmx_bi_rd GLOBAL]
    psraw       mm1, mmx_filter_shift

    ; Difference against the source row.
    movd        mm3, [rdi]
    punpcklbw   mm3, mm0

    psubw       mm1, mm3
    paddw       mm6, mm1                    ; accumulate differences

    pmaddwd     mm1, mm1
    paddd       mm7, mm1                    ; accumulate squares

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)       ;src_pixels_per_line
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .filter_block2d_bil4x4_var_mmx_loop

    ; Reduce mm6: place each 16-bit sum in the upper half of a dword, add the
    ; dwords together, then arithmetic-shift right by 16 to recover the
    ; sign-extended total.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16

    ; Reduce mm7: add the high dword onto the low dword.
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rdi, arg(6)                 ;sum
    mov         rsi, arg(7)                 ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal filter (HFilter) and then a two-tap vertical
; filter (VFilter) to an 8-pixel-wide strip of ref_ptr, producing Height
; output rows. Each pass adds mmx_bi_rd and shifts right by mmx_filter_shift.
; The filtered rows are compared against Height rows of src_ptr:
;   *sum        = sum of (filtered - src)
;   *sumsquared = sum of (filtered - src)^2
;
; Height + 1 rows of ref_ptr are read (one before the loop, one per
; iteration), and each row load touches bytes [0..8] because of the [rsi+1]
; tap. Each filter table is read as two groups of four words: [p] and [p+8].
;
; Register roles:
;   rax = HFilter, rdx = VFilter, rsi = ref row, rdi = src row, rcx = rows left
;   r8/r9 = ref/src strides (64-bit builds only, loaded once before the loop)
;   mm0 = zero, mm5 = previous horizontally filtered row packed to bytes
;   mm6 = four 16-bit difference sums, mm7 = two 32-bit squared sums
global sym(vp8_filter_block2d_bil_var_mmx)
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                     ; reserved scratch; not referenced below
    ; end prolog

    pxor        mm6, mm6                    ; difference accumulator
    pxor        mm7, mm7                    ; squared-difference accumulator
    mov         rax, arg(5)                 ;HFilter

    mov         rdx, arg(6)                 ;VFilter
    mov         rsi, arg(0)                 ;ref_ptr

    mov         rdi, arg(2)                 ;src_ptr
    movsxd      rcx, dword ptr arg(4)       ;Height

    pxor        mm0, mm0                    ; constant zero for unpacking

    ; Horizontal pass for the first reference row; the result is packed to
    ; bytes in mm5 as the "previous row" for the first vertical pass.
    movq        mm1, [rsi]

    movq        mm3, [rsi+1]
    movq        mm2, mm1

    movq        mm4, mm3
    punpcklbw   mm1, mm0

    punpckhbw   mm2, mm0
    pmullw      mm1, [rax]

    pmullw      mm2, [rax]
    punpcklbw   mm3, mm0

    punpckhbw   mm4, mm0
    pmullw      mm3, [rax+8]

    pmullw      mm4, [rax+8]
    paddw       mm1, mm3

    paddw       mm2, mm4
    paddw       mm1, [mmx_bi_rd GLOBAL]

    psraw       mm1, mmx_filter_shift
    paddw       mm2, [mmx_bi_rd GLOBAL]

    psraw       mm2, mmx_filter_shift
    movq        mm5, mm1

    packuswb    mm5, mm2
%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
%else
    ; Both strides are loop-invariant: load them once here.
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
    add         rsi, r8
%endif

.filter_block2d_bil_var_mmx_loop:

    ; Horizontal pass on the current reference row (low half in mm1, high
    ; half in mm2).
    movq        mm1, [rsi]
    movq        mm3, [rsi+1]

    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpckhbw   mm2, mm0

    pmullw      mm1, [rax]
    pmullw      mm2, [rax]

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm3, [rax+8]
    pmullw      mm4, [rax+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [mmx_bi_rd GLOBAL]
    psraw       mm1, mmx_filter_shift

    paddw       mm2, [mmx_bi_rd GLOBAL]
    psraw       mm2, mmx_filter_shift

    ; Unpack the previous filtered row back to words.
    movq        mm3, mm5
    movq        mm4, mm5

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    ; Current row becomes the "previous row" for the next iteration.
    movq        mm5, mm1
    packuswb    mm5, mm2

    ; Vertical pass: previous row * VFilter[0..3] + current row * VFilter[4..7].
    pmullw      mm3, [rdx]
    pmullw      mm4, [rdx]

    pmullw      mm1, [rdx+8]
    pmullw      mm2, [rdx+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [mmx_bi_rd GLOBAL]
    paddw       mm2, [mmx_bi_rd GLOBAL]

    psraw       mm1, mmx_filter_shift
    psraw       mm2, mmx_filter_shift

    ; Difference against the source row.
    movq        mm3, [rdi]
    movq        mm4, mm3

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm3
    psubw       mm2, mm4

    paddw       mm6, mm1                    ; accumulate differences
    pmaddwd     mm1, mm1

    paddw       mm6, mm2
    pmaddwd     mm2, mm2

    paddd       mm7, mm1                    ; accumulate squares
    paddd       mm7, mm2

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)       ;src_pixels_per_line
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .filter_block2d_bil_var_mmx_loop

    ; Reduce mm6: place each 16-bit sum in the upper half of a dword, add the
    ; dwords together, then arithmetic-shift right by 16 to recover the
    ; sign-extended total.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16

    ; Reduce mm7: add the high dword onto the low dword.
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rdi, arg(7)                 ;sum
    mov         rsi, arg(8)                 ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||
|
|
||
|
;unsigned int vp8_get16x16pred_error_mmx
;(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride
;)
;
; Walks sixteen rows of sixteen bytes from src_ptr and ref_ptr and returns
;   SSE - ((Sum * Sum) >> 8)
; where Sum is the sum of (src - ref) and SSE the sum of (src - ref)^2.
;
; Register roles inside the loop:
;   rsi = src row, rax = src_stride, rdi = ref row, rdx = ref_stride
;   rcx = rows remaining
;   mm0 = zero (used to widen bytes to words)
;   mm6 = two 32-bit running sums of squared differences
;   mm7 = four 16-bit running sums of differences
global sym(vp8_get16x16pred_error_mmx)
sym(vp8_get16x16pred_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                     ; [rsp] = Sum, [rsp+4] = SSE
    ; end prolog

    movsxd      rdx, DWORD PTR arg(3)       ;[ref_stride]
    movsxd      rax, DWORD PTR arg(1)       ;[src_stride]

    mov         rdi, arg(2)                 ;[ref_ptr]
    mov         rsi, arg(0)                 ;[src_ptr]

    pxor        mm6, mm6                    ; squared-difference accumulator
    pxor        mm7, mm7                    ; difference accumulator
    pxor        mm0, mm0                    ; constant zero for unpacking

    mov         rcx, 16                     ; rows

.var16loop:

    ; Bytes 0..7 of the row.
    movq        mm2, [rdi]
    movq        mm1, [rsi]

    movq        mm4, mm2
    movq        mm3, mm1

    punpcklbw   mm2, mm0                    ; ref, low four bytes -> words
    punpckhbw   mm4, mm0                    ; ref, high four bytes -> words
    punpcklbw   mm1, mm0                    ; src, low four bytes -> words
    punpckhbw   mm3, mm0                    ; src, high four bytes -> words

    psubw       mm1, mm2                    ; src - ref
    psubw       mm3, mm4

    paddw       mm7, mm1                    ; accumulate differences
    paddw       mm7, mm3

    pmaddwd     mm1, mm1                    ; square and pair-add
    pmaddwd     mm3, mm3

    paddd       mm6, mm1                    ; accumulate squares
    paddd       mm6, mm3

    ; Bytes 8..15 of the row.
    movq        mm2, [rdi+8]
    movq        mm1, [rsi+8]

    movq        mm4, mm2
    movq        mm3, mm1

    punpcklbw   mm2, mm0
    punpckhbw   mm4, mm0
    punpcklbw   mm1, mm0
    punpckhbw   mm3, mm0

    psubw       mm1, mm2
    psubw       mm3, mm4

    paddw       mm7, mm1
    paddw       mm7, mm3

    pmaddwd     mm1, mm1
    pmaddwd     mm3, mm3

    paddd       mm6, mm1
    paddd       mm6, mm3

    add         rdi, rdx                    ; next reference row
    add         rsi, rax                    ; next source row

    sub         rcx, 1
    jnz         .var16loop

    ; SSE: fold the high dword of mm6 onto the low dword (result in mm2).
    movq        mm2, mm6
    psrlq       mm6, 32
    paddd       mm2, mm6

    ; Sum: move each 16-bit lane into the upper half of a dword, sign-extend
    ; with an arithmetic shift, then add all four lanes together.
    pxor        mm5, mm5
    pxor        mm6, mm6
    punpckhwd   mm5, mm7
    punpcklwd   mm6, mm7
    psrad       mm5, 16
    psrad       mm6, 16
    paddd       mm6, mm5

    movq        mm7, mm6
    psrlq       mm7, 32
    paddd       mm6, mm7

    movd        DWORD PTR [rsp+4], mm2      ;SSE
    movd        DWORD PTR [rsp], mm6        ;Sum

    ; return (SSE-((Sum*Sum)>>8));
    movsxd      rax, dword ptr [rsp + 4]
    movsxd      rdx, dword ptr [rsp]
    imul        rdx, rdx
    sar         rdx, 8
    sub         rax, rdx

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||
|
|
||
|
|
||
|
|
||
|
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
; Four identical words, added to each 16-bit lane by the filter routines
; before they shift right by mmx_filter_shift.
align 16
mmx_bi_rd:
    dw 64, 64, 64, 64
|