; ; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. ; ; Use of this source code is governed by a BSD-style license and patent ; grant that can be found in the LICENSE file in the root of the source ; tree. All contributing project authors may be found in the AUTHORS ; file in the root of the source tree. ; %include "vpx_ports/x86_abi_support.asm" global sym(vp8_sad16x16_mmx) global sym(vp8_sad8x16_mmx) global sym(vp8_sad8x8_mmx) global sym(vp8_sad4x4_mmx) global sym(vp8_sad16x8_mmx) %idefine QWORD ;unsigned int vp8_sad16x16_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) sym(vp8_sad16x16_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 push rsi push rdi ; end prolog mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;ref_ptr movsxd rax, dword ptr arg(1) ;src_stride movsxd rdx, dword ptr arg(3) ;ref_stride lea rcx, [rsi+rax*8] lea rcx, [rcx+rax*8] pxor mm7, mm7 pxor mm6, mm6 x16x16sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm2, QWORD PTR [rsi+8] movq mm1, QWORD PTR [rdi] movq mm3, QWORD PTR [rdi+8] movq mm4, mm0 movq mm5, mm2 psubusb mm0, mm1 psubusb mm1, mm4 psubusb mm2, mm3 psubusb mm3, mm5 por mm0, mm1 por mm2, mm3 movq mm1, mm0 movq mm3, mm2 punpcklbw mm0, mm6 punpcklbw mm2, mm6 punpckhbw mm1, mm6 punpckhbw mm3, mm6 paddw mm0, mm2 paddw mm1, mm3 lea rsi, [rsi+rax] add rdi, rdx paddw mm7, mm0 paddw mm7, mm1 cmp rsi, rcx jne x16x16sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 punpckhwd mm7, mm6 paddw mm0, mm7 movq mm7, mm0 psrlq mm0, 32 paddw mm7, mm0 movd rax, mm7 pop rdi pop rsi mov rsp, rbp ; begin epilog UNSHADOW_ARGS pop rbp ret ;unsigned int vp8_sad8x16_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) sym(vp8_sad8x16_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 push rsi push rdi ; end prolog mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;ref_ptr movsxd rax, dword ptr arg(1) ;src_stride movsxd rdx, dword ptr arg(3) ;ref_stride lea rcx, [rsi+rax*8] lea rcx, [rcx+rax*8] pxor mm7, mm7 pxor mm6, mm6 x8x16sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] movq mm2, mm0 psubusb mm0, mm1 psubusb mm1, mm2 por mm0, mm1 movq mm2, mm0 punpcklbw mm0, mm6 punpckhbw mm2, mm6 lea rsi, [rsi+rax] add rdi, rdx paddw mm7, mm0 paddw mm7, mm2 cmp rsi, rcx jne x8x16sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 punpckhwd mm7, mm6 paddw mm0, mm7 movq mm7, mm0 psrlq mm0, 32 paddw mm7, mm0 movd rax, mm7 pop rdi pop rsi mov rsp, rbp ; begin epilog UNSHADOW_ARGS pop rbp ret ;unsigned int vp8_sad8x8_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) sym(vp8_sad8x8_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 push rsi push rdi ; end prolog mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;ref_ptr movsxd rax, dword ptr arg(1) ;src_stride movsxd rdx, dword ptr arg(3) ;ref_stride lea rcx, [rsi+rax*8] pxor mm7, mm7 pxor mm6, mm6 x8x8sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] movq mm2, mm0 psubusb mm0, mm1 psubusb mm1, mm2 por mm0, mm1 movq mm2, mm0 punpcklbw mm0, mm6 punpckhbw mm2, mm6 paddw mm0, mm2 lea rsi, [rsi+rax] add rdi, rdx paddw mm7, mm0 cmp rsi, rcx jne x8x8sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 punpckhwd mm7, mm6 paddw mm0, mm7 movq mm7, mm0 psrlq mm0, 32 paddw mm7, mm0 movd rax, mm7 pop rdi pop rsi mov rsp, rbp ; begin epilog UNSHADOW_ARGS pop rbp ret ;unsigned int vp8_sad4x4_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) sym(vp8_sad4x4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 push rsi push rdi ; end prolog mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;ref_ptr movsxd rax, dword ptr arg(1) ;src_stride movsxd rdx, dword ptr arg(3) ;ref_stride movd mm0, QWORD PTR [rsi] movd mm1, QWORD PTR [rdi] movd mm2, QWORD PTR [rsi+rax] movd mm3, QWORD PTR [rdi+rdx] punpcklbw mm0, mm2 punpcklbw mm1, mm3 movq mm2, mm0 psubusb mm0, mm1 psubusb mm1, mm2 por mm0, mm1 movq mm2, mm0 pxor mm3, mm3 punpcklbw mm0, mm3 punpckhbw mm2, mm3 paddw mm0, mm2 lea rsi, [rsi+rax*2] lea rdi, [rdi+rdx*2] movd mm4, QWORD PTR [rsi] movd mm5, QWORD PTR [rdi] movd mm6, QWORD PTR [rsi+rax] movd mm7, QWORD PTR [rdi+rdx] punpcklbw mm4, mm6 punpcklbw mm5, mm7 movq mm6, mm4 psubusb mm4, mm5 psubusb mm5, mm6 por mm4, mm5 movq mm5, mm4 punpcklbw mm4, mm3 punpckhbw mm5, mm3 paddw mm4, mm5 paddw mm0, mm4 movq mm1, mm0 punpcklwd mm0, mm3 punpckhwd mm1, mm3 paddw mm0, mm1 movq mm1, mm0 psrlq mm0, 32 paddw mm0, mm1 movd rax, mm0 pop rdi pop rsi mov rsp, rbp ; begin epilog UNSHADOW_ARGS pop rbp ret ;unsigned int vp8_sad16x8_mmx( ; unsigned char *src_ptr, ; int src_stride, ; unsigned char *ref_ptr, ; int ref_stride) sym(vp8_sad16x8_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 push rsi push rdi ; end prolog mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;ref_ptr movsxd rax, dword ptr arg(1) ;src_stride movsxd rdx, dword ptr arg(3) ;ref_stride lea rcx, [rsi+rax*8] pxor mm7, mm7 pxor mm6, mm6 x16x8sad_mmx_loop: movq mm0, [rsi] movq mm1, [rdi] movq mm2, [rsi+8] movq mm3, [rdi+8] movq mm4, mm0 movq mm5, mm2 psubusb mm0, mm1 psubusb mm1, mm4 psubusb mm2, mm3 psubusb mm3, mm5 por mm0, mm1 por mm2, mm3 movq mm1, mm0 movq mm3, mm2 punpcklbw mm0, mm6 punpckhbw mm1, mm6 punpcklbw mm2, mm6 punpckhbw mm3, mm6 paddw mm0, mm2 paddw mm1, mm3 paddw mm0, mm1 lea rsi, [rsi+rax] add rdi, rdx paddw mm7, mm0 cmp rsi, rcx jne x16x8sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 punpckhwd mm7, mm6 paddw mm0, mm7 movq mm7, mm0 psrlq mm0, 32 paddw mm7, mm0 movd rax, mm7 pop rdi pop rsi mov rsp, rbp ; begin epilog UNSHADOW_ARGS pop rbp ret