vpx/vp8/encoder/x86/sad_sse3.asm
John Koleszar 94c52e4da8 cosmetics: trim trailing whitespace
When the license headers were updated, they accidentally contained
trailing whitespace, so unfortunately we have to touch all the files
again.

Change-Id: I236c05fade06589e417179c0444cb39b09e4200d
2010-06-18 13:06:11 -04:00

941 lines
25 KiB
NASM

;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
; Define QWORD (case-insensitively) as an empty macro. The MASM-style
; "QWORD PTR [mem]" operands used throughout this file therefore carry no
; size keyword; each access is sized by its instruction and register operand
; (for example movd still moves 4 bytes and movdqa/lddqu still move 16).
; NOTE(review): PTR is presumably neutralised the same way by the include
; above -- confirm in x86_abi_support.asm.
%idefine QWORD
;-----------------------------------------------------------------------
; PROCESS_16X2X3 is_first
;
; Accumulates the SAD of two 16-byte source rows against three reference
; candidates located at byte offsets 0, +1 and +2 from rdi.
;
;   %1 != 0 : first row pair; the row-0 SADs are written straight into the
;             accumulators, so they need no prior zeroing.
;   %1 == 0 : later row pairs; the row-0 SADs are added to the accumulators.
;
; In:    rsi = source row pointer (read with movdqa, so every source row
;              must be 16-byte aligned), rax = source stride
;        rdi = reference row pointer (unaligned loads), rdx = reference stride
; Out:   xmm5 / xmm6 / xmm7 = running SADs for offsets 0 / +1 / +2, each
;        kept as two partial sums (one per 64-bit half, as psadbw produces)
;        rsi, rdi advanced by two rows
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
    ; ---- row 0 ----
    movdqa      xmm0, [rsi]                 ; source row (shared by both arms)
%if %1
    lddqu       xmm5, [rdi]
    lddqu       xmm6, [rdi+1]
    lddqu       xmm7, [rdi+2]

    psadbw      xmm5, xmm0                  ; accumulators start at row-0 SAD
    psadbw      xmm6, xmm0
    psadbw      xmm7, xmm0
%else
    lddqu       xmm1, [rdi]
    lddqu       xmm2, [rdi+1]
    lddqu       xmm3, [rdi+2]

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0

    paddw       xmm5, xmm1
    paddw       xmm6, xmm2
    paddw       xmm7, xmm3
%endif

    ; ---- row 1 ----
    movdqa      xmm0, [rsi+rax]
    lddqu       xmm1, [rdi+rdx]
    lddqu       xmm2, [rdi+rdx+1]
    lddqu       xmm3, [rdi+rdx+2]

    lea         rsi,  [rsi+rax*2]           ; step both pointers two rows down
    lea         rdi,  [rdi+rdx*2]

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0

    paddw       xmm5, xmm1
    paddw       xmm6, xmm2
    paddw       xmm7, xmm3
%endmacro
;-----------------------------------------------------------------------
; PROCESS_8X2X3 is_first
;
; MMX counterpart of the 16-wide macro: accumulates the SAD of two 8-byte
; source rows against three reference candidates at byte offsets 0, +1 and
; +2 from rdi.
;
;   %1 != 0 : first row pair; row-0 SADs initialise the accumulators.
;   %1 == 0 : later row pairs; row-0 SADs are added to the accumulators.
;
; In:    rsi = source row pointer, rax = source stride
;        rdi = reference row pointer, rdx = reference stride
; Out:   mm5 / mm6 / mm7 = running SADs for offsets 0 / +1 / +2
;        rsi, rdi advanced by two rows
; Clobb: mm0-mm3
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 1
    ; ---- row 0 ----
    movq        mm0, [rsi]                  ; source row (shared by both arms)
%if %1
    movq        mm5, [rdi]
    movq        mm6, [rdi+1]
    movq        mm7, [rdi+2]

    psadbw      mm5, mm0                    ; accumulators start at row-0 SAD
    psadbw      mm6, mm0
    psadbw      mm7, mm0
%else
    movq        mm1, [rdi]
    movq        mm2, [rdi+1]
    movq        mm3, [rdi+2]

    psadbw      mm1, mm0
    psadbw      mm2, mm0
    psadbw      mm3, mm0

    paddw       mm5, mm1
    paddw       mm6, mm2
    paddw       mm7, mm3
%endif

    ; ---- row 1 ----
    movq        mm0, [rsi+rax]
    movq        mm1, [rdi+rdx]
    movq        mm2, [rdi+rdx+1]
    movq        mm3, [rdi+rdx+2]

    lea         rsi, [rsi+rax*2]            ; step both pointers two rows down
    lea         rdi, [rdi+rdx*2]

    psadbw      mm1, mm0
    psadbw      mm2, mm0
    psadbw      mm3, mm0

    paddw       mm5, mm1
    paddw       mm6, mm2
    paddw       mm7, mm3
%endmacro
;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES table, dst0, dst1, dst2, dst3
;
; Loads four consecutive register-sized entries starting at [table] into
; dst0..dst3. The entries are fetched in order and dst3 is written last, so
; dst3 may be the same register as table (the callers in this file pass rdi
; for both). dst0..dst2 must not alias table.
; NOTE(review): REG_SZ_BYTES is not defined in this file; presumably it
; comes from x86_abi_support.asm -- confirm.
;-----------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
    mov         %2, [%1 + REG_SZ_BYTES*0]
    mov         %3, [%1 + REG_SZ_BYTES*1]
    mov         %4, [%1 + REG_SZ_BYTES*2]
    mov         %5, [%1 + REG_SZ_BYTES*3]   ; last: may overwrite the table pointer
%endmacro
;-----------------------------------------------------------------------
; PROCESS_16X2X4 is_first
;
; Accumulates the SAD of two 16-byte source rows against four independent
; reference blocks.
;
;   %1 != 0 : first row pair; row-0 SADs initialise the accumulators.
;   %1 == 0 : later row pairs; row-0 SADs are added to the accumulators.
;
; In:    rsi = source row pointer (read with movdqa, so every source row
;              must be 16-byte aligned), rax = source stride
;        rcx, rdx, rbx, rdi = the four reference row pointers (unaligned
;              loads), rbp = reference stride shared by all four
; Out:   xmm4 / xmm5 / xmm6 / xmm7 = running SADs for the references in
;        rcx / rdx / rbx / rdi, each kept as two partial sums (one per
;        64-bit half, as psadbw produces)
;        rsi, rcx, rdx, rbx, rdi advanced by two rows
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X4 1
    ; ---- row 0 ----
    movdqa      xmm0, [rsi]                 ; source row (shared by both arms)
%if %1
    lddqu       xmm4, [rcx]
    lddqu       xmm5, [rdx]
    lddqu       xmm6, [rbx]
    lddqu       xmm7, [rdi]

    psadbw      xmm4, xmm0                  ; accumulators start at row-0 SAD
    psadbw      xmm5, xmm0
    psadbw      xmm6, xmm0
    psadbw      xmm7, xmm0
%else
    lddqu       xmm1, [rcx]
    lddqu       xmm2, [rdx]
    lddqu       xmm3, [rbx]

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0

    paddw       xmm4, xmm1
    lddqu       xmm1, [rdi]                 ; xmm1 recycled for the 4th reference
    paddw       xmm5, xmm2
    paddw       xmm6, xmm3

    psadbw      xmm1, xmm0
    paddw       xmm7, xmm1
%endif

    ; ---- row 1 ----
    movdqa      xmm0, [rsi+rax]
    lddqu       xmm1, [rcx+rbp]
    lddqu       xmm2, [rdx+rbp]
    lddqu       xmm3, [rbx+rbp]

    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0

    paddw       xmm4, xmm1
    lddqu       xmm1, [rdi+rbp]             ; xmm1 recycled for the 4th reference
    paddw       xmm5, xmm2
    paddw       xmm6, xmm3

    lea         rsi,  [rsi+rax*2]           ; step every pointer two rows down
    lea         rcx,  [rcx+rbp*2]
    lea         rdx,  [rdx+rbp*2]
    lea         rbx,  [rbx+rbp*2]
    lea         rdi,  [rdi+rbp*2]

    psadbw      xmm1, xmm0
    paddw       xmm7, xmm1
%endmacro
;-----------------------------------------------------------------------
; PROCESS_8X2X4 is_first
;
; MMX counterpart of the 16-wide macro: accumulates the SAD of two 8-byte
; source rows against four independent reference blocks.
;
;   %1 != 0 : first row pair; row-0 SADs initialise the accumulators.
;   %1 == 0 : later row pairs; row-0 SADs are added to the accumulators.
;
; In:    rsi = source row pointer, rax = source stride
;        rcx, rdx, rbx, rdi = the four reference row pointers,
;        rbp = reference stride shared by all four
; Out:   mm4 / mm5 / mm6 / mm7 = running SADs for the references in
;        rcx / rdx / rbx / rdi
;        rsi, rcx, rdx, rbx, rdi advanced by two rows
; Clobb: mm0-mm3
;-----------------------------------------------------------------------
%macro PROCESS_8X2X4 1
    ; ---- row 0 ----
    movq        mm0, [rsi]                  ; source row (shared by both arms)
%if %1
    movq        mm4, [rcx]
    movq        mm5, [rdx]
    movq        mm6, [rbx]
    movq        mm7, [rdi]

    psadbw      mm4, mm0                    ; accumulators start at row-0 SAD
    psadbw      mm5, mm0
    psadbw      mm6, mm0
    psadbw      mm7, mm0
%else
    movq        mm1, [rcx]
    movq        mm2, [rdx]
    movq        mm3, [rbx]

    psadbw      mm1, mm0
    psadbw      mm2, mm0
    psadbw      mm3, mm0

    paddw       mm4, mm1
    movq        mm1, [rdi]                  ; mm1 recycled for the 4th reference
    paddw       mm5, mm2
    paddw       mm6, mm3

    psadbw      mm1, mm0
    paddw       mm7, mm1
%endif

    ; ---- row 1 ----
    movq        mm0, [rsi+rax]
    movq        mm1, [rcx+rbp]
    movq        mm2, [rdx+rbp]
    movq        mm3, [rbx+rbp]

    psadbw      mm1, mm0
    psadbw      mm2, mm0
    psadbw      mm3, mm0

    paddw       mm4, mm1
    movq        mm1, [rdi+rbp]              ; mm1 recycled for the 4th reference
    paddw       mm5, mm2
    paddw       mm6, mm3

    lea         rsi, [rsi+rax*2]            ; step every pointer two rows down
    lea         rcx, [rcx+rbp*2]
    lea         rdx, [rdx+rbp*2]
    lea         rbx, [rbx+rbp*2]
    lea         rdi, [rdi+rbp*2]

    psadbw      mm1, mm0
    paddw       mm7, mm1
%endmacro
;-----------------------------------------------------------------------
; void vp8_sad16x16x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Computes three 16x16 sums of absolute differences between the source
; block and the reference block taken at byte offsets 0, +1 and +2 from
; ref_ptr, and stores them as 32-bit values in results[0], results[1] and
; results[2].
;
; Source rows are read with movdqa, so src_ptr and src_stride must keep
; every source row 16-byte aligned. Reference rows use unaligned loads.
;
; Clobb: rax, rdx, xmm0-xmm5, flags. rsi and rdi are saved and restored.
;        xmm6/xmm7 are used as accumulators; they are callee-saved in the
;        Microsoft x64 convention, so they are preserved when assembling
;        for that target.
; NOTE(review): the Win64 guard below tests the "x64" output format, which
; is presumably what x86_abi_support.asm keys on -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
%ifidn __OUTPUT_FORMAT__,x64
    ; Win64: preserve the non-volatile xmm6/xmm7 before using them.
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp+16], xmm7
%endif
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;ref_ptr

    movsxd      rax, dword ptr arg(1)       ;src_stride
    movsxd      rdx, dword ptr arg(3)       ;ref_stride

    ; 8 row pairs = 16 rows; the first pair initialises xmm5/xmm6/xmm7.
    PROCESS_16X2X3 1
%rep 7
    PROCESS_16X2X3 0
%endrep

    mov         rdi, arg(4)                 ;results

    ; Each accumulator holds two partial sums (low and high qword).
    ; Fold high into low and store the 32-bit total.
    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rdi], xmm0                 ; offset 0

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rdi+4], xmm0               ; offset +1

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rdi+8], xmm0               ; offset +2

    ; begin epilog
    pop         rdi
    pop         rsi
%ifidn __OUTPUT_FORMAT__,x64
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp+16]
    add         rsp, 32
%endif
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_sad16x8x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Computes three 16-wide, 8-row sums of absolute differences between the
; source block and the reference block taken at byte offsets 0, +1 and +2
; from ref_ptr, and stores them as 32-bit values in results[0..2].
;
; Source rows are read with movdqa, so src_ptr and src_stride must keep
; every source row 16-byte aligned. Reference rows use unaligned loads.
;
; Clobb: rax, rdx, xmm0-xmm5, flags. rsi and rdi are saved and restored.
;        xmm6/xmm7 are used as accumulators; they are callee-saved in the
;        Microsoft x64 convention, so they are preserved when assembling
;        for that target.
; NOTE(review): the Win64 guard below tests the "x64" output format, which
; is presumably what x86_abi_support.asm keys on -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
%ifidn __OUTPUT_FORMAT__,x64
    ; Win64: preserve the non-volatile xmm6/xmm7 before using them.
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp+16], xmm7
%endif
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;ref_ptr

    movsxd      rax, dword ptr arg(1)       ;src_stride
    movsxd      rdx, dword ptr arg(3)       ;ref_stride

    ; 4 row pairs = 8 rows; the first pair initialises xmm5/xmm6/xmm7.
    PROCESS_16X2X3 1
%rep 3
    PROCESS_16X2X3 0
%endrep

    mov         rdi, arg(4)                 ;results

    ; Each accumulator holds two partial sums (low and high qword).
    ; Fold high into low and store the 32-bit total.
    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rdi], xmm0                 ; offset 0

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rdi+4], xmm0               ; offset +1

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rdi+8], xmm0               ; offset +2

    ; begin epilog
    pop         rdi
    pop         rsi
%ifidn __OUTPUT_FORMAT__,x64
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp+16]
    add         rsp, 32
%endif
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_sad8x16x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Computes three 8-wide, 16-row sums of absolute differences between the
; source block and the reference block taken at byte offsets 0, +1 and +2
; from ref_ptr, and stores them as 32-bit values in results[0..2].
;
; Clobb: rax, rdx, mm0-mm3, mm5-mm7, flags. rsi and rdi are saved/restored.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably the caller clears the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_stride
    mov         rdi, arg(2)                 ;ref_ptr
    movsxd      rdx, dword ptr arg(3)       ;ref_stride

    ; 8 row pairs = 16 rows; the first pair initialises mm5/mm6/mm7.
    PROCESS_8X2X3 1
%rep 7
    PROCESS_8X2X3 0
%endrep

    mov         rdi, arg(4)                 ;results

    movd        [rdi],   mm5                ; offset 0
    movd        [rdi+4], mm6                ; offset +1
    movd        [rdi+8], mm7                ; offset +2

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_sad8x8x3_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Computes three 8x8 sums of absolute differences between the source block
; and the reference block taken at byte offsets 0, +1 and +2 from ref_ptr,
; and stores them as 32-bit values in results[0..2].
;
; Clobb: rax, rdx, mm0-mm3, mm5-mm7, flags. rsi and rdi are saved/restored.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably the caller clears the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_stride
    mov         rdi, arg(2)                 ;ref_ptr
    movsxd      rdx, dword ptr arg(3)       ;ref_stride

    ; 4 row pairs = 8 rows; the first pair initialises mm5/mm6/mm7.
    PROCESS_8X2X3 1
%rep 3
    PROCESS_8X2X3 0
%endrep

    mov         rdi, arg(4)                 ;results

    movd        [rdi],   mm5                ; offset 0
    movd        [rdi+4], mm6                ; offset +1
    movd        [rdi+8], mm7                ; offset +2

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;void vp8_sad4x4x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
;
; Computes three 4x4 sums of absolute differences between the source block
; and the reference block taken at byte offsets 0, +1 and +2 from ref_ptr,
; and stores them as 32-bit values in results[0..2].
;
; Two 4-byte rows are byte-interleaved into one 8-byte MMX register with
; punpcklbw so a single psadbw covers a row pair. Source and reference are
; interleaved the same way, so corresponding bytes still line up.
; QWORD is defined away at the top of this file, so each movd below moves
; 4 bytes despite the "QWORD PTR" spelling.
;
; Clobbers rax, rdx, mm0-mm7 and flags; rsi and rdi are saved and restored.
; NOTE(review): no emms is issued here; presumably the caller clears the
; MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad4x4x3_sse3)
sym(vp8_sad4x4x3_sse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;ref_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;ref_stride
; ---- rows 0 and 1 ----
movd mm0, QWORD PTR [rsi] ; src row 0
movd mm1, QWORD PTR [rdi] ; ref row 0, offset 0
movd mm2, QWORD PTR [rsi+rax] ; src row 1
movd mm3, QWORD PTR [rdi+rdx] ; ref row 1, offset 0
punpcklbw mm0, mm2 ; mm0 = src rows 0/1 interleaved
punpcklbw mm1, mm3 ; mm1 = ref rows 0/1 interleaved, offset 0
movd mm4, QWORD PTR [rdi+1] ; ref row 0, offset +1
movd mm5, QWORD PTR [rdi+2] ; ref row 0, offset +2
movd mm2, QWORD PTR [rdi+rdx+1] ; ref row 1, offset +1
movd mm3, QWORD PTR [rdi+rdx+2] ; ref row 1, offset +2
psadbw mm1, mm0 ; mm1 = SAD rows 0-1, offset 0
punpcklbw mm4, mm2
punpcklbw mm5, mm3
psadbw mm4, mm0 ; mm4 = SAD rows 0-1, offset +1
psadbw mm5, mm0 ; mm5 = SAD rows 0-1, offset +2
; ---- rows 2 and 3 ----
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movd mm0, QWORD PTR [rsi] ; src row 2
movd mm2, QWORD PTR [rdi] ; ref row 2, offset 0
movd mm3, QWORD PTR [rsi+rax] ; src row 3
movd mm6, QWORD PTR [rdi+rdx] ; ref row 3, offset 0
punpcklbw mm0, mm3 ; mm0 = src rows 2/3 interleaved
punpcklbw mm2, mm6 ; mm2 = ref rows 2/3 interleaved, offset 0
movd mm3, QWORD PTR [rdi+1] ; ref row 2, offset +1
movd mm7, QWORD PTR [rdi+2] ; ref row 2, offset +2
psadbw mm2, mm0
paddw mm1, mm2 ; mm1 = total SAD, offset 0
movd mm2, QWORD PTR [rdi+rdx+1] ; ref row 3, offset +1
movd mm6, QWORD PTR [rdi+rdx+2] ; ref row 3, offset +2
punpcklbw mm3, mm2
punpcklbw mm7, mm6
psadbw mm3, mm0
psadbw mm7, mm0
paddw mm3, mm4 ; mm3 = total SAD, offset +1
paddw mm7, mm5 ; mm7 = total SAD, offset +2
mov rdi, arg(4) ;Results
movd [rdi], mm1
movd [rdi+4], mm3
movd [rdi+8], mm7
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;-----------------------------------------------------------------------
; unsigned int vp8_sad16x16_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_err)
;
; 16x16 sum of absolute differences with early termination. Two rows are
; processed per loop iteration; before each pair the running total is
; compared (signed) with max_err, and as soon as it is greater the partial
; total is returned instead of the full SAD.
;
; Out:   rax = SAD, or the partial total that first exceeded max_err
; Clobb: rcx, rdx, mm0-mm5, mm7, flags. rbx, rsi, rdi are saved/restored.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably the caller clears the MMX state -- confirm.
;-----------------------------------------------------------------------
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;ref_ptr

    movsxd      rbx, dword ptr arg(1)       ;src_stride
    movsxd      rdx, dword ptr arg(3)       ;ref_stride

    lea         rcx, [rsi+rbx*8]
    lea         rcx, [rcx+rbx*8]            ; rcx = src_ptr + 16*src_stride (loop end)

    pxor        mm7, mm7                    ; mm7 = running SAD

vp8_sad16x16_sse3_loop:
    movd        rax, mm7
    ; max_err is a 32-bit int: compare only the low dword. The upper half of
    ; its 64-bit argument slot is not defined by the x86-64 calling
    ; conventions, so a full-width compare could read garbage. The running
    ; total always fits in eax (psadbw sums are 16-bit).
    cmp         eax, dword ptr arg(4)
    jg          vp8_sad16x16_early_exit

    movq        mm0, [rsi]                  ; src row 0, bytes 0-7
    movq        mm2, [rsi+8]                ; src row 0, bytes 8-15

    movq        mm1, [rdi]                  ; ref row 0, bytes 0-7
    movq        mm3, [rdi+8]                ; ref row 0, bytes 8-15

    movq        mm4, [rsi+rbx]              ; src row 1, bytes 0-7
    movq        mm5, [rdi+rdx]              ; ref row 1, bytes 0-7

    psadbw      mm0, mm1
    psadbw      mm2, mm3

    movq        mm1, [rsi+rbx+8]            ; src row 1, bytes 8-15
    movq        mm3, [rdi+rdx+8]            ; ref row 1, bytes 8-15

    psadbw      mm4, mm5
    psadbw      mm1, mm3

    lea         rsi, [rsi+rbx*2]
    lea         rdi, [rdi+rdx*2]

    paddw       mm0, mm2                    ; row 0 total
    paddw       mm4, mm1                    ; row 1 total

    paddw       mm7, mm0
    paddw       mm7, mm4

    cmp         rsi, rcx
    jne         vp8_sad16x16_sse3_loop

    movd        rax, mm7                    ; full SAD as return value

vp8_sad16x16_early_exit:
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_sad16x16x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Computes four 16x16 sums of absolute differences between one source
; block and four reference blocks, and stores them as 32-bit values in
; results[0..3]. ref_ptr_base is read as a table of four register-sized
; reference pointers; all four references share ref_stride.
;
; Source rows are read with movdqa, so src_ptr and src_stride must keep
; every source row 16-byte aligned. Reference rows use unaligned loads.
;
; Clobb: rax, rcx, rdx, xmm0-xmm5, flags. rsi, rdi, rbx are saved/restored.
;        xmm6/xmm7 are used as accumulators; they are callee-saved in the
;        Microsoft x64 convention, so they are preserved when assembling
;        for that target.
; NOTE(review): the Win64 guard below tests the "x64" output format, which
; is presumably what x86_abi_support.asm keys on -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x4d_sse3)
sym(vp8_sad16x16x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
%ifidn __OUTPUT_FORMAT__,x64
    ; Win64: preserve the non-volatile xmm6/xmm7 before using them.
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp+16], xmm7
%endif
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; rbp is borrowed as the reference stride inside the row loop, so the
    ; frame pointer is parked on the stack until arg() is needed again.
    push        rbp
    mov         rdi, arg(2)                 ; ref_ptr_base

    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi, arg(0)                 ;src_ptr

    movsxd      rbx, dword ptr arg(1)       ;src_stride
    movsxd      rbp, dword ptr arg(3)       ;ref_stride (arg() unusable from here)

    xchg        rbx, rax                    ; rbx = 3rd reference, rax = src_stride

    ; 8 row pairs = 16 rows; the first pair initialises xmm4..xmm7.
    PROCESS_16X2X4 1
%rep 7
    PROCESS_16X2X4 0
%endrep

    pop         rbp                         ; frame pointer back; arg() valid again
    mov         rdi, arg(4)                 ;results

    ; Each accumulator holds two partial sums (low and high qword).
    ; Fold high into low and store the 32-bit total.
    movq        xmm0, xmm4
    psrldq      xmm4, 8
    paddw       xmm0, xmm4
    movd        [rdi], xmm0

    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rdi+4], xmm0

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rdi+8], xmm0

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rdi+12], xmm0

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
%ifidn __OUTPUT_FORMAT__,x64
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp+16]
    add         rsp, 32
%endif
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_sad16x8x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Computes four 16-wide, 8-row sums of absolute differences between one
; source block and four reference blocks, and stores them as 32-bit values
; in results[0..3]. ref_ptr_base is read as a table of four register-sized
; reference pointers; all four references share ref_stride.
;
; Source rows are read with movdqa, so src_ptr and src_stride must keep
; every source row 16-byte aligned. Reference rows use unaligned loads.
;
; Clobb: rax, rcx, rdx, xmm0-xmm5, flags. rsi, rdi, rbx are saved/restored.
;        xmm6/xmm7 are used as accumulators; they are callee-saved in the
;        Microsoft x64 convention, so they are preserved when assembling
;        for that target.
; NOTE(review): the Win64 guard below tests the "x64" output format, which
; is presumably what x86_abi_support.asm keys on -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x4d_sse3)
sym(vp8_sad16x8x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
%ifidn __OUTPUT_FORMAT__,x64
    ; Win64: preserve the non-volatile xmm6/xmm7 before using them.
    sub         rsp, 32
    movdqu      [rsp], xmm6
    movdqu      [rsp+16], xmm7
%endif
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; rbp is borrowed as the reference stride inside the row loop, so the
    ; frame pointer is parked on the stack until arg() is needed again.
    push        rbp
    mov         rdi, arg(2)                 ; ref_ptr_base

    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi, arg(0)                 ;src_ptr

    movsxd      rbx, dword ptr arg(1)       ;src_stride
    movsxd      rbp, dword ptr arg(3)       ;ref_stride (arg() unusable from here)

    xchg        rbx, rax                    ; rbx = 3rd reference, rax = src_stride

    ; 4 row pairs = 8 rows; the first pair initialises xmm4..xmm7.
    PROCESS_16X2X4 1
%rep 3
    PROCESS_16X2X4 0
%endrep

    pop         rbp                         ; frame pointer back; arg() valid again
    mov         rdi, arg(4)                 ;results

    ; Each accumulator holds two partial sums (low and high qword).
    ; Fold high into low and store the 32-bit total.
    movq        xmm0, xmm4
    psrldq      xmm4, 8
    paddw       xmm0, xmm4
    movd        [rdi], xmm0

    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rdi+4], xmm0

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rdi+8], xmm0

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rdi+12], xmm0

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
%ifidn __OUTPUT_FORMAT__,x64
    movdqu      xmm6, [rsp]
    movdqu      xmm7, [rsp+16]
    add         rsp, 32
%endif
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_sad8x16x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Computes four 8-wide, 16-row sums of absolute differences between one
; source block and four reference blocks, and stores them as 32-bit values
; in results[0..3]. The third argument is read as a table of four
; register-sized reference pointers; all four references share ref_stride.
;
; Clobb: rax, rcx, rdx, mm0-mm7, flags. rsi, rdi, rbx are saved/restored.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably the caller clears the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad8x16x4d_sse3)
sym(vp8_sad8x16x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; rbp is borrowed as the reference stride inside the row loop, so the
    ; frame pointer is parked on the stack until arg() is needed again.
    push        rbp
    mov         rdi, arg(2)                 ; ref_ptr_base

    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi, arg(0)                 ;src_ptr

    movsxd      rbx, dword ptr arg(1)       ;src_stride
    movsxd      rbp, dword ptr arg(3)       ;ref_stride (arg() unusable from here)

    xchg        rbx, rax                    ; rbx = 3rd reference, rax = src_stride

    ; 8 row pairs = 16 rows; the first pair initialises mm4..mm7.
    PROCESS_8X2X4 1
%rep 7
    PROCESS_8X2X4 0
%endrep

    pop         rbp                         ; frame pointer back; arg() valid again
    mov         rdi, arg(4)                 ;results

    movd        [rdi],    mm4
    movd        [rdi+4],  mm5
    movd        [rdi+8],  mm6
    movd        [rdi+12], mm7

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_sad8x8x4d_sse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr_base,
;     int            ref_stride,
;     int           *results)
;
; Computes four 8x8 sums of absolute differences between one source block
; and four reference blocks, and stores them as 32-bit values in
; results[0..3]. The third argument is read as a table of four
; register-sized reference pointers; all four references share ref_stride.
;
; Clobb: rax, rcx, rdx, mm0-mm7, flags. rsi, rdi, rbx are saved/restored.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably the caller clears the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad8x8x4d_sse3)
sym(vp8_sad8x8x4d_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; rbp is borrowed as the reference stride inside the row loop, so the
    ; frame pointer is parked on the stack until arg() is needed again.
    push        rbp
    mov         rdi, arg(2)                 ; ref_ptr_base

    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi, arg(0)                 ;src_ptr

    movsxd      rbx, dword ptr arg(1)       ;src_stride
    movsxd      rbp, dword ptr arg(3)       ;ref_stride (arg() unusable from here)

    xchg        rbx, rax                    ; rbx = 3rd reference, rax = src_stride

    ; 4 row pairs = 8 rows; the first pair initialises mm4..mm7.
    PROCESS_8X2X4 1
%rep 3
    PROCESS_8X2X4 0
%endrep

    pop         rbp                         ; frame pointer back; arg() valid again
    mov         rdi, arg(4)                 ;results

    movd        [rdi],    mm4
    movd        [rdi+4],  mm5
    movd        [rdi+8],  mm6
    movd        [rdi+12], mm7

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
;void vp8_sad4x4x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr_base,
; int ref_stride,
; int *results)
;
; Computes four 4x4 sums of absolute differences between one source block
; and four reference blocks, and stores them as 32-bit values in
; results[0..3]. The third argument is read as a table of four
; register-sized reference pointers; all four references share ref_stride.
;
; Two 4-byte rows are byte-interleaved into one 8-byte MMX register with
; punpcklbw so a single psadbw covers a row pair. Source and reference are
; interleaved the same way, so corresponding bytes still line up.
; QWORD is defined away at the top of this file, so each movd below moves
; 4 bytes despite the "QWORD PTR" spelling.
;
; Register roles after setup: rsi = src, rax = src_stride, rcx/rdx/rbx/rdi =
; references 0/1/2/3, rbp = ref_stride (frame pointer parked on the stack).
; Clobbers rax, rcx, rdx, mm0-mm7 and flags; rsi, rdi, rbx are saved and
; restored.
; NOTE(review): no emms is issued here; presumably the caller clears the
; MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad4x4x4d_sse3)
sym(vp8_sad4x4x4d_sse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
push rbx
; end prolog
push rbp ; park the frame pointer; rbp becomes ref_stride below
mov rdi, arg(2) ; ref_ptr_base
LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
mov rsi, arg(0) ;src_ptr
movsxd rbx, dword ptr arg(1) ;src_stride
movsxd rbp, dword ptr arg(3) ;ref_stride (arg() is unusable until rbp is popped)
xchg rbx, rax ; rbx = reference 2, rax = src_stride
; ---- rows 0 and 1 ----
movd mm0, QWORD PTR [rsi] ; src row 0
movd mm1, QWORD PTR [rcx] ; ref0 row 0
movd mm2, QWORD PTR [rsi+rax] ; src row 1
movd mm3, QWORD PTR [rcx+rbp] ; ref0 row 1
punpcklbw mm0, mm2 ; mm0 = src rows 0/1 interleaved
punpcklbw mm1, mm3 ; mm1 = ref0 rows 0/1 interleaved
movd mm4, QWORD PTR [rdx] ; ref1 row 0
movd mm5, QWORD PTR [rbx] ; ref2 row 0
movd mm6, QWORD PTR [rdi] ; ref3 row 0
movd mm2, QWORD PTR [rdx+rbp] ; ref1 row 1
movd mm3, QWORD PTR [rbx+rbp] ; ref2 row 1
movd mm7, QWORD PTR [rdi+rbp] ; ref3 row 1
psadbw mm1, mm0 ; mm1 = SAD rows 0-1, ref0
punpcklbw mm4, mm2
punpcklbw mm5, mm3
punpcklbw mm6, mm7
psadbw mm4, mm0 ; mm4 = SAD rows 0-1, ref1
psadbw mm5, mm0 ; mm5 = SAD rows 0-1, ref2
psadbw mm6, mm0 ; mm6 = SAD rows 0-1, ref3
; ---- rows 2 and 3 ----
lea rsi, [rsi+rax*2]
lea rcx, [rcx+rbp*2]
lea rdx, [rdx+rbp*2]
lea rbx, [rbx+rbp*2]
lea rdi, [rdi+rbp*2]
movd mm0, QWORD PTR [rsi] ; src row 2
movd mm2, QWORD PTR [rcx] ; ref0 row 2
movd mm3, QWORD PTR [rsi+rax] ; src row 3
movd mm7, QWORD PTR [rcx+rbp] ; ref0 row 3
punpcklbw mm0, mm3 ; mm0 = src rows 2/3 interleaved
punpcklbw mm2, mm7 ; mm2 = ref0 rows 2/3 interleaved
movd mm3, QWORD PTR [rdx] ; ref1 row 2
movd mm7, QWORD PTR [rbx] ; ref2 row 2
psadbw mm2, mm0
mov rax, rbp ; keep ref_stride in rax (src_stride is no longer needed)
pop rbp ; frame pointer back; arg() valid again
mov rsi, arg(4) ;Results (the source pointer is no longer needed)
paddw mm1, mm2 ; mm1 = total SAD, ref0
movd [rsi], mm1
movd mm2, QWORD PTR [rdx+rax] ; ref1 row 3
movd mm1, QWORD PTR [rbx+rax] ; ref2 row 3
punpcklbw mm3, mm2
punpcklbw mm7, mm1
psadbw mm3, mm0
psadbw mm7, mm0
movd mm2, QWORD PTR [rdi] ; ref3 row 2
movd mm1, QWORD PTR [rdi+rax] ; ref3 row 3
paddw mm3, mm4 ; mm3 = total SAD, ref1
paddw mm7, mm5 ; mm7 = total SAD, ref2
movd [rsi+4], mm3
punpcklbw mm2, mm1
movd [rsi+8], mm7
psadbw mm2, mm0
paddw mm2, mm6 ; mm2 = total SAD, ref3
movd [rsi+12], mm2
; begin epilog
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret