vpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
John Koleszar a9ebbcc338 convolve: support larger blocks, fix asm saturation bug
Updates the common convolution code to support blocks larger than
16x16, and rectangular blocks. This uncovered a bug in the SSSE3
filtering routines due to the order of application of saturation.
This commit fixes that bug, adjusts the unit test to bias its
random values towards the extremes, and adds a test to ensure that
all filters conform to the expected pairwise addition structure.

Change-Id: I81f69668b1de0de5a8ed43f0643845641525c8f0
2013-04-18 13:57:59 -07:00

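For context on the saturation bug mentioned in the commit message: the filter loops below accumulate the four pmaddubsw partial sums with paddsw, a saturating 16-bit add, and saturating addition is not associative, so the grouping order can change the result whenever an intermediate value clips. A minimal C sketch of that effect (the helper name paddsw is hypothetical, chosen to mirror the instruction; this is an illustration, not the exact fix):

    #include <stdint.h>
    #include <stdio.h>

    /* Saturating 16-bit add: the scalar equivalent of one PADDSW lane. */
    static int16_t paddsw(int16_t a, int16_t b) {
      int32_t s = (int32_t)a + (int32_t)b;
      if (s > INT16_MAX) return INT16_MAX;
      if (s < INT16_MIN) return INT16_MIN;
      return (int16_t)s;
    }

    int main(void) {
      /* Same three partial sums, two grouping orders. */
      int16_t a = 30000, b = 10000, c = -20000;
      printf("%d\n", paddsw(paddsw(a, b), c)); /* clips to 32767 first -> 12767 */
      printf("%d\n", paddsw(a, paddsw(b, c))); /* no clipping          -> 20000 */
      return 0;
    }
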

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;/************************************************************************************
; Notes: The routines in this file apply an 8 tap filter to the input pixels, either
; horizontally (HORIZx*) or vertically (VERTx*). The input pixel array has
; output_height rows; each routine handles 4, 8 or 16 pixels per row and calculates
; one row per iteration to take advantage of the 128 bit operations.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8.
;
;*************************************************************************************/
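; In scalar terms, each output pixel below is roughly:
;
;   sum = 0;
;   for (k = 0; k < 8; ++k)
;     sum += src[k * stride] * filter[k];   // stride = 1 (HORIZ) or pitch (VERT)
;   out = clip_to_u8((sum + 64) >> 7);      // krd = 64 rounding, psraw 7, packuswb
;
; except that the taps are packed to int8 with packsswb and the partial sums are
; formed pairwise with pmaddubsw (k0k1, k2k3, k4k5, k6k7), then accumulated with
; the saturating paddsw, so intermediate values are clamped to the int16 range.
; (Scalar sketch for reference only; the exact tap addressing is in the loops below.)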
%macro VERTx4 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
movd xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
pshuflw xmm2, xmm4, 10101010b ;k4_k5
pshuflw xmm3, xmm4, 11111111b ;k6_k7
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
movdqa krd, xmm5
movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
%if ABI_IS_32BIT=0
movsxd r8, DWORD PTR arg(3) ;out_pitch
%endif
mov rax, rsi
movsxd rcx, DWORD PTR arg(4) ;output_height
add rax, rdx
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
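; Per iteration: load 4 bytes (one per output column) from each of 8 vertically
; adjacent rows (A..H), interleave adjacent rows with punpcklbw so each 16-bit
; lane holds a (row n, row n+1) byte pair, multiply-add the pairs against
; k0k1/k2k3/k4k5/k6k7 with pmaddubsw, accumulate with saturating paddsw, add the
; 64 rounding term (krd), shift right by 7 and pack to unsigned bytes.
; rsi tracks row 0, rax = rsi + pitch, rbx = 6 * pitch.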
.loop:
movd xmm0, [rsi] ;A
movd xmm1, [rsi + rdx] ;B
movd xmm2, [rsi + rdx * 2] ;C
movd xmm3, [rax + rdx * 2] ;D
movd xmm4, [rsi + rdx * 4] ;E
movd xmm5, [rax + rdx * 4] ;F
punpcklbw xmm0, xmm1 ;A B
punpcklbw xmm2, xmm3 ;C D
punpcklbw xmm4, xmm5 ;E F
movd xmm6, [rsi + rbx] ;G
movd xmm7, [rax + rbx] ;H
pmaddubsw xmm0, k0k1
pmaddubsw xmm2, k2k3
punpcklbw xmm6, xmm7 ;G H
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm6
paddsw xmm0, xmm2
paddsw xmm0, xmm4
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
add rsi, rdx
add rax, rdx
%if %1
movd xmm1, [rdi]
pavgb xmm0, xmm1
%endif
movd [rdi], xmm0
%if ABI_IS_32BIT
add rdi, DWORD PTR arg(3) ;out_pitch
%else
add rdi, r8
%endif
dec rcx
jnz .loop
%endm
%macro VERTx8 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
pshuflw xmm2, xmm4, 10101010b ;k4_k5
pshuflw xmm3, xmm4, 11111111b ;k6_k7
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
movdqa krd, xmm5
movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
%if ABI_IS_32BIT=0
movsxd r8, DWORD PTR arg(3) ;out_pitch
%endif
mov rax, rsi
movsxd rcx, DWORD PTR arg(4) ;output_height
add rax, rdx
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
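; Same vertical 8 tap scheme as VERTx4, but movq loads/stores handle 8 columns
; per row.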
.loop:
movq xmm0, [rsi] ;A
movq xmm1, [rsi + rdx] ;B
movq xmm2, [rsi + rdx * 2] ;C
movq xmm3, [rax + rdx * 2] ;D
movq xmm4, [rsi + rdx * 4] ;E
movq xmm5, [rax + rdx * 4] ;F
punpcklbw xmm0, xmm1 ;A B
punpcklbw xmm2, xmm3 ;C D
punpcklbw xmm4, xmm5 ;E F
movq xmm6, [rsi + rbx] ;G
movq xmm7, [rax + rbx] ;H
pmaddubsw xmm0, k0k1
pmaddubsw xmm2, k2k3
punpcklbw xmm6, xmm7 ;G H
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm6
paddsw xmm0, xmm2
paddsw xmm0, xmm4
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
add rsi, rdx
add rax, rdx
%if %1
movq xmm1, [rdi]
pavgb xmm0, xmm1
%endif
movq [rdi], xmm0
%if ABI_IS_32BIT
add rdi, DWORD PTR arg(3) ;out_pitch
%else
add rdi, r8
%endif
dec rcx
jnz .loop
%endm
%macro VERTx16 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
pshuflw xmm2, xmm4, 10101010b ;k4_k5
pshuflw xmm3, xmm4, 11111111b ;k6_k7
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
movdqa krd, xmm5
movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
%if ABI_IS_32BIT=0
movsxd r8, DWORD PTR arg(3) ;out_pitch
%endif
mov rax, rsi
movsxd rcx, DWORD PTR arg(4) ;output_height
add rax, rdx
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
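; Same vertical 8 tap scheme as VERTx4/VERTx8; each iteration filters a 16 pixel
; wide row as two independent 8 pixel halves ([rsi] and [rsi + 8]).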
.loop:
movq xmm0, [rsi] ;A
movq xmm1, [rsi + rdx] ;B
movq xmm2, [rsi + rdx * 2] ;C
movq xmm3, [rax + rdx * 2] ;D
movq xmm4, [rsi + rdx * 4] ;E
movq xmm5, [rax + rdx * 4] ;F
punpcklbw xmm0, xmm1 ;A B
punpcklbw xmm2, xmm3 ;C D
punpcklbw xmm4, xmm5 ;E F
movq xmm6, [rsi + rbx] ;G
movq xmm7, [rax + rbx] ;H
pmaddubsw xmm0, k0k1
pmaddubsw xmm2, k2k3
punpcklbw xmm6, xmm7 ;G H
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm6
paddsw xmm0, xmm2
paddsw xmm0, xmm4
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
%if %1
movq xmm1, [rdi]
pavgb xmm0, xmm1
%endif
movq [rdi], xmm0
movq xmm0, [rsi + 8] ;A
movq xmm1, [rsi + rdx + 8] ;B
movq xmm2, [rsi + rdx * 2 + 8] ;C
movq xmm3, [rax + rdx * 2 + 8] ;D
movq xmm4, [rsi + rdx * 4 + 8] ;E
movq xmm5, [rax + rdx * 4 + 8] ;F
punpcklbw xmm0, xmm1 ;A B
punpcklbw xmm2, xmm3 ;C D
punpcklbw xmm4, xmm5 ;E F
movq xmm6, [rsi + rbx + 8] ;G
movq xmm7, [rax + rbx + 8] ;H
punpcklbw xmm6, xmm7 ;G H
pmaddubsw xmm0, k0k1
pmaddubsw xmm2, k2k3
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm6
paddsw xmm0, xmm2
paddsw xmm0, xmm4
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
add rsi, rdx
add rax, rdx
%if %1
movq xmm1, [rdi+8]
pavgb xmm0, xmm1
%endif
movq [rdi+8], xmm0
%if ABI_IS_32BIT
add rdi, DWORD PTR arg(3) ;out_pitch
%else
add rdi, r8
%endif
dec rcx
jnz .loop
%endm
;void vp9_filter_block1d4_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
VERTx4 0
add rsp, 16*5
pop rsp
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d8_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
VERTx8 0
add rsp, 16*5
pop rsp
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d16_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
VERTx16 0
add rsp, 16*5
pop rsp
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
VERTx4 1
add rsp, 16*5
pop rsp
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
VERTx8 1
add rsp, 16*5
pop rsp
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
VERTx16 1
add rsp, 16*5
pop rsp
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%macro HORIZx4 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
pshuflw xmm2, xmm4, 10101010b ;k4_k5
pshuflw xmm3, xmm4, 11111111b ;k6_k7
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
movdqa krd, xmm5
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
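; Per iteration: load the 16 source bytes from src - 3 .. src + 12, then use the
; shuf_t0t1 .. shuf_t6t7 masks (see SECTION_RODATA at the end of the file) to
; gather, for each output pixel, the byte pair fed to each pmaddubsw: taps 0/1,
; 2/3, 4/5 and 6/7. The four partial sums are accumulated with saturating
; paddsw, rounded with krd (64), shifted right by 7 and packed to unsigned
; bytes; only 4 output pixels are stored here (movd).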
.loop:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
punpcklqdq xmm0, xmm3
movdqa xmm1, xmm0
pshufb xmm0, [GLOBAL(shuf_t0t1)]
pmaddubsw xmm0, k0k1
movdqa xmm2, xmm1
pshufb xmm1, [GLOBAL(shuf_t2t3)]
pmaddubsw xmm1, k2k3
movdqa xmm4, xmm2
pshufb xmm2, [GLOBAL(shuf_t4t5)]
pmaddubsw xmm2, k4k5
pshufb xmm4, [GLOBAL(shuf_t6t7)]
pmaddubsw xmm4, k6k7
paddsw xmm0, xmm1
paddsw xmm0, xmm4
paddsw xmm0, xmm2
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
%if %1
movd xmm1, [rdi]
pavgb xmm0, xmm1
%endif
lea rsi, [rsi + rax]
movd [rdi], xmm0
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
%endm
%macro HORIZx8 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
movd xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
pshuflw xmm2, xmm4, 10101010b ;k4_k5
pshuflw xmm3, xmm4, 11111111b ;k6_k7
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
movdqa krd, xmm5
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
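; Same horizontal shuffle/madd sequence as HORIZx4; movq stores 8 output pixels
; per row.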
.loop:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
punpcklqdq xmm0, xmm3
movdqa xmm1, xmm0
pshufb xmm0, [GLOBAL(shuf_t0t1)]
pmaddubsw xmm0, k0k1
movdqa xmm2, xmm1
pshufb xmm1, [GLOBAL(shuf_t2t3)]
pmaddubsw xmm1, k2k3
movdqa xmm4, xmm2
pshufb xmm2, [GLOBAL(shuf_t4t5)]
pmaddubsw xmm2, k4k5
pshufb xmm4, [GLOBAL(shuf_t6t7)]
pmaddubsw xmm4, k6k7
paddsw xmm0, xmm1
paddsw xmm0, xmm4
paddsw xmm0, xmm2
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
%if %1
movq xmm1, [rdi]
pavgb xmm0, xmm1
%endif
lea rsi, [rsi + rax]
movq [rdi], xmm0
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
%endm
%macro HORIZx16 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
pshuflw xmm2, xmm4, 10101010b ;k4_k5
pshuflw xmm3, xmm4, 11111111b ;k6_k7
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
movdqa krd, xmm5
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
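; Same horizontal shuffle/madd sequence as HORIZx4/HORIZx8, applied twice per
; row: once for pixels 0..7 ([rsi - 3]) and once for pixels 8..15 ([rsi + 5]),
; with the two 8 byte results combined by punpcklqdq before the 16 byte store.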
.loop:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
punpcklqdq xmm0, xmm3
movdqa xmm1, xmm0
pshufb xmm0, [GLOBAL(shuf_t0t1)]
pmaddubsw xmm0, k0k1
movdqa xmm2, xmm1
pshufb xmm1, [GLOBAL(shuf_t2t3)]
pmaddubsw xmm1, k2k3
movdqa xmm4, xmm2
pshufb xmm2, [GLOBAL(shuf_t4t5)]
pmaddubsw xmm2, k4k5
pshufb xmm4, [GLOBAL(shuf_t6t7)]
pmaddubsw xmm4, k6k7
paddsw xmm0, xmm1
paddsw xmm0, xmm4
paddsw xmm0, xmm2
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
movq xmm3, [rsi + 5]
movq xmm7, [rsi + 13]
punpcklqdq xmm3, xmm7
movdqa xmm1, xmm3
pshufb xmm3, [GLOBAL(shuf_t0t1)]
pmaddubsw xmm3, k0k1
movdqa xmm2, xmm1
pshufb xmm1, [GLOBAL(shuf_t2t3)]
pmaddubsw xmm1, k2k3
movdqa xmm4, xmm2
pshufb xmm2, [GLOBAL(shuf_t4t5)]
pmaddubsw xmm2, k4k5
pshufb xmm4, [GLOBAL(shuf_t6t7)]
pmaddubsw xmm4, k6k7
paddsw xmm3, xmm1
paddsw xmm3, xmm4
paddsw xmm3, xmm2
paddsw xmm3, krd
psraw xmm3, 7
packuswb xmm3, xmm3
punpcklqdq xmm0, xmm3
%if %1
movdqa xmm1, [rdi]
pavgb xmm0, xmm1
%endif
lea rsi, [rsi + rax]
movdqa [rdi], xmm0
lea rdi, [rdi + rdx]
dec rcx
jnz .loop
%endm
;void vp9_filter_block1d4_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
HORIZx4 0
add rsp, 16*5
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d8_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
HORIZx8 0
add rsp, 16*5
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d16_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
HORIZx16 0
add rsp, 16*5
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
HORIZx4 1
add rsp, 16*5
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
HORIZx8 1
add rsp, 16*5
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
HORIZx16 1
add rsp, 16*5
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
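; pshufb masks used by the horizontal filters: each mask gathers, for eight
; consecutive output pixels, the byte pair multiplied by one pair of taps
; (t0/t1, t2/t3, t4/t5, t6/t7) so pmaddubsw can form the partial sums.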
align 16
shuf_t0t1:
db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
align 16
shuf_t2t3:
db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
align 16
shuf_t4t5:
db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14