vpx/vp8/common/x86/subpixel_ssse3.asm
Yaowu Xu 6035da5448 WebM Experimental Codec Branch Snapshot
This is a code snapshot of experimental work currently ongoing for a
next-generation codec.

The codebase has been cut down considerably from the libvpx baseline.
For example, we are currently only supporting VBR 2-pass rate control
and have removed most of the code relating to coding speed, threading,
error resilience, partitions and various other features.  This is in
part to make the codebase easier to work on and experiment with, but
also because we want to have an open discussion about how the bitstream
will be structured and partitioned and not have that conversation
constrained by past work.

Our basic working pattern has been to initially encapsulate experiments
using configure options linked to #IF CONFIG_XXX statements in the
code. Once experiments have matured and we are reasonably happy that
they give benefit and can be merged without breaking other experiments,
we remove the conditional compile statements and merge them in.

Current changes include:
* Temporal coding experiment for segments (though still only 4 max, it
  will likely be increased).
* Segment feature experiment - to allow various bits of information to
  be coded at the segment level. Features tested so far include mode
  and reference frame information, limiting end of block offset and
  transform size, alongside Q and loop filter parameters, but this set
  is very fluid.
* Support for 8x8 transform - 8x8 dct with 2nd order 2x2 haar is used
  in MBs using 16x16 prediction modes within inter frames.
* Compound prediction (combination of signals from existing predictors
  to create a new predictor).
* 8 tap interpolation filters and 1/8th pel motion vectors.
* Loop filter modifications.
* Various entropy modifications and changes to how entropy contexts and
  updates are handled.
* Extended quantizer range matched to transform precision improvements.

There are also ongoing further experiments that we hope to merge in the
near future: For example, coding of motion and other aspects of the
prediction signal to better support larger image formats, use of larger
block sizes (e.g. 32x32 and up) and lossless non-transform based coding
options (especially for key frames). It is our hope that we will be
able to make regular updates and we will warmly welcome community
contributions.

Please be warned that, at this stage, the codebase is currently slower
than VP8 stable branch as most new code has not been optimized, and
even the 'C' has been deliberately written to be simple and obvious,
not fast.

The following graphs show the initial test results; the numbers in the
tables measure the compression improvement as a percentage. The
build has the following optional experiments configured:
--enable-experimental --enable-enhanced_interp --enable-uvintra
--enable-high_precision_mv --enable-sixteenth_subpel_uv

CIF Size clips:
http://getwebm.org/tmp/cif/
HD size clips:
http://getwebm.org/tmp/hd/
(stable_20120309 represents encoding results of WebM master branch
build as of commit#7a15907)

They were encoded using the following encode parameters:
--good --cpu-used=0 -t 0 --lag-in-frames=25 --min-q=0 --max-q=63
--end-usage=0 --auto-alt-ref=1 -p 2 --pass=2 --kf-max-dist=9999
--kf-min-dist=0 --drop-frame=0 --static-thresh=0 --bias-pct=50
--minsection-pct=0 --maxsection-pct=800 --sharpness=0
--arnr-maxframes=7 --arnr-strength=3 (for HD, 6 for CIF)
--arnr-type=3

Change-Id: I5c62ed09cfff5815a2bb34e7820d6a810c23183c
2012-03-15 07:36:47 -07:00

1528 lines
42 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
; File-level constants.
; Of these, only VP8_FILTER_SHIFT is referenced by the code visible in this
; file (the bilinear predictors shift by it after adding the rounding term).
; The 6-tap/4-tap filter routines hard-code the same shift count of 7.
; BLOCK_HEIGHT_WIDTH and VP8_FILTER_WEIGHT are not referenced in the visible code.
%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT 7
;/************************************************************************************
; vp8_filter_block1d8_h6_ssse3
;
; Horizontal sub-pixel filter producing 8 output pixels per row, one row per
; loop pass.
;
; The filter taps come from the k0_k5 / k1_k3 / k2_k4 tables, indexed by
; vp8_filter_index * 16. When the (k0,k5) pair of the selected entry is zero
; the routine branches to a 4-tap variant that skips that multiply.
;
; Per row the code reads 8 bytes at [src - 2] and 8 bytes at [src + 3], i.e.
; source bytes -2 .. +10 relative to the first output pixel. Each result is
; (sum + 64) >> 7, saturated to an unsigned byte by packuswb.
;
; The row counter is decremented and tested after the row is processed, so
; output_height must be at least 1. The original notes for this routine state
; that output_height is assumed to be even; the loops below handle a single
; row per pass and do not themselves rely on that.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d8_h6_ssse3)
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; rdx = vp8_filter_index
    xor         rsi, rsi                        ; esi = 0 for the outer-tap test below
    shl         rdx, 4                          ; 16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]              ; xmm7 = rounding term (0x40 per word)
    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected (k0,k5) entry
    mov         rdi, arg(2)                     ; rdi = output_ptr

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          vp8_filter_block1d8_h4_ssse3    ; yes: take the 4-tap path

    movdqa      xmm4, XMMWORD PTR [rax]         ; xmm4 = (k0,k5) pairs
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; xmm5 = (k2,k4) pairs
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; xmm6 = (k1,k3) pairs

    mov         rsi, arg(0)                     ; rsi = src_ptr
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)

    movsxd      rdx, dword ptr arg(3)           ; rdx = output_pitch
    sub         rdi, rdx                        ; pre-bias: the loop advances rdi before storing
    ; xmm3 is unused on this path

.h6_row_loop:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; source bytes -2 .. 5
    movq        xmm2, MMWORD PTR [rsi + 3]      ; source bytes  3 .. 10
    punpcklbw   xmm0, xmm2                      ; interleave: (-2,3) (-1,4) (0,5) ... (5,10)

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ; p[i-2]*k0 + p[i+3]*k5

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]     ; regroup into (p[i],   p[i+2]) pairs
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]     ; regroup into (p[i-1], p[i+1]) pairs

    pmaddubsw   xmm1, xmm5                      ; p[i]*k2   + p[i+2]*k4
    lea         rdi, [rdi + rdx]                ; rdi -> this row's output (lea keeps flags intact)

    pmaddubsw   xmm2, xmm6                      ; p[i-1]*k1 + p[i+1]*k3
    lea         rsi, [rsi + rax]                ; rsi -> next source row

    dec         rcx                             ; sets ZF for the jnz below; nothing in between writes flags
    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm7                      ; add rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                      ; saturate to 8 unsigned bytes
    movq        MMWORD Ptr [rdi], xmm0          ; store 8 pixels
    jnz         .h6_row_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant, entered when the (k0,k5) pair is zero.
; On entry: rax -> selected k0_k5 entry, rdi = output_ptr, xmm7 = rounding term.
; Note: this is a non-local label, so the local label that follows is scoped to it.
vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; xmm5 = (k2,k4) pairs
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; xmm6 = (k1,k3) pairs

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] ; keep both shuffle masks in registers
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)                     ; rsi = src_ptr
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)

    movsxd      rdx, dword ptr arg(3)           ; rdx = output_pitch
    sub         rdi, rdx                        ; pre-bias, as in the 6-tap path

.h4_row_loop:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; source bytes -2 .. 5
    movq        xmm1, MMWORD PTR [rsi + 3]      ; source bytes  3 .. 10
    punpcklbw   xmm0, xmm1                      ; interleave: (-2,3) (-1,4) (0,5) ... (5,10)

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3                      ; (p[i],   p[i+2]) pairs
    pshufb      xmm2, xmm4                      ; (p[i-1], p[i+1]) pairs

    pmaddubsw   xmm0, xmm5                      ; p[i]*k2   + p[i+2]*k4
    lea         rdi, [rdi + rdx]                ; rdi -> this row's output

    pmaddubsw   xmm2, xmm6                      ; p[i-1]*k1 + p[i+1]*k3
    lea         rsi, [rsi + rax]                ; rsi -> next source row

    dec         rcx                             ; ZF consumed by the jnz below
    paddsw      xmm0, xmm7                      ; add rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                      ; saturate to 8 unsigned bytes
    movq        MMWORD Ptr [rdi], xmm0          ; store 8 pixels
    jnz         .h4_row_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; vp8_filter_block1d16_h6_ssse3
;
; Horizontal 6-tap filter producing 16 output pixels per row, one row per
; loop pass. Unlike the 8- and 4-wide variants there is no 4-tap shortcut:
; all three tap pairs are always applied.
;
; Each row is computed as two 8-pixel halves. The halves read source bytes
; -2 .. 10 and 6 .. 18 (relative to the first output pixel) respectively.
; The combined 16 bytes are written with movdqa, so every output row address
; must be 16-byte aligned. output_height must be at least 1 (the counter is
; tested after the row is processed).
;-----------------------------------------------------------------------------
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d16_h6_ssse3)
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; rdx = vp8_filter_index
    xor         rsi, rsi                        ; (rsi is reloaded with src_ptr below before any use)
    shl         rdx, 4                          ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected (k0,k5) entry

    mov         rdi, arg(2)                     ; rdi = output_ptr

    mov         rsi, arg(0)                     ; rsi = src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ; xmm4 = (k0,k5) pairs
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; xmm5 = (k2,k4) pairs
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; xmm6 = (k1,k3) pairs

    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)
    movsxd      rdx, dword ptr arg(3)           ; rdx = output_pitch

.row_loop:
    ; ---- left half: output pixels 0..7 (interleaved with loads for the right half)
    movq        xmm0, MMWORD PTR [rsi - 2]      ; source bytes -2 .. 5
    movq        xmm3, MMWORD PTR [rsi + 3]      ; source bytes  3 .. 10
    punpcklbw   xmm0, xmm3                      ; interleave: (-2,3) (-1,4) (0,5) ... (5,10)

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ; k0/k5 contribution

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]     ; (p[i],   p[i+2]) pairs
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]     ; (p[i-1], p[i+1]) pairs
    movq        xmm3, MMWORD PTR [rsi + 6]      ; right half: source bytes 6 .. 13

    pmaddubsw   xmm1, xmm5                      ; k2/k4 contribution
    movq        xmm7, MMWORD PTR [rsi + 11]     ; right half: source bytes 11 .. 18

    pmaddubsw   xmm2, xmm6                      ; k1/k3 contribution
    punpcklbw   xmm3, xmm7                      ; right half interleave, same layout as the left

    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3

    ; ---- right half: output pixels 8..15
    pmaddubsw   xmm3, xmm4                      ; k0/k5 contribution
    paddsw      xmm0, xmm2

    movdqa      xmm2, xmm1
    paddsw      xmm0, [GLOBAL(rd)]              ; left half: add rounding

    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    psraw       xmm0, 7                         ; left half: scale
    pmaddubsw   xmm1, xmm5                      ; k2/k4 contribution

    pmaddubsw   xmm2, xmm6                      ; k1/k3 contribution
    packuswb    xmm0, xmm0                      ; left half: 8 saturated bytes

    lea         rsi, [rsi + rax]                ; rsi -> next source row
    paddsw      xmm3, xmm1
    paddsw      xmm3, xmm2

    paddsw      xmm3, [GLOBAL(rd)]              ; right half: add rounding
    psraw       xmm3, 7
    packuswb    xmm3, xmm3                      ; right half: 8 saturated bytes

    punpcklqdq  xmm0, xmm3                      ; xmm0 = left | right (16 pixels)

    movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store
    lea         rdi, [rdi + rdx]                ; rdi -> next output row
    dec         rcx
    jnz         .row_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; vp8_filter_block1d4_h6_ssse3
;
; Horizontal sub-pixel filter producing 4 output pixels per row, one row per
; loop pass.
;
; Taps come from the k0_k5 / k1_k3 / k2_k4 tables, indexed by
; vp8_filter_index * 16. When the (k0,k5) pair of the selected entry is zero
; the 4-tap variant is used, which skips that multiply.
;
; Each row loads 16 unaligned bytes starting at [src - 2] (source bytes
; -2 .. 13). Eight results are computed but only the low four bytes are
; stored. Each result is (sum + 64) >> 7, saturated to an unsigned byte.
; output_height must be at least 1 (the counter is tested after the row).
;
; Both exit paths undo the SAVE_XMM 7 from the prolog with RESTORE_XMM,
; matching every other SAVE_XMM function in this file.
;-----------------------------------------------------------------------------
;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
global sym(vp8_filter_block1d4_h6_ssse3)
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; rdx = vp8_filter_index
    xor         rsi, rsi                        ; esi = 0 for the outer-tap test below
    shl         rdx, 4                          ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected (k0,k5) entry
    movdqa      xmm7, [GLOBAL(rd)]              ; xmm7 = rounding term (0x40 per word)

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          .vp8_filter_block1d4_h4_ssse3   ; yes: take the 4-tap path

    movdqa      xmm4, XMMWORD PTR [rax]         ; xmm4 = (k0,k5) pairs
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; xmm5 = (k2,k4) pairs
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; xmm6 = (k1,k3) pairs

    mov         rsi, arg(0)                     ; rsi = src_ptr
    mov         rdi, arg(2)                     ; rdi = output_ptr
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)

    movsxd      rdx, dword ptr arg(3)           ; rdx = output_pitch
    ; xmm3 is unused on this path

.filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0, XMMWORD PTR [rsi - 2]     ; source bytes -2 .. 13

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]          ; (p[i-2], p[i+3]) pairs

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]          ; (p[i],   p[i+2]) pairs
    pmaddubsw   xmm0, xmm4                      ; k0/k5 contribution
    pshufb      xmm2, [GLOBAL(shuf3b)]          ; (p[i-1], p[i+1]) pairs
    pmaddubsw   xmm1, xmm5                      ; k2/k4 contribution

    pmaddubsw   xmm2, xmm6                      ; k1/k3 contribution

    lea         rsi, [rsi + rax]                ; rsi -> next source row

    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7                      ; add rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                      ; saturate to unsigned bytes

    movd        DWORD PTR [rdi], xmm0           ; store the low 4 pixels

    add         rdi, rdx                        ; rdi -> next output row
    dec         rcx
    jnz         .filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM                                 ; pairs with SAVE_XMM 7 in the prolog
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant, entered when the (k0,k5) pair is zero.
; On entry: rax -> selected k0_k5 entry, xmm7 = rounding term.
.vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; xmm5 = (k2,k4) pairs
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; xmm6 = (k1,k3) pairs
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)] ; keep both shuffle masks in registers
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)                     ; rsi = src_ptr
    mov         rdi, arg(2)                     ; rdi = output_ptr
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)           ; rcx = output_height (row counter)

    movsxd      rdx, dword ptr arg(3)           ; rdx = output_pitch

.filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1, XMMWORD PTR [rsi - 2]     ; source bytes -2 .. 13

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0                      ; shuf2b: (p[i],   p[i+2]) pairs
    pshufb      xmm2, xmm3                      ; shuf3b: (p[i-1], p[i+1]) pairs
    pmaddubsw   xmm1, xmm5                      ; k2/k4 contribution

    pmaddubsw   xmm2, xmm6                      ; k1/k3 contribution

    lea         rsi, [rsi + rax]                ; rsi -> next source row

    paddsw      xmm1, xmm7                      ; add rounding
    paddsw      xmm1, xmm2
    psraw       xmm1, 7
    packuswb    xmm1, xmm1                      ; saturate to unsigned bytes

    movd        DWORD PTR [rdi], xmm1           ; store the low 4 pixels

    add         rdi, rdx                        ; rdi -> next output row
    dec         rcx
    jnz         .filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; vp8_filter_block1d16_v6_ssse3
;
; Vertical sub-pixel filter producing 16 output pixels per row, one row per
; loop pass.
;
; src_ptr addresses row A, the first of six consecutive source rows A..F
; (spaced src_pitch apart) that feed one output row. Taps come from the
; k0_k5 / k1_k3 / k2_k4 tables, indexed by vp8_filter_index * 16. When the
; (k0,k5) pair is zero the 4-tap variant is used, which reads only rows B..E.
;
; Each result is (sum + 64) >> 7, saturated to an unsigned byte.
; The 6-tap path stores each row as two 8-byte movq writes; the 4-tap path
; stores one 16-byte movdqa, so on that path every output row address must be
; 16-byte aligned. output_height must be at least 1.
;
; On 64-bit builds out_pitch is cached in r8; on 32-bit builds it is re-read
; from the argument slot on every row.
;-----------------------------------------------------------------------------
;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
global sym(vp8_filter_block1d16_v6_ssse3)
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; rdx = vp8_filter_index
    xor         rsi, rsi                        ; esi = 0 for the outer-tap test below
    shl         rdx, 4                          ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected (k0,k5) entry

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          .four_tap                       ; yes: take the 4-tap path

    movdqa      xmm5, XMMWORD PTR [rax]         ; xmm5 = (k0,k5) pairs
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; xmm6 = (k2,k4) pairs
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; xmm7 = (k1,k3) pairs

    mov         rsi, arg(0)                     ; rsi = src_ptr (row A)
    movsxd      rdx, DWORD PTR arg(1)           ; rdx = src_pitch
    mov         rdi, arg(2)                     ; rdi = output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; r8 = out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)           ; rcx = output_height (row counter)
    add         rax, rdx                        ; rax = rsi + pitch (row B), used to reach rows D and F

.six_tap_row:
    ; ---- columns 0..7
    movq        xmm1, MMWORD PTR [rsi]                  ; A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                      ; (B,D) byte pairs
    punpcklbw   xmm3, xmm0                      ; (C,E) byte pairs

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ; F

    pmaddubsw   xmm3, xmm6                      ; C*k2 + E*k4
    punpcklbw   xmm1, xmm0                      ; (A,F) byte pairs
    pmaddubsw   xmm2, xmm7                      ; B*k1 + D*k3
    pmaddubsw   xmm1, xmm5                      ; A*k0 + F*k5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]              ; add rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2                      ; saturate to 8 unsigned bytes

    movq        MMWORD PTR [rdi], xmm2          ; store columns 0..7

    ; ---- columns 8..15
    movq        xmm1, MMWORD PTR [rsi + 8]              ; A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ; E

    punpcklbw   xmm2, xmm4                      ; (B,D) byte pairs
    punpcklbw   xmm3, xmm0                      ; (C,E) byte pairs

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ; F
    pmaddubsw   xmm3, xmm6                      ; C*k2 + E*k4
    punpcklbw   xmm1, xmm0                      ; (A,F) byte pairs
    pmaddubsw   xmm2, xmm7                      ; B*k1 + D*k3
    pmaddubsw   xmm1, xmm5                      ; A*k0 + F*k5

    add         rsi, rdx                        ; slide the six-row window down one row
    add         rax, rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]              ; add rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2                      ; saturate to 8 unsigned bytes

    movq        MMWORD PTR [rdi+8], xmm2        ; store columns 8..15

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; rdi += out_pitch
%else
    add         rdi, r8                         ; rdi += out_pitch
%endif
    dec         rcx
    jnz         .six_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant, entered when the (k0,k5) pair is zero.
; On entry: rax -> selected k0_k5 entry.
.four_tap:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; xmm6 = (k2,k4) pairs
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; xmm7 = (k1,k3) pairs

    mov         rsi, arg(0)                     ; rsi = src_ptr (row A; not read on this path)
    movsxd      rdx, DWORD PTR arg(1)           ; rdx = src_pitch
    mov         rdi, arg(2)                     ; rdi = output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; r8 = out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)           ; rcx = output_height (row counter)
    add         rax, rdx                        ; rax = rsi + pitch, used to reach row D

.four_tap_row:
    ; ---- columns 0..7
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                      ; (B,D) byte pairs
    punpcklbw   xmm3, xmm0                      ; (C,E) byte pairs

    pmaddubsw   xmm3, xmm6                      ; C*k2 + E*k4
    pmaddubsw   xmm2, xmm7                      ; B*k1 + D*k3

    ; ---- columns 8..15 loads, overlapped with the first half's arithmetic
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ; B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ; E

    paddsw      xmm2, [GLOBAL(rd)]              ; add rounding
    paddsw      xmm2, xmm3
    psraw       xmm2, 7
    packuswb    xmm2, xmm2                      ; columns 0..7 as saturated bytes

    punpcklbw   xmm5, xmm4                      ; (B,D) byte pairs
    punpcklbw   xmm1, xmm0                      ; (C,E) byte pairs
    pmaddubsw   xmm1, xmm6                      ; C*k2 + E*k4
    pmaddubsw   xmm5, xmm7                      ; B*k1 + D*k3

    movdqa      xmm4, [GLOBAL(rd)]              ; rounding term for the second half

    add         rsi, rdx                        ; slide the window down one row
    add         rax, rdx

    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4                      ; add rounding
    psraw       xmm5, 7
    packuswb    xmm5, xmm5                      ; columns 8..15 as saturated bytes

    punpcklqdq  xmm2, xmm5                      ; xmm2 = columns 0..15

    movdqa      XMMWORD PTR [rdi], xmm2         ; aligned 16-byte store

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; rdi += out_pitch
%else
    add         rdi, r8                         ; rdi += out_pitch
%endif
    dec         rcx
    jnz         .four_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; vp8_filter_block1d8_v6_ssse3
;
; Vertical sub-pixel filter producing 8 output pixels per row, one row per
; loop pass.
;
; src_ptr addresses row A, the first of six consecutive source rows A..F
; (spaced src_pitch apart) that feed one output row. Taps come from the
; k0_k5 / k1_k3 / k2_k4 tables, indexed by vp8_filter_index * 16. When the
; (k0,k5) pair is zero the 4-tap variant is used, which reads only rows B..E.
;
; Each result is (sum + 64) >> 7, saturated to an unsigned byte and stored
; with an 8-byte movq. output_height must be at least 1.
;
; On 64-bit builds out_pitch is cached in r8; on 32-bit builds it is re-read
; from the argument slot on every row.
;-----------------------------------------------------------------------------
;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
global sym(vp8_filter_block1d8_v6_ssse3)
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; rdx = vp8_filter_index
    xor         rsi, rsi                        ; esi = 0 for the outer-tap test below
    shl         rdx, 4                          ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected (k0,k5) entry

    ; Arguments shared by both paths are loaded before the branch.
    movsxd      rdx, DWORD PTR arg(1)           ; rdx = src_pitch
    mov         rdi, arg(2)                     ; rdi = output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; r8 = out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; rcx = output_height (row counter)

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          .four_tap                       ; yes: take the 4-tap path

    movdqa      xmm5, XMMWORD PTR [rax]         ; xmm5 = (k0,k5) pairs
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; xmm6 = (k2,k4) pairs
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; xmm7 = (k1,k3) pairs

    mov         rsi, arg(0)                     ; rsi = src_ptr (row A)

    mov         rax, rsi
    add         rax, rdx                        ; rax = rsi + pitch (row B), used to reach rows D and F

.six_tap_row:
    movq        xmm1, MMWORD PTR [rsi]                  ; A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                      ; (B,D) byte pairs
    punpcklbw   xmm3, xmm0                      ; (C,E) byte pairs

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ; F
    movdqa      xmm4, [GLOBAL(rd)]              ; xmm4 = rounding term (D already consumed)

    pmaddubsw   xmm3, xmm6                      ; C*k2 + E*k4
    punpcklbw   xmm1, xmm0                      ; (A,F) byte pairs
    pmaddubsw   xmm2, xmm7                      ; B*k1 + D*k3
    pmaddubsw   xmm1, xmm5                      ; A*k0 + F*k5

    add         rsi, rdx                        ; slide the six-row window down one row
    add         rax, rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4                      ; add rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2                      ; saturate to 8 unsigned bytes

    movq        MMWORD PTR [rdi], xmm2          ; store 8 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; rdi += out_pitch
%else
    add         rdi, r8                         ; rdi += out_pitch
%endif
    dec         rcx
    jnz         .six_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant, entered when the (k0,k5) pair is zero.
; On entry: rax -> selected k0_k5 entry, rdx = src_pitch, rdi = output_ptr,
; rcx = output_height, and (64-bit only) r8 = out_pitch.
.four_tap:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; xmm6 = (k2,k4) pairs
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; xmm7 = (k1,k3) pairs
    movdqa      xmm5, [GLOBAL(rd)]              ; xmm5 = rounding term, loaded once

    mov         rsi, arg(0)                     ; rsi = src_ptr (row A; not read on this path)

    mov         rax, rsi
    add         rax, rdx                        ; rax = rsi + pitch, used to reach row D

.four_tap_row:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                      ; (B,D) byte pairs
    punpcklbw   xmm3, xmm0                      ; (C,E) byte pairs

    pmaddubsw   xmm3, xmm6                      ; C*k2 + E*k4
    pmaddubsw   xmm2, xmm7                      ; B*k1 + D*k3

    add         rsi, rdx                        ; slide the window down one row
    add         rax, rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5                      ; add rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2                      ; saturate to 8 unsigned bytes

    movq        MMWORD PTR [rdi], xmm2          ; store 8 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; rdi += out_pitch
%else
    add         rdi, r8                         ; rdi += out_pitch
%endif
    dec         rcx
    jnz         .four_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; vp8_filter_block1d4_v6_ssse3
;
; Vertical sub-pixel filter producing 4 output pixels per row, one row per
; loop pass. This variant works entirely in the 64-bit MMX registers
; (mm0-mm7), so it needs no SAVE_XMM / RESTORE_XMM.
;
; src_ptr addresses row A, the first of six consecutive source rows A..F
; (spaced src_pitch apart) that feed one output row. Taps are the low 8 bytes
; of the selected k0_k5 / k1_k3 / k2_k4 entries (vp8_filter_index * 16). When
; the (k0,k5) pair is zero the 4-tap variant is used, which reads only rows
; B..E.
;
; Each result is (sum + 64) >> 7, saturated to an unsigned byte and stored
; with a 4-byte movd. output_height must be at least 1.
;
; NOTE(review): no emms is issued before returning; presumably callers clear
; the MMX state elsewhere before using x87 code - confirm.
;-----------------------------------------------------------------------------
;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
global sym(vp8_filter_block1d4_v6_ssse3)
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; rdx = vp8_filter_index
    xor         rsi, rsi                        ; esi = 0 for the outer-tap test below
    shl         rdx, 4                          ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax -> selected (k0,k5) entry

    ; Arguments shared by both paths are loaded before the branch.
    movsxd      rdx, DWORD PTR arg(1)           ; rdx = src_pitch
    mov         rdi, arg(2)                     ; rdi = output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; r8 = out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; rcx = output_height (row counter)

    cmp         esi, DWORD PTR [rax]            ; outer taps zero?
    je          .four_tap                       ; yes: take the 4-tap path

    movq        mm5, MMWORD PTR [rax]           ; mm5 = (k0,k5) pairs
    movq        mm6, MMWORD PTR [rax+256]       ; mm6 = (k2,k4) pairs
    movq        mm7, MMWORD PTR [rax+128]       ; mm7 = (k1,k3) pairs

    mov         rsi, arg(0)                     ; rsi = src_ptr (row A)

    mov         rax, rsi
    add         rax, rdx                        ; rax = rsi + pitch (row B), used to reach rows D and F

.six_tap_row:
    movd        mm1, DWORD PTR [rsi]                    ; A
    movd        mm2, DWORD PTR [rsi + rdx]              ; B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ; C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ; D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ; E

    punpcklbw   mm2, mm4                        ; (B,D) byte pairs
    punpcklbw   mm3, mm0                        ; (C,E) byte pairs

    movd        mm0, DWORD PTR [rax + rdx * 4]          ; F

    movq        mm4, [GLOBAL(rd)]               ; mm4 = rounding term (D already consumed)

    pmaddubsw   mm3, mm6                        ; C*k2 + E*k4
    punpcklbw   mm1, mm0                        ; (A,F) byte pairs
    pmaddubsw   mm2, mm7                        ; B*k1 + D*k3
    pmaddubsw   mm1, mm5                        ; A*k0 + F*k5

    add         rsi, rdx                        ; slide the six-row window down one row
    add         rax, rdx

    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4                        ; add rounding
    psraw       mm2, 7
    packuswb    mm2, mm2                        ; saturate to unsigned bytes

    movd        DWORD PTR [rdi], mm2            ; store 4 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; rdi += out_pitch
%else
    add         rdi, r8                         ; rdi += out_pitch
%endif
    dec         rcx
    jnz         .six_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant, entered when the (k0,k5) pair is zero.
; On entry: rax -> selected k0_k5 entry, rdx = src_pitch, rdi = output_ptr,
; rcx = output_height, and (64-bit only) r8 = out_pitch.
.four_tap:
    movq        mm6, MMWORD PTR [rax+256]       ; mm6 = (k2,k4) pairs
    movq        mm7, MMWORD PTR [rax+128]       ; mm7 = (k1,k3) pairs
    movq        mm5, MMWORD PTR [GLOBAL(rd)]    ; mm5 = rounding term, loaded once

    mov         rsi, arg(0)                     ; rsi = src_ptr (row A; not read on this path)

    mov         rax, rsi
    add         rax, rdx                        ; rax = rsi + pitch, used to reach row D

.four_tap_row:
    movd        mm2, DWORD PTR [rsi + rdx]              ; B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ; C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ; D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ; E

    punpcklbw   mm2, mm4                        ; (B,D) byte pairs
    punpcklbw   mm3, mm0                        ; (C,E) byte pairs

    pmaddubsw   mm3, mm6                        ; C*k2 + E*k4
    pmaddubsw   mm2, mm7                        ; B*k1 + D*k3

    add         rsi, rdx                        ; slide the window down one row
    add         rax, rdx

    paddsw      mm2, mm3
    paddsw      mm2, mm5                        ; add rounding
    psraw       mm2, 7
    packuswb    mm2, mm2                        ; saturate to unsigned bytes

    movd        DWORD PTR [rdi], mm2            ; store 4 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ; rdi += out_pitch
%else
    add         rdi, r8                         ; rdi += out_pitch
%endif
    dec         rcx
    jnz         .four_tap_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; vp8_bilinear_predict16x16_ssse3
;
; 16x16 bilinear prediction. xoffset and yoffset each select a 16-byte entry
; of vp8_bilinear_filters_ssse3 (offset * 16). Three code paths:
;   * xoffset != 0, yoffset != 0 : horizontal pass followed by vertical pass
;   * xoffset == 0               : vertical pass only
;   * xoffset != 0, yoffset == 0 : horizontal pass only
; Every filtered value is (sum + 64) >> VP8_FILTER_SHIFT, saturated to a byte.
;
; Rows are written with movdqa, so each destination row address must be
; 16-byte aligned. Loops terminate when the destination pointer reaches
; dst_ptr + 16 * dst_pitch.
;
; Source extent read: the horizontal paths read 17 bytes per row; the two
; paths with a vertical pass read 17 rows.
;
; NOTE(review): with xoffset == 0 and yoffset == 0 the vertical-only path
; would multiply by table entry 0 (128, 0); pmaddubsw treats the filter
; bytes as signed, so 128 acts as -128. Presumably callers never pass both
; offsets as zero - confirm.
;-----------------------------------------------------------------------------
;void vp8_bilinear_predict16x16_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int   xoffset,
;    int   yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
global sym(vp8_bilinear_predict16x16_ssse3)
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(2)           ; rax = xoffset

    cmp         rax, 0                          ; xoffset == 0: no horizontal pass
    je          .vertical_only

    shl         rax, 4                          ; 16 bytes per filter entry
    lea         rax, [rax + rcx]                ; rax -> horizontal filter

    mov         rdi, arg(4)                     ; rdi = dst_ptr
    mov         rsi, arg(0)                     ; rsi = src_ptr
    movsxd      rdx, dword ptr arg(5)           ; rdx = dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = horizontal filter pairs

    movsxd      rax, dword ptr arg(3)           ; rax = yoffset

    cmp         rax, 0                          ; yoffset == 0: no vertical pass
    je          .horizontal_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; rax -> vertical filter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rdx, dword ptr arg(1)           ; rdx = src_pixels_per_line (replaces dst_pitch)

    movdqa      xmm2, [rax]                     ; xmm2 = vertical filter pairs

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)            ; r8 = dst_pitch
%endif
    ; Horizontally filter the first source row; keep it packed in xmm7.
    movq        xmm3, [rsi]                     ; pixels 0..7
    movq        xmm5, [rsi+1]                   ; pixels 1..8
    punpcklbw   xmm3, xmm5                      ; (p[i], p[i+1]) pairs for i = 0..7

    movq        xmm4, [rsi+8]                   ; pixels 8..15
    movq        xmm5, [rsi+9]                   ; pixels 9..16

    lea         rsi, [rsi + rdx]                ; next source row

    pmaddubsw   xmm3, xmm1                      ; filtered words for pixels 0..7

    punpcklbw   xmm4, xmm5                      ; (p[i], p[i+1]) pairs for i = 8..15
    pmaddubsw   xmm4, xmm1                      ; filtered words for pixels 8..15

    paddw       xmm3, [GLOBAL(rd)]              ; add rounding
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]              ; add rounding
    psraw       xmm4, VP8_FILTER_SHIFT

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4                      ; xmm7 = previous row, 16 filtered bytes

.both_pass_row:
    ; Horizontally filter the next source row into xmm6.
    movq        xmm6, [rsi]                     ; pixels 0..7
    movq        xmm5, [rsi+1]                   ; pixels 1..8
    punpcklbw   xmm6, xmm5

    movq        xmm4, [rsi+8]                   ; pixels 8..15
    movq        xmm5, [rsi+9]                   ; pixels 9..16

    lea         rsi, [rsi + rdx]                ; next source row

    pmaddubsw   xmm6, xmm1

    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, xmm1

    paddw       xmm6, [GLOBAL(rd)]              ; add rounding
    psraw       xmm6, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]              ; add rounding
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm6, xmm4                      ; xmm6 = current row, 16 filtered bytes

    ; Vertical pass between the previous row (xmm7) and current row (xmm6).
    movdqa      xmm5, xmm7

    punpcklbw   xmm5, xmm6                      ; (prev, cur) pairs, columns 0..7
    pmaddubsw   xmm5, xmm2

    punpckhbw   xmm7, xmm6                      ; (prev, cur) pairs, columns 8..15
    pmaddubsw   xmm7, xmm2

    paddw       xmm5, [GLOBAL(rd)]              ; add rounding
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm7, [GLOBAL(rd)]              ; add rounding
    psraw       xmm7, VP8_FILTER_SHIFT

    packuswb    xmm5, xmm7
    movdqa      xmm7, xmm6                      ; current row becomes the previous row

    movdqa      [rdi], xmm5                     ; aligned store of one output row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ; rdi += dst_pitch
%else
    add         rdi, r8                         ; rdi += dst_pitch
%endif

    cmp         rdi, rcx
    jne         .both_pass_row

    jmp         .exit

; Vertical filter only (xoffset == 0). On entry rcx -> filter table base.
.vertical_only:
    movsxd      rax, dword ptr arg(3)           ; rax = yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]                ; rax -> vertical filter

    mov         rdi, arg(4)                     ; rdi = dst_ptr
    mov         rsi, arg(0)                     ; rsi = src_ptr
    movsxd      rdx, dword ptr arg(5)           ; rdx = dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = vertical filter pairs

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line

    ; Prime xmm4/xmm2 with the two halves of source row 0.
    movq        xmm4, [rsi]                     ; row 0, columns 0..7
    movq        xmm2, [rsi + 8]                 ; row 0, columns 8..15

    lea         rsi, [rsi + rax]                ; rsi -> row 1

    ; Two output rows per pass: (row, row+1) and (row+1, row+2).
.vertical_row_pair:
    movq        xmm3, [rsi]                     ; row + 1, columns 0..7
    movq        xmm5, [rsi + 8]                 ; row + 1, columns 8..15

    punpcklbw   xmm4, xmm3                      ; (row, row+1) pairs
    punpcklbw   xmm2, xmm5
    pmaddubsw   xmm4, xmm1
    movq        xmm7, [rsi + rax]               ; row + 2, columns 0..7

    pmaddubsw   xmm2, xmm1
    movq        xmm6, [rsi + rax + 8]           ; row + 2, columns 8..15

    punpcklbw   xmm3, xmm7                      ; (row+1, row+2) pairs
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm3, xmm1
    paddw       xmm4, [GLOBAL(rd)]              ; add rounding

    pmaddubsw   xmm5, xmm1
    paddw       xmm2, [GLOBAL(rd)]              ; add rounding

    psraw       xmm4, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm2
    paddw       xmm3, [GLOBAL(rd)]              ; add rounding

    movdqa      [rdi], xmm4                     ; store first output row of the pair
    paddw       xmm5, [GLOBAL(rd)]              ; add rounding

    psraw       xmm3, VP8_FILTER_SHIFT
    psraw       xmm5, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm5
    movdqa      xmm4, xmm7                      ; row + 2 becomes the next pass's base row

    movdqa      [rdi + rdx],xmm3                ; store second output row of the pair
    lea         rsi, [rsi + 2*rax]              ; advance two source rows

    movdqa      xmm2, xmm6
    lea         rdi, [rdi + 2*rdx]              ; advance two output rows

    cmp         rdi, rcx
    jne         .vertical_row_pair

    jmp         .exit

; Horizontal filter only (yoffset == 0).
; On entry: rsi = src_ptr, rdi = dst_ptr, rdx = dst_pitch, xmm1 = horizontal filter.
.horizontal_only:
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax, dword ptr arg(1)           ; rax = src_pixels_per_line

    ; Two output rows per pass, with the second row's loads interleaved
    ; into the first row's arithmetic.
.horizontal_row_pair:
    movq        xmm2, [rsi]                     ; pixels 0..7
    movq        xmm4, [rsi+1]                   ; pixels 1..8
    punpcklbw   xmm2, xmm4

    movq        xmm3, [rsi+8]                   ; pixels 8..15
    pmaddubsw   xmm2, xmm1
    movq        xmm4, [rsi+9]                   ; pixels 9..16

    lea         rsi, [rsi + rax]                ; rsi -> second row of the pair
    punpcklbw   xmm3, xmm4

    pmaddubsw   xmm3, xmm1
    movq        xmm5, [rsi]                     ; second row, pixels 0..7

    paddw       xmm2, [GLOBAL(rd)]              ; add rounding
    movq        xmm7, [rsi+1]                   ; second row, pixels 1..8

    movq        xmm6, [rsi+8]                   ; second row, pixels 8..15
    psraw       xmm2, VP8_FILTER_SHIFT

    punpcklbw   xmm5, xmm7
    movq        xmm7, [rsi+9]                   ; second row, pixels 9..16

    paddw       xmm3, [GLOBAL(rd)]              ; add rounding
    pmaddubsw   xmm5, xmm1

    psraw       xmm3, VP8_FILTER_SHIFT
    punpcklbw   xmm6, xmm7

    packuswb    xmm2, xmm3
    pmaddubsw   xmm6, xmm1

    movdqa      [rdi], xmm2                     ; store first output row of the pair
    paddw       xmm5, [GLOBAL(rd)]              ; add rounding

    lea         rdi, [rdi + rdx]                ; rdi += dst_pitch
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]              ; add rounding
    psraw       xmm6, VP8_FILTER_SHIFT

    packuswb    xmm5, xmm6
    lea         rsi, [rsi + rax]                ; next source row

    movdqa      [rdi], xmm5                     ; store second output row of the pair
    lea         rdi, [rdi + rdx]                ; rdi += dst_pitch

    cmp         rdi, rcx
    jne         .horizontal_row_pair

.exit:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------------
; vp8_bilinear_predict8x8_ssse3
;
; 8x8 bilinear prediction. xoffset and yoffset each select a 16-byte entry
; of vp8_bilinear_filters_ssse3 (offset * 16). Three code paths:
;   * xoffset != 0, yoffset != 0 : horizontal pass followed by vertical pass
;   * xoffset == 0               : vertical pass only
;   * xoffset != 0, yoffset == 0 : horizontal pass only
; Every filtered value is (sum + 64) >> VP8_FILTER_SHIFT, saturated to a byte.
;
; Nine source rows are first copied, 16 unaligned bytes each, into a
; 144-byte aligned buffer on the stack; all filtering then reads from that
; buffer. Note that 16 bytes are loaded from every source row even though
; only the first 9 are used.
;
; Two of the three paths use rsp itself as the read cursor through the
; buffer. Every path leaves rsp exactly 144 bytes above the buffer start
; before reaching .exit, where "pop rsp" reloads the stack pointer
; (presumably the pre-alignment value pushed by ALIGN_STACK - confirm against
; x86_abi_support.asm).
;
; NOTE(review): with xoffset == 0 and yoffset == 0 the vertical-only path
; would multiply by table entry 0 (128, 0); pmaddubsw treats the filter
; bytes as signed, so 128 acts as -128. Presumably callers never pass both
; offsets as zero - confirm.
;-----------------------------------------------------------------------------
;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int   xoffset,
;    int   yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
global sym(vp8_bilinear_predict8x8_ssse3)
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                        ; 9 rows * 16 bytes of scratch

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]

    mov         rsi, arg(0)                     ; rsi = src_ptr
    movsxd      rdx, dword ptr arg(1)           ; rdx = src_pixels_per_line

    ; Copy 9 unaligned source rows into the aligned stack buffer.
    ; (The original author notes this gives a big performance boost.)
    movdqu      xmm0, [rsi]                     ; row 0
    lea         rax, [rdx + rdx*2]              ; rax = 3 * pitch
    movdqu      xmm1, [rsi+rdx]                 ; row 1
    movdqu      xmm2, [rsi+rdx*2]               ; row 2
    add         rsi, rax
    movdqu      xmm3, [rsi]                     ; row 3
    movdqu      xmm4, [rsi+rdx]                 ; row 4
    movdqu      xmm5, [rsi+rdx*2]               ; row 5
    add         rsi, rax
    movdqu      xmm6, [rsi]                     ; row 6
    movdqu      xmm7, [rsi+rdx]                 ; row 7

    movdqa      XMMWORD PTR [rsp], xmm0

    movdqu      xmm0, [rsi+rdx*2]               ; row 8 (xmm0 reused once row 0 is saved)

    movdqa      XMMWORD PTR [rsp+16], xmm1
    movdqa      XMMWORD PTR [rsp+32], xmm2
    movdqa      XMMWORD PTR [rsp+48], xmm3
    movdqa      XMMWORD PTR [rsp+64], xmm4
    movdqa      XMMWORD PTR [rsp+80], xmm5
    movdqa      XMMWORD PTR [rsp+96], xmm6
    movdqa      XMMWORD PTR [rsp+112], xmm7
    movdqa      XMMWORD PTR [rsp+128], xmm0

    movsxd      rax, dword ptr arg(2)           ; rax = xoffset
    cmp         rax, 0                          ; xoffset == 0: no horizontal pass
    je          .vertical_only

    shl         rax, 4                          ; 16 bytes per filter entry
    add         rax, rcx                        ; rax -> horizontal filter

    mov         rdi, arg(4)                     ; rdi = dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; rdx = dst_pitch

    movdqa      xmm0, [rax]                     ; xmm0 = horizontal filter pairs

    movsxd      rax, dword ptr arg(3)           ; rax = yoffset
    cmp         rax, 0                          ; yoffset == 0: no vertical pass
    je          .horizontal_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; rax -> vertical filter

    lea         rcx, [rdi+rdx*8]                ; rcx = dst_ptr + 8 * dst_pitch (end marker)

    movdqa      xmm1, [rax]                     ; xmm1 = vertical filter pairs

    ; Horizontally filter buffered row 0; keep it packed in xmm7.
    movdqa      xmm3, [rsp]                     ; bytes 0..15 of row 0
    movdqa      xmm5, xmm3

    psrldq      xmm5, 1                         ; bytes 1..15 of row 0
    lea         rsp, [rsp + 16]                 ; cursor -> row 1

    punpcklbw   xmm3, xmm5                      ; (p[i], p[i+1]) pairs for i = 0..7
    pmaddubsw   xmm3, xmm0

    paddw       xmm3, [GLOBAL(rd)]              ; add rounding
    psraw       xmm3, VP8_FILTER_SHIFT

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm7                      ; xmm7 = previous row, 8 filtered bytes

    ; 8 passes, each advancing rsp by 16: 16 + 8*16 = 144 in total.
.both_pass_row:
    movdqa      xmm6, [rsp]                     ; next buffered row
    lea         rsp, [rsp + 16]                 ; advance the cursor

    movdqa      xmm5, xmm6

    psrldq      xmm5, 1

    punpcklbw   xmm6, xmm5                      ; (p[i], p[i+1]) pairs
    pmaddubsw   xmm6, xmm0

    paddw       xmm6, [GLOBAL(rd)]              ; add rounding
    psraw       xmm6, VP8_FILTER_SHIFT

    packuswb    xmm6, xmm6                      ; xmm6 = current row, 8 filtered bytes

    punpcklbw   xmm7, xmm6                      ; (prev, cur) pairs
    pmaddubsw   xmm7, xmm1                      ; vertical pass

    paddw       xmm7, [GLOBAL(rd)]              ; add rounding
    psraw       xmm7, VP8_FILTER_SHIFT

    packuswb    xmm7, xmm7

    movq        [rdi], xmm7                     ; store 8 pixels
    lea         rdi, [rdi + rdx]                ; rdi += dst_pitch

    movdqa      xmm7, xmm6                      ; current row becomes the previous row

    cmp         rdi, rcx
    jne         .both_pass_row

    jmp         .exit

; Vertical filter only (xoffset == 0). On entry rcx -> filter table base.
; Fully unrolled: rows are read at fixed offsets from rsp.
.vertical_only:
    movsxd      rax, dword ptr arg(3)           ; rax = yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]                ; rax -> vertical filter

    mov         rdi, arg(4)                     ; rdi = dst_ptr
    movsxd      rdx, dword ptr arg(5)           ; rdx = dst_pitch

    movdqa      xmm0, [rax]                     ; xmm0 = vertical filter pairs

    movq        xmm1, XMMWORD PTR [rsp]         ; row 0 (low 8 bytes)
    movq        xmm2, XMMWORD PTR [rsp+16]      ; row 1

    movq        xmm3, XMMWORD PTR [rsp+32]      ; row 2
    punpcklbw   xmm1, xmm2                      ; (row0, row1) pairs

    movq        xmm4, XMMWORD PTR [rsp+48]      ; row 3
    punpcklbw   xmm2, xmm3                      ; (row1, row2) pairs

    movq        xmm5, XMMWORD PTR [rsp+64]      ; row 4
    punpcklbw   xmm3, xmm4                      ; (row2, row3) pairs

    movq        xmm6, XMMWORD PTR [rsp+80]      ; row 5
    punpcklbw   xmm4, xmm5                      ; (row3, row4) pairs

    movq        xmm7, XMMWORD PTR [rsp+96]      ; row 6
    punpcklbw   xmm5, xmm6                      ; (row4, row5) pairs

    pmaddubsw   xmm1, xmm0
    pmaddubsw   xmm2, xmm0

    pmaddubsw   xmm3, xmm0
    pmaddubsw   xmm4, xmm0

    pmaddubsw   xmm5, xmm0
    punpcklbw   xmm6, xmm7                      ; (row5, row6) pairs

    pmaddubsw   xmm6, xmm0
    paddw       xmm1, [GLOBAL(rd)]              ; add rounding to each row in turn

    paddw       xmm2, [GLOBAL(rd)]
    psraw       xmm1, VP8_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm5, VP8_FILTER_SHIFT

    psraw       xmm6, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    packuswb    xmm2, xmm2
    movq        [rdi], xmm1                     ; output row 0

    packuswb    xmm3, xmm3
    movq        [rdi+rdx], xmm2                 ; output row 1

    packuswb    xmm4, xmm4
    movq        xmm1, XMMWORD PTR [rsp+112]     ; row 7

    lea         rdi, [rdi + 2*rdx]
    movq        xmm2, XMMWORD PTR [rsp+128]     ; row 8

    packuswb    xmm5, xmm5
    movq        [rdi], xmm3                     ; output row 2

    packuswb    xmm6, xmm6
    movq        [rdi+rdx], xmm4                 ; output row 3

    lea         rdi, [rdi + 2*rdx]
    punpcklbw   xmm7, xmm1                      ; (row6, row7) pairs

    movq        [rdi], xmm5                     ; output row 4
    pmaddubsw   xmm7, xmm0

    movq        [rdi+rdx], xmm6                 ; output row 5
    punpcklbw   xmm1, xmm2                      ; (row7, row8) pairs

    pmaddubsw   xmm1, xmm0
    paddw       xmm7, [GLOBAL(rd)]              ; add rounding

    psraw       xmm7, VP8_FILTER_SHIFT
    paddw       xmm1, [GLOBAL(rd)]              ; add rounding

    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm7, xmm7

    packuswb    xmm1, xmm1
    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm7                     ; output row 6

    movq        [rdi+rdx], xmm1                 ; output row 7
    lea         rsp, [rsp + 144]                ; step over the whole buffer

    jmp         .exit

; Horizontal filter only (yoffset == 0).
; On entry: rdi = dst_ptr, rdx = dst_pitch, xmm0 = horizontal filter.
.horizontal_only:
    lea         rcx, [rdi+rdx*8]                ; rcx = dst_ptr + 8 * dst_pitch (end marker)

    ; Four rows per pass; two passes advance rsp by 2 * 64 = 128.
.horizontal_four_rows:
    movdqa      xmm1, XMMWORD PTR [rsp]         ; row n
    movdqa      xmm3, XMMWORD PTR [rsp+16]      ; row n + 1

    movdqa      xmm2, xmm1
    movdqa      xmm5, XMMWORD PTR [rsp+32]      ; row n + 2

    psrldq      xmm2, 1
    movdqa      xmm7, XMMWORD PTR [rsp+48]      ; row n + 3

    movdqa      xmm4, xmm3
    psrldq      xmm4, 1

    movdqa      xmm6, xmm5
    psrldq      xmm6, 1

    punpcklbw   xmm1, xmm2                      ; (p[i], p[i+1]) pairs, row n
    pmaddubsw   xmm1, xmm0

    punpcklbw   xmm3, xmm4                      ; row n + 1
    pmaddubsw   xmm3, xmm0

    punpcklbw   xmm5, xmm6                      ; row n + 2
    pmaddubsw   xmm5, xmm0

    movdqa      xmm2, xmm7
    psrldq      xmm2, 1

    punpcklbw   xmm7, xmm2                      ; row n + 3
    pmaddubsw   xmm7, xmm0

    paddw       xmm1, [GLOBAL(rd)]              ; add rounding
    psraw       xmm1, VP8_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm7, [GLOBAL(rd)]
    psraw       xmm7, VP8_FILTER_SHIFT

    packuswb    xmm1, xmm1
    packuswb    xmm3, xmm3

    packuswb    xmm5, xmm5
    movq        [rdi], xmm1                     ; output row n

    packuswb    xmm7, xmm7
    movq        [rdi+rdx], xmm3                 ; output row n + 1

    lea         rdi, [rdi + 2*rdx]
    movq        [rdi], xmm5                     ; output row n + 2

    lea         rsp, [rsp + 4*16]               ; cursor past the four consumed rows
    movq        [rdi+rdx], xmm7                 ; output row n + 3

    lea         rdi, [rdi + 2*rdx]

    cmp         rdi, rcx
    jne         .horizontal_four_rows

    lea         rsp, [rsp + 16]                 ; skip unused row 8: 128 + 16 = 144

.exit:
    ;add rsp, 144                               ; (already done by the cursor advances above)
    pop         rsp                             ; reload the stack pointer saved below the buffer
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; Read-only constants used by the routines in this file. Every table that is
; loaded with movdqa starts on a 16-byte boundary.
SECTION_RODATA
align 16
; pshufb masks used by vp8_filter_block1d4_h6_ssse3 on 16 raw source bytes
; loaded from [src - 2] (so byte index n holds pixel n - 2).
; shuf1b pairs pixel (i-2) with pixel (i+3): the k0/k5 operands.
shuf1b:
db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
; shuf2b pairs pixel i with pixel (i+2): the k2/k4 operands.
shuf2b:
db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
; shuf3b pairs pixel (i-1) with pixel (i+1): the k1/k3 operands.
shuf3b:
db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
align 16
; pshufb masks used by the 8- and 16-wide horizontal filters. Their input is
; the punpcklbw interleave of [src - 2] and [src + 3], not raw source bytes.
; shuf2bfrom1 regroups that interleave into pixel i / pixel (i+2) pairs (k2/k4).
shuf2bfrom1:
db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
align 16
; shuf3bfrom1 regroups the same interleave into pixel (i-1) / pixel (i+1) pairs (k1/k3).
shuf3bfrom1:
db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
align 16
; Rounding term: eight words of 0x40 (64), added before the arithmetic shift
; right by 7.
rd:
times 8 dw 0x40
align 16
; Filter tap tables. Each row is one 16-byte entry holding a signed byte pair
; repeated 8 times, laid out for pmaddubsw. The three tables are contiguous
; and 8 entries (128 bytes) each; the code indexes k0_k5 by filter_index * 16
; and reaches k1_k3 at +128 and k2_k4 at +256, so their order and size must
; not change.
; k0_k5: outer taps. An all-zero entry makes the filter routines take their
; 4-tap path (the 16-wide horizontal filter has no such path).
k0_k5:
times 8 db 0, 0 ;placeholder
times 8 db 0, 0
times 8 db 2, 1
times 8 db 0, 0
times 8 db 3, 3
times 8 db 0, 0
times 8 db 1, 2
times 8 db 0, 0
; k1_k3: taps applied to pixel (i-1) / pixel (i+1) (rows B / D in the vertical filters).
k1_k3:
times 8 db 0, 0 ;placeholder
times 8 db -6, 12
times 8 db -11, 36
times 8 db -9, 50
times 8 db -16, 77
times 8 db -6, 93
times 8 db -8, 108
times 8 db -1, 123
; k2_k4: taps applied to pixel i / pixel (i+2) (rows C / E in the vertical filters).
; Note: pmaddubsw reads these bytes as signed, so the 128 in the placeholder
; entry would act as -128 if entry 0 were ever selected.
k2_k4:
times 8 db 128, 0 ;placeholder
times 8 db 123, -1
times 8 db 108, -8
times 8 db 93, -6
times 8 db 77, -16
times 8 db 50, -9
times 8 db 36, -11
times 8 db 12, -6
align 16
; Bilinear filter pairs, one 16-byte entry per sub-pixel offset; each pair
; sums to 128. The table has 16 entries when CONFIG_SIXTEENTH_SUBPEL_UV is
; set and 8 entries otherwise. Entry 0 (128, 0) has the same signed-byte
; caveat as above; the predictors skip a pass whose offset is 0, except that
; the vertical-only path does not check yoffset.
%if CONFIG_SIXTEENTH_SUBPEL_UV
vp8_bilinear_filters_ssse3:
times 8 db 128, 0
times 8 db 120, 8
times 8 db 112, 16
times 8 db 104, 24
times 8 db 96, 32
times 8 db 88, 40
times 8 db 80, 48
times 8 db 72, 56
times 8 db 64, 64
times 8 db 56, 72
times 8 db 48, 80
times 8 db 40, 88
times 8 db 32, 96
times 8 db 24, 104
times 8 db 16, 112
times 8 db 8, 120
%else
vp8_bilinear_filters_ssse3:
times 8 db 128, 0
times 8 db 112, 16
times 8 db 96, 32
times 8 db 80, 48
times 8 db 64, 64
times 8 db 48, 80
times 8 db 32, 96
times 8 db 16, 112
%endif