6035da5448
This is a code snapshot of experimental work currently ongoing for a next-generation codec. The codebase has been cut down considerably from the libvpx baseline. For example, we are currently only supporting VBR 2-pass rate control and have removed most of the code relating to coding speed, threading, error resilience, partitions and various other features. This is in part to make the codebase easier to work on and experiment with, but also because we want to have an open discussion about how the bitstream will be structured and partitioned and not have that conversation constrained by past work. Our basic working pattern has been to initially encapsulate experiments using configure options linked to #IF CONFIG_XXX statements in the code. Once experiments have matured and we are reasonably happy that they give benefit and can be merged without breaking other experiments, we remove the conditional compile statements and merge them in. Current changes include: * Temporal coding experiment for segments (though still only 4 max, it will likely be increased). * Segment feature experiment - to allow various bits of information to be coded at the segment level. Features tested so far include mode and reference frame information, limiting end of block offset and transform size, alongside Q and loop filter parameters, but this set is very fluid. * Support for 8x8 transform - 8x8 dct with 2nd order 2x2 haar is used in MBs using 16x16 prediction modes within inter frames. * Compound prediction (combination of signals from existing predictors to create a new predictor). * 8 tap interpolation filters and 1/8th pel motion vectors. * Loop filter modifications. * Various entropy modifications and changes to how entropy contexts and updates are handled. * Extended quantizer range matched to transform precision improvements. 
There are also ongoing further experiments that we hope to merge in the near future: For example, coding of motion and other aspects of the prediction signal to better support larger image formats, use of larger block sizes (e.g. 32x32 and up) and lossless non-transform based coding options (especially for key frames). It is our hope that we will be able to make regular updates and we will warmly welcome community contributions. Please be warned that, at this stage, the codebase is currently slower than VP8 stable branch as most new code has not been optimized, and even the 'C' has been deliberately written to be simple and obvious, not fast. The following graphs have the initial test results, numbers in the tables measure the compression improvement in terms of percentage. The build has the following optional experiments configured: --enable-experimental --enable-enhanced_interp --enable-uvintra --enable-high_precision_mv --enable-sixteenth_subpel_uv CIF Size clips: http://getwebm.org/tmp/cif/ HD size clips: http://getwebm.org/tmp/hd/ (stable_20120309 represents encoding results of WebM master branch build as of commit#7a15907) They were encoded using the following encode parameters: --good --cpu-used=0 -t 0 --lag-in-frames=25 --min-q=0 --max-q=63 --end-usage=0 --auto-alt-ref=1 -p 2 --pass=2 --kf-max-dist=9999 --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --bias-pct=50 --minsection-pct=0 --maxsection-pct=800 --sharpness=0 --arnr-maxframes=7 --arnr-strength=3(for HD,6 for CIF) --arnr-type=3 Change-Id: I5c62ed09cfff5815a2bb34e7820d6a810c23183c
1528 lines
42 KiB
NASM
1528 lines
42 KiB
NASM
;
|
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license
|
|
; that can be found in the LICENSE file in the root of the source
|
|
; tree. An additional intellectual property rights grant can be found
|
|
; in the file PATENTS. All contributing project authors may
|
|
; be found in the AUTHORS file in the root of the source tree.
|
|
;
|
|
|
|
|
|
%include "vpx_ports/x86_abi_support.asm"
|
|
|
|
; Not referenced in the part of the file reviewed here; presumably the 4x4
; block dimension -- confirm before relying on it.
%define BLOCK_HEIGHT_WIDTH 4
|
|
; Not referenced in the part of the file reviewed here. Note that
; 128 == 1 << VP8_FILTER_SHIFT.
%define VP8_FILTER_WEIGHT 128
|
|
; Arithmetic right-shift count (psraw) applied to each filtered sum after the
; rounding constant at rd has been added.
%define VP8_FILTER_SHIFT 7
|
|
|
|
|
|
;/************************************************************************************
|
|
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
|
|
; input pixel array has output_height rows. This routine assumes that output_height is an
|
|
; even number. This function handles 8 pixels in horizontal direction, calculating ONE
|
|
; row each iteration to take advantage of the 128-bit operations.
|
|
;
|
|
; This is an implementation of some of the SSE optimizations first seen in ffvp8
|
|
;
|
|
;*************************************************************************************/
|
|
;-----------------------------------------------------------------------------
; void vp8_filter_block1d8_h6_ssse3
; (
;     unsigned char *src_ptr,               arg(0)
;     unsigned int   src_pixels_per_line,   arg(1)
;     unsigned char *output_ptr,            arg(2)
;     unsigned int   output_pitch,          arg(3)
;     unsigned int   output_height,         arg(4)
;     unsigned int   vp8_filter_index       arg(5)
; )
;
; Horizontal filter: 8 output bytes per row, one row per loop iteration.
;
; vp8_filter_index * 16 selects one 16-byte coefficient row in each of three
; tables: k0_k5, k0_k5 + 128 (k1_k3) and k0_k5 + 256 (k2_k4). When the first
; dword of the selected k0_k5 row is zero, control transfers to
; vp8_filter_block1d8_h4_ssse3, which omits the k0/k5 multiply.
;
; Every sum has the constant at rd added, is shifted right arithmetically by
; VP8_FILTER_SHIFT and is packed to bytes with unsigned saturation.
;
; Each row reads 8 bytes at src - 2 and 8 bytes at src + 3.
;
; Register roles inside both row loops:
;   rsi = source row          rax = source stride
;   rdi = destination row     rdx = destination stride
;   rcx = rows remaining      xmm7 = rounding constant loaded from rd
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d8_h6_ssse3)
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0, compared with the taps below
    shl         rdx, 4                          ; 16 bytes per coefficient row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax = selected k0_k5 row

    movdqa      xmm7, [GLOBAL(rd)]              ; used by both the 6-tap and 4-tap paths
    mov         rdi, arg(2)                     ; output_ptr, used by both paths

    cmp         esi, DWORD PTR [rax]            ; first four k0/k5 bytes all zero?
    je          vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)           ; output_pitch
    movsxd      rcx, DWORD PTR arg(4)           ; output_height

.b1d8_h6_row:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm2, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0, xmm2                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm1, xmm0                      ; copy to be reshuffled for k2_k4
    movdqa      xmm2, xmm0                      ; copy to be reshuffled for k1_k3

    pmaddubsw   xmm0, xmm4                      ; k0_k5 products
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1, xmm5                      ; k2_k4 products
    pmaddubsw   xmm2, xmm6                      ; k1_k3 products

    ; Saturating adds: the association order below is significant.
    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm7
    paddsw      xmm0, xmm2

    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0

    movq        MMWORD PTR [rdi], xmm0

    lea         rsi, [rsi + rax]                ; next source row
    lea         rdi, [rdi + rdx]                ; next destination row
    dec         rcx
    jnz         .b1d8_h6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Reduced path entered from above when the k0/k5 taps test as zero.
; On entry: rax = selected k0_k5 row, rdi = output_ptr, xmm7 = constant from rd.
vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)           ; output_pitch
    movsxd      rcx, DWORD PTR arg(4)           ; output_height

.b1d8_h4_row:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm1, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0, xmm1                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3                      ; shuf2bfrom1
    pshufb      xmm2, xmm4                      ; shuf3bfrom1
    pmaddubsw   xmm0, xmm5                      ; k2_k4 products
    pmaddubsw   xmm2, xmm6                      ; k1_k3 products

    ; Saturating adds: the association order below is significant.
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2

    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0

    movq        MMWORD PTR [rdi], xmm0

    lea         rsi, [rsi + rax]                ; next source row
    lea         rdi, [rdi + rdx]                ; next destination row
    dec         rcx
    jnz         .b1d8_h4_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
;-----------------------------------------------------------------------------
; void vp8_filter_block1d16_h6_ssse3
; (
;     unsigned char *src_ptr,               arg(0)
;     unsigned int   src_pixels_per_line,   arg(1)
;     unsigned char *output_ptr,            arg(2)
;     unsigned int   output_pitch,          arg(3)
;     unsigned int   output_height,         arg(4)
;     unsigned int   vp8_filter_index       arg(5)
; )
;
; Horizontal filter: 16 output bytes per row, one row per loop iteration.
; Unlike the 8- and 4-wide variants this routine has no reduced-tap shortcut;
; all three coefficient rows (k0_k5, k1_k3 at +128, k2_k4 at +256) are always
; applied.
;
; The row is produced as two 8-pixel halves whose instructions are
; interleaved. Each half is rounded with the constant at rd, shifted right by
; VP8_FILTER_SHIFT and packed with unsigned saturation; the halves are then
; joined and written with one aligned 16-byte store, so every output row must
; be 16-byte aligned.
;
; Each row reads 8 bytes at src - 2, src + 3, src + 6 and src + 11.
;
; Register roles inside the loop:
;   rsi = source row          rax = source stride
;   rdi = destination row     rdx = destination stride
;   rcx = rows remaining      xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3
;   xmm7 is a scratch register here, so rd is read from memory each time.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d16_h6_ssse3)
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    shl         rdx, 4                          ; 16 bytes per coefficient row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax = selected k0_k5 row

    mov         rdi, arg(2)                     ; output_ptr
    mov         rsi, arg(0)                     ; src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
    movsxd      rdx, DWORD PTR arg(3)           ; output_pitch

.b1d16_h6_row:
    ; ---- pixels 0..7 (interleaved with the start of pixels 8..15) ----
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm3, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10
    punpcklbw   xmm0, xmm3                      ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ; k0_k5 products, low half

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    movq        xmm3, MMWORD PTR [rsi + 6]      ; high half: same layout, 8 pixels on

    pmaddubsw   xmm1, xmm5                      ; k2_k4 products, low half
    movq        xmm7, MMWORD PTR [rsi + 11]

    pmaddubsw   xmm2, xmm6                      ; k1_k3 products, low half
    punpcklbw   xmm3, xmm7

    paddsw      xmm0, xmm1                      ; saturating: keep this add order
    movdqa      xmm1, xmm3

    pmaddubsw   xmm3, xmm4                      ; k0_k5 products, high half
    paddsw      xmm0, xmm2

    movdqa      xmm2, xmm1
    paddsw      xmm0, [GLOBAL(rd)]

    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    psraw       xmm0, VP8_FILTER_SHIFT
    pmaddubsw   xmm1, xmm5                      ; k2_k4 products, high half

    pmaddubsw   xmm2, xmm6                      ; k1_k3 products, high half
    packuswb    xmm0, xmm0                      ; low 8 output bytes

    ; ---- finish pixels 8..15 ----
    lea         rsi, [rsi + rax]                ; next source row
    paddsw      xmm3, xmm1                      ; saturating: keep this add order

    paddsw      xmm3, xmm2

    paddsw      xmm3, [GLOBAL(rd)]

    psraw       xmm3, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm3                      ; high 8 output bytes

    punpcklqdq  xmm0, xmm3                      ; low | high

    movdqa      XMMWORD PTR [rdi], xmm0         ; aligned store

    lea         rdi, [rdi + rdx]                ; next destination row
    dec         rcx
    jnz         .b1d16_h6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
;-----------------------------------------------------------------------------
; void vp8_filter_block1d4_h6_ssse3
; (
;     unsigned char *src_ptr,               arg(0)
;     unsigned int   src_pixels_per_line,   arg(1)
;     unsigned char *output_ptr,            arg(2)
;     unsigned int   output_pitch,          arg(3)
;     unsigned int   output_height,         arg(4)
;     unsigned int   vp8_filter_index       arg(5)
; )
;
; Horizontal filter: 4 output bytes per row, one row per loop iteration.
;
; vp8_filter_index * 16 selects one 16-byte coefficient row in each of
; k0_k5, k0_k5 + 128 (k1_k3) and k0_k5 + 256 (k2_k4). When the first dword of
; the selected k0_k5 row is zero, the reduced path at .b1d4_h4_setup is taken
; and the k0/k5 multiply is skipped.
;
; Every sum has the constant at rd added, is shifted right arithmetically by
; VP8_FILTER_SHIFT and is packed to bytes with unsigned saturation.
;
; Each row is fetched with one unaligned 16-byte load at src - 2.
;
; The prolog executes SAVE_XMM 7, so BOTH exits must execute RESTORE_XMM.
;
; Register roles inside both row loops:
;   rsi = source row          rax = source stride
;   rdi = destination row     rdx = destination stride
;   rcx = rows remaining      xmm7 = rounding constant loaded from rd
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d4_h6_ssse3)
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0, compared with the taps below
    shl         rdx, 4                          ; 16 bytes per coefficient row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax = selected k0_k5 row
    movdqa      xmm7, [GLOBAL(rd)]              ; used by both paths

    cmp         esi, DWORD PTR [rax]            ; first four k0/k5 bytes all zero?
    je          .b1d4_h4_setup

    movdqa      xmm4, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
    movsxd      rdx, DWORD PTR arg(3)           ; output_pitch

.b1d4_h6_row:
    movdqu      xmm0, XMMWORD PTR [rsi - 2]     ; 16 source bytes starting at src - 2

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]          ; byte pairs for the k0_k5 multiply

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]          ; byte pairs for the k2_k4 multiply
    pmaddubsw   xmm0, xmm4
    pshufb      xmm2, [GLOBAL(shuf3b)]          ; byte pairs for the k1_k3 multiply
    pmaddubsw   xmm1, xmm5

    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]                ; next source row

    ; Saturating adds: the association order below is significant.
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0

    movd        DWORD PTR [rdi], xmm0

    add         rdi, rdx                        ; next destination row
    dec         rcx
    jnz         .b1d4_h6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM                                 ; pairs with SAVE_XMM 7 in the prolog
    UNSHADOW_ARGS
    pop         rbp
    ret

; Reduced path entered when the k0/k5 taps test as zero.
; On entry: rax = selected k0_k5 row, xmm7 = constant from rd.
.b1d4_h4_setup:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
    movsxd      rdx, DWORD PTR arg(3)           ; output_pitch

.b1d4_h4_row:
    movdqu      xmm1, XMMWORD PTR [rsi - 2]     ; 16 source bytes starting at src - 2

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0                      ; shuf2b
    pshufb      xmm2, xmm3                      ; shuf3b
    pmaddubsw   xmm1, xmm5                      ; k2_k4 products

    pmaddubsw   xmm2, xmm6                      ; k1_k3 products

    lea         rsi, [rsi + rax]                ; next source row

    ; Saturating adds: the association order below is significant.
    paddsw      xmm1, xmm7
    paddsw      xmm1, xmm2
    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1

    add         rdi, rdx                        ; next destination row
    dec         rcx
    jnz         .b1d4_h4_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; void vp8_filter_block1d16_v6_ssse3
; (
;     unsigned char *src_ptr,               arg(0)
;     unsigned int   src_pitch,             arg(1)
;     unsigned char *output_ptr,            arg(2)
;     unsigned int   out_pitch,             arg(3)
;     unsigned int   output_height,         arg(4)
;     unsigned int   vp8_filter_index       arg(5)
; )
;
; Vertical filter: 16 output bytes per row, one row per loop iteration.
;
; Six consecutive source rows are named A..F, with A at src_ptr. Rows are
; interleaved bytewise in pairs (A,F), (B,D), (C,E) and multiplied by the
; coefficient rows k0_k5, k1_k3 (+128) and k2_k4 (+256) selected by
; vp8_filter_index * 16. When the first dword of the selected k0_k5 row is
; zero, the path at .b1d16_v4_setup is used and rows A and F are never read.
;
; Sums are rounded with the constant at rd, shifted right by VP8_FILTER_SHIFT
; and packed with unsigned saturation.
;
; Stores: the 6-tap loop writes two 8-byte halves; the 4-tap loop writes one
; aligned 16-byte vector, so on that path output rows must be 16-byte aligned.
;
; Register roles inside both loops:
;   rsi = row A               rax = row B (rsi + rdx)
;   rdx = source stride       rdi = destination row
;   rcx = rows remaining      r8  = destination stride (64-bit builds only;
;                                   32-bit builds re-read arg(3))
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d16_v6_ssse3)
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0, compared with the taps below
    shl         rdx, 4                          ; 16 bytes per coefficient row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax = selected k0_k5 row

    cmp         esi, DWORD PTR [rax]            ; first four k0/k5 bytes all zero?
    je          .b1d16_v4_setup

    movdqa      xmm5, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    lea         rax, [rsi + rdx]                ; rax = second source row

.b1d16_v6_row:
    ; ---- columns 0..7 ----
    movq        xmm1, MMWORD PTR [rsi]                  ; A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ; F

    pmaddubsw   xmm3, xmm6                              ; k2_k4
    punpcklbw   xmm1, xmm0                              ; A F
    pmaddubsw   xmm2, xmm7                              ; k1_k3
    pmaddubsw   xmm1, xmm5                              ; k0_k5

    ; Saturating adds: the association order below is significant.
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2                  ; left 8 output bytes

    ; ---- columns 8..15 ----
    movq        xmm1, MMWORD PTR [rsi + 8]              ; A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ; F
    pmaddubsw   xmm3, xmm6                              ; k2_k4
    punpcklbw   xmm1, xmm0                              ; A F
    pmaddubsw   xmm2, xmm7                              ; k1_k3
    pmaddubsw   xmm1, xmm5                              ; k0_k5

    ; Saturating adds: the association order below is significant.
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2                ; right 8 output bytes

    add         rsi, rdx                                ; slide the window down one row
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)                   ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .b1d16_v6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Reduced path entered when the k0/k5 taps test as zero.
; On entry: rax = selected k0_k5 row.
.b1d16_v4_setup:
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(2)                     ; output_ptr
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    movsxd      rcx, DWORD PTR arg(4)           ; output_height
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    lea         rax, [rsi + rdx]                ; rax = second source row

.b1d16_v4_row:
    ; ---- columns 0..7 ----
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    pmaddubsw   xmm3, xmm6                              ; k2_k4
    pmaddubsw   xmm2, xmm7                              ; k1_k3

    ; ---- columns 8..15: loads issued before the left half is finished ----
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ; B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ; E

    ; Left half: rounding is added BEFORE the k2_k4 term (saturating adds).
    paddsw      xmm2, [GLOBAL(rd)]
    paddsw      xmm2, xmm3
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                              ; B D
    punpcklbw   xmm1, xmm0                              ; C E

    pmaddubsw   xmm1, xmm6                              ; k2_k4
    pmaddubsw   xmm5, xmm7                              ; k1_k3

    movdqa      xmm4, [GLOBAL(rd)]

    ; Right half: rounding is added AFTER the k2_k4 term (saturating adds).
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4
    psraw       xmm5, VP8_FILTER_SHIFT
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5                              ; left | right

    movdqa      XMMWORD PTR [rdi], xmm2                 ; aligned store

    add         rsi, rdx                                ; slide the window down one row
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)                   ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .b1d16_v4_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
;-----------------------------------------------------------------------------
; void vp8_filter_block1d8_v6_ssse3
; (
;     unsigned char *src_ptr,               arg(0)
;     unsigned int   src_pitch,             arg(1)
;     unsigned char *output_ptr,            arg(2)
;     unsigned int   out_pitch,             arg(3)
;     unsigned int   output_height,         arg(4)
;     unsigned int   vp8_filter_index       arg(5)
; )
;
; Vertical filter: 8 output bytes per row, one row per loop iteration.
;
; Six consecutive source rows are named A..F, with A at src_ptr. Rows are
; interleaved bytewise in pairs (A,F), (B,D), (C,E) and multiplied by the
; coefficient rows k0_k5, k1_k3 (+128) and k2_k4 (+256) selected by
; vp8_filter_index * 16. When the first dword of the selected k0_k5 row is
; zero, the path at .b1d8_v4_setup is used and rows A and F are never read.
;
; Sums are rounded with the constant at rd, shifted right by VP8_FILTER_SHIFT
; and packed with unsigned saturation.
;
; Register roles inside both loops:
;   rsi = row A               rax = row B (rsi + rdx)
;   rdx = source stride       rdi = destination row
;   rcx = rows remaining      r8  = destination stride (64-bit builds only;
;                                   32-bit builds re-read arg(3))
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d8_v6_ssse3)
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0, compared with the taps below
    shl         rdx, 4                          ; 16 bytes per coefficient row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax = selected k0_k5 row

    ; Arguments needed by both paths are fetched before the branch.
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; output_height

    cmp         esi, DWORD PTR [rax]            ; first four k0/k5 bytes all zero?
    je          .b1d8_v4_setup

    movdqa      xmm5, XMMWORD PTR [rax]         ; k0_k5
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4

    mov         rsi, arg(0)                     ; src_ptr
    lea         rax, [rsi + rdx]                ; rax = second source row

.b1d8_v6_row:
    movq        xmm1, MMWORD PTR [rsi]                  ; A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ; F
    movdqa      xmm4, [GLOBAL(rd)]                      ; xmm4 is free again after B D

    pmaddubsw   xmm3, xmm6                              ; k2_k4
    punpcklbw   xmm1, xmm0                              ; A F
    pmaddubsw   xmm2, xmm7                              ; k1_k3
    pmaddubsw   xmm1, xmm5                              ; k0_k5

    ; Saturating adds: the association order below is significant.
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

    add         rsi, rdx                                ; slide the window down one row
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)                   ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .b1d8_v6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Reduced path entered when the k0/k5 taps test as zero.
; On entry: rax = selected k0_k5 row; rdx, rdi, rcx (and r8) already loaded.
.b1d8_v4_setup:
    movdqa      xmm7, XMMWORD PTR [rax+128]     ; k1_k3
    movdqa      xmm6, XMMWORD PTR [rax+256]     ; k2_k4
    movdqa      xmm5, [GLOBAL(rd)]              ; rounding constant kept in a register

    mov         rsi, arg(0)                     ; src_ptr
    lea         rax, [rsi + rdx]                ; rax = second source row

.b1d8_v4_row:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ; B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ; C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ; D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ; E

    punpcklbw   xmm2, xmm4                              ; B D
    punpcklbw   xmm3, xmm0                              ; C E

    pmaddubsw   xmm3, xmm6                              ; k2_k4
    pmaddubsw   xmm2, xmm7                              ; k1_k3

    ; Saturating adds: the association order below is significant.
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

    add         rsi, rdx                                ; slide the window down one row
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)                   ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .b1d8_v4_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
;-----------------------------------------------------------------------------
; void vp8_filter_block1d4_v6_ssse3
; (
;     unsigned char *src_ptr,               arg(0)
;     unsigned int   src_pitch,             arg(1)
;     unsigned char *output_ptr,            arg(2)
;     unsigned int   out_pitch,             arg(3)
;     unsigned int   output_height,         arg(4)
;     unsigned int   vp8_filter_index       arg(5)
; )
;
; Vertical filter: 4 output bytes per row, one row per loop iteration.
;
; This variant works entirely in the 64-bit MMX registers (mm0..mm7), so it
; has no SAVE_XMM / RESTORE_XMM pair. Only the low 8 bytes of each 16-byte
; coefficient row are loaded.
; NOTE(review): the routine never executes emms; presumably callers reset the
; MMX/x87 state themselves -- confirm.
;
; Six consecutive source rows are named A..F, with A at src_ptr. Rows are
; interleaved bytewise in pairs (A,F), (B,D), (C,E) and multiplied by
; k0_k5, k1_k3 (+128) and k2_k4 (+256), selected by vp8_filter_index * 16.
; When the first dword of the selected k0_k5 row is zero, the path at
; .b1d4_v4_setup is used and rows A and F are never read.
;
; Sums are rounded with the constant at rd, shifted right by VP8_FILTER_SHIFT
; and packed with unsigned saturation.
;
; Register roles inside both loops:
;   rsi = row A               rax = row B (rsi + rdx)
;   rdx = source stride       rdi = destination row
;   rcx = rows remaining      r8  = destination stride (64-bit builds only;
;                                   32-bit builds re-read arg(3))
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d4_v6_ssse3)
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ; vp8_filter_index
    xor         rsi, rsi                        ; esi = 0, compared with the taps below
    shl         rdx, 4                          ; 16 bytes per coefficient row

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ; rax = selected k0_k5 row

    ; Arguments needed by both paths are fetched before the branch.
    movsxd      rdx, DWORD PTR arg(1)           ; src_pitch
    mov         rdi, arg(2)                     ; output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ; output_height

    cmp         esi, DWORD PTR [rax]            ; first four k0/k5 bytes all zero?
    je          .b1d4_v4_setup

    movq        mm5, MMWORD PTR [rax]           ; k0_k5
    movq        mm7, MMWORD PTR [rax+128]       ; k1_k3
    movq        mm6, MMWORD PTR [rax+256]       ; k2_k4

    mov         rsi, arg(0)                     ; src_ptr
    lea         rax, [rsi + rdx]                ; rax = second source row

.b1d4_v6_row:
    movd        mm1, DWORD PTR [rsi]                    ; A
    movd        mm2, DWORD PTR [rsi + rdx]              ; B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ; C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ; D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ; E

    punpcklbw   mm2, mm4                                ; B D
    punpcklbw   mm3, mm0                                ; C E

    movd        mm0, DWORD PTR [rax + rdx * 4]          ; F

    movq        mm4, [GLOBAL(rd)]                       ; mm4 is free again after B D

    pmaddubsw   mm3, mm6                                ; k2_k4
    punpcklbw   mm1, mm0                                ; A F
    pmaddubsw   mm2, mm7                                ; k1_k3
    pmaddubsw   mm1, mm5                                ; k0_k5

    ; Saturating adds: the association order below is significant.
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

    add         rsi, rdx                                ; slide the window down one row
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)                   ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .b1d4_v6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; Reduced path entered when the k0/k5 taps test as zero.
; On entry: rax = selected k0_k5 row; rdx, rdi, rcx (and r8) already loaded.
.b1d4_v4_setup:
    movq        mm7, MMWORD PTR [rax+128]       ; k1_k3
    movq        mm6, MMWORD PTR [rax+256]       ; k2_k4
    movq        mm5, MMWORD PTR [GLOBAL(rd)]    ; rounding constant kept in a register

    mov         rsi, arg(0)                     ; src_ptr
    lea         rax, [rsi + rdx]                ; rax = second source row

.b1d4_v4_row:
    movd        mm2, DWORD PTR [rsi + rdx]              ; B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ; C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ; D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ; E

    punpcklbw   mm2, mm4                                ; B D
    punpcklbw   mm3, mm0                                ; C E

    pmaddubsw   mm3, mm6                                ; k2_k4
    pmaddubsw   mm2, mm7                                ; k1_k3

    ; Saturating adds: the association order below is significant.
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

    add         rsi, rdx                                ; slide the window down one row
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)                   ; out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .b1d4_v4_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
;-----------------------------------------------------------------------------
; void vp8_bilinear_predict16x16_ssse3
; (
;     unsigned char *src_ptr,               arg(0)
;     int            src_pixels_per_line,   arg(1)
;     int            xoffset,               arg(2)
;     int            yoffset,               arg(3)
;     unsigned char *dst_ptr,               arg(4)
;     int            dst_pitch              arg(5)
; )
;
; Two-tap (bilinear) prediction of a 16x16 block. xoffset and yoffset each
; select a 16-byte row of vp8_bilinear_filters_ssse3 (offset * 16).
;
; Three code paths:
;   xoffset != 0, yoffset != 0 : horizontal pass, then vertical pass on the
;                                horizontally filtered rows (falls through
;                                from the entry code)
;   xoffset == 0               : .b16x16_sp_only, vertical pass only
;   xoffset != 0, yoffset == 0 : .b16x16_fp_only, horizontal pass only
;
; Every pass adds the constant at rd, shifts right by VP8_FILTER_SHIFT and
; packs to bytes with unsigned saturation.
;
; All destination stores are aligned 16-byte stores, so every destination row
; must be 16-byte aligned. Each loop ends when rdi reaches
; dst_ptr + 16 * dst_pitch (held in rcx).
;
; The horizontal pass reads 8 bytes at src, src+1, src+8 and src+9 per row;
; the vertical pass needs one source row more than it writes.
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict16x16_ssse3)
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, DWORD PTR arg(2)           ; xoffset

    test        rax, rax                        ; xoffset == 0: no horizontal pass
    je          .b16x16_sp_only

    shl         rax, 4
    lea         rax, [rax + rcx]                ; rax = horizontal filter row

    mov         rdi, arg(4)                     ; dst_ptr
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, DWORD PTR arg(5)           ; dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = horizontal filter

    movsxd      rax, DWORD PTR arg(3)           ; yoffset

    test        rax, rax                        ; yoffset == 0: no vertical pass
    je          .b16x16_fp_only

    ; ------------------------------------------------------------------
    ; Both passes. xmm1 = horizontal filter, xmm2 = vertical filter,
    ; xmm7 = previous horizontally filtered row (16 bytes).
    ; ------------------------------------------------------------------
    shl         rax, 4
    lea         rax, [rax + rcx]                ; rax = vertical filter row

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rdx, DWORD PTR arg(1)           ; rdx now = src_pixels_per_line

    movdqa      xmm2, [rax]                     ; xmm2 = vertical filter

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(5)            ; dst_pitch (rdx was reused above)
%endif
    ; Horizontally filter source row 0 to prime xmm7.
    movq        xmm3, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3, xmm5                      ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rdx]                ; next source row

    pmaddubsw   xmm3, xmm1                      ; filtered words for pixels 0..7

    punpcklbw   xmm4, xmm5                      ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4, xmm1                      ; filtered words for pixels 8..15

    paddw       xmm3, [GLOBAL(rd)]              ; round
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]              ; round
    psraw       xmm4, VP8_FILTER_SHIFT

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4                      ; 16 filtered bytes of row 0

.b16x16_both_row:
    ; Horizontal pass on the next source row -> xmm6.
    movq        xmm6, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6, xmm5
    movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16
    lea         rsi, [rsi + rdx]                ; next source row

    pmaddubsw   xmm6, xmm1

    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, xmm1

    paddw       xmm6, [GLOBAL(rd)]              ; round
    psraw       xmm6, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]              ; round
    psraw       xmm4, VP8_FILTER_SHIFT

    packuswb    xmm6, xmm4                      ; 16 filtered bytes of the new row
    movdqa      xmm5, xmm7

    ; Vertical pass between the previous row (xmm7) and the new row (xmm6).
    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm2

    punpckhbw   xmm7, xmm6
    pmaddubsw   xmm7, xmm2

    paddw       xmm5, [GLOBAL(rd)]              ; round
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm7, [GLOBAL(rd)]              ; round
    psraw       xmm7, VP8_FILTER_SHIFT

    packuswb    xmm5, xmm7
    movdqa      xmm7, xmm6                      ; new row becomes the previous row

    movdqa      [rdi], xmm5                     ; aligned store of one output row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ; dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .b16x16_both_row

    jmp         .b16x16_done

    ; ------------------------------------------------------------------
    ; Vertical pass only. Entered with rcx = filter table base.
    ; Two output rows per iteration; xmm4 / xmm2 carry the left / right
    ; 8 bytes of the current top source row between iterations.
    ; ------------------------------------------------------------------
.b16x16_sp_only:
    movsxd      rax, DWORD PTR arg(3)           ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]                ; rax = vertical filter row

    mov         rdi, arg(4)                     ; dst_ptr
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, DWORD PTR arg(5)           ; dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = vertical filter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line

    movq        xmm4, [rsi]                     ; row 0, bytes 0..7
    movq        xmm2, [rsi + 8]                 ; row 0, bytes 8..15

    lea         rsi, [rsi + rax]                ; rsi -> row 1
.b16x16_sp_row:
    movq        xmm3, [rsi]                     ; row + 1, bytes 0..7
    movq        xmm5, [rsi + 8]                 ; row + 1, bytes 8..15

    punpcklbw   xmm4, xmm3
    punpcklbw   xmm2, xmm5

    pmaddubsw   xmm4, xmm1
    movq        xmm7, [rsi + rax]               ; row + 2, bytes 0..7

    pmaddubsw   xmm2, xmm1
    movq        xmm6, [rsi + rax + 8]           ; row + 2, bytes 8..15

    punpcklbw   xmm3, xmm7
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm3, xmm1
    paddw       xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm5, xmm1
    paddw       xmm2, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm2
    paddw       xmm3, [GLOBAL(rd)]

    movdqa      [rdi], xmm4                     ; first output row of the pair
    paddw       xmm5, [GLOBAL(rd)]

    psraw       xmm3, VP8_FILTER_SHIFT
    psraw       xmm5, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm5
    movdqa      xmm4, xmm7                      ; row + 2 becomes the next top row (left)

    movdqa      [rdi + rdx], xmm3               ; second output row of the pair
    lea         rsi, [rsi + 2*rax]

    movdqa      xmm2, xmm6                      ; row + 2 becomes the next top row (right)
    lea         rdi, [rdi + 2*rdx]

    cmp         rdi, rcx
    jne         .b16x16_sp_row

    jmp         .b16x16_done

    ; ------------------------------------------------------------------
    ; Horizontal pass only. Entered with rdi = dst_ptr, rsi = src_ptr,
    ; rdx = dst_pitch, xmm1 = horizontal filter.
    ; Two output rows per iteration.
    ; ------------------------------------------------------------------
.b16x16_fp_only:
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch
    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line

.b16x16_fp_row:
    movq        xmm2, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm4, [rsi+1]                   ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2, xmm4
    movq        xmm3, [rsi+8]                   ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2, xmm1
    movq        xmm4, [rsi+9]                   ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rax]                ; rsi -> second row of the pair
    punpcklbw   xmm3, xmm4

    pmaddubsw   xmm3, xmm1
    movq        xmm5, [rsi]

    paddw       xmm2, [GLOBAL(rd)]
    movq        xmm7, [rsi+1]

    movq        xmm6, [rsi+8]
    psraw       xmm2, VP8_FILTER_SHIFT

    punpcklbw   xmm5, xmm7
    movq        xmm7, [rsi+9]

    paddw       xmm3, [GLOBAL(rd)]
    pmaddubsw   xmm5, xmm1

    psraw       xmm3, VP8_FILTER_SHIFT
    punpcklbw   xmm6, xmm7

    packuswb    xmm2, xmm3
    pmaddubsw   xmm6, xmm1

    movdqa      [rdi], xmm2                     ; first output row of the pair
    paddw       xmm5, [GLOBAL(rd)]

    lea         rdi, [rdi + rdx]                ; advance by dst_pitch
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm6, VP8_FILTER_SHIFT

    packuswb    xmm5, xmm6
    lea         rsi, [rsi + rax]                ; next source row

    movdqa      [rdi], xmm5                     ; second output row of the pair
    lea         rdi, [rdi + rdx]                ; advance by dst_pitch

    cmp         rdi, rcx

    jne         .b16x16_fp_row

.b16x16_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
;void vp8_bilinear_predict8x8_ssse3
|
|
;(
|
|
; unsigned char *src_ptr,
|
|
; int src_pixels_per_line,
|
|
; int xoffset,
|
|
; int yoffset,
|
|
; unsigned char *dst_ptr,
|
|
; int dst_pitch
|
|
;)
|
|
global sym(vp8_bilinear_predict8x8_ssse3)
|
|
sym(vp8_bilinear_predict8x8_ssse3):
|
|
push rbp
|
|
mov rbp, rsp
|
|
SHADOW_ARGS_TO_STACK 6
|
|
SAVE_XMM 7
|
|
GET_GOT rbx
|
|
push rsi
|
|
push rdi
|
|
; end prolog
|
|
|
|
ALIGN_STACK 16, rax
|
|
sub rsp, 144 ; reserve 144 bytes
|
|
|
|
lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
|
|
|
|
mov rsi, arg(0) ;src_ptr
|
|
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
|
|
|
|
;Read 9-line unaligned data in and put them on stack. This gives a big
|
|
;performance boost.
|
|
movdqu xmm0, [rsi]
|
|
lea rax, [rdx + rdx*2]
|
|
movdqu xmm1, [rsi+rdx]
|
|
movdqu xmm2, [rsi+rdx*2]
|
|
add rsi, rax
|
|
movdqu xmm3, [rsi]
|
|
movdqu xmm4, [rsi+rdx]
|
|
movdqu xmm5, [rsi+rdx*2]
|
|
add rsi, rax
|
|
movdqu xmm6, [rsi]
|
|
movdqu xmm7, [rsi+rdx]
|
|
|
|
movdqa XMMWORD PTR [rsp], xmm0
|
|
|
|
movdqu xmm0, [rsi+rdx*2]
|
|
|
|
movdqa XMMWORD PTR [rsp+16], xmm1
|
|
movdqa XMMWORD PTR [rsp+32], xmm2
|
|
movdqa XMMWORD PTR [rsp+48], xmm3
|
|
movdqa XMMWORD PTR [rsp+64], xmm4
|
|
movdqa XMMWORD PTR [rsp+80], xmm5
|
|
movdqa XMMWORD PTR [rsp+96], xmm6
|
|
movdqa XMMWORD PTR [rsp+112], xmm7
|
|
movdqa XMMWORD PTR [rsp+128], xmm0
|
|
|
|
movsxd rax, dword ptr arg(2) ; xoffset
|
|
cmp rax, 0 ; skip first_pass filter if xoffset=0
|
|
je .b8x8_sp_only
|
|
|
|
shl rax, 4
|
|
add rax, rcx ; HFilter
|
|
|
|
mov rdi, arg(4) ; dst_ptr
|
|
movsxd rdx, dword ptr arg(5) ; dst_pitch
|
|
|
|
movdqa xmm0, [rax]
|
|
|
|
movsxd rax, dword ptr arg(3) ; yoffset
|
|
cmp rax, 0 ; skip second_pass filter if yoffset=0
|
|
je .b8x8_fp_only
|
|
|
|
shl rax, 4
|
|
lea rax, [rax + rcx] ; VFilter
|
|
|
|
lea rcx, [rdi+rdx*8]
|
|
|
|
movdqa xmm1, [rax]
|
|
|
|
; get the first horizontal line done
|
|
movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
|
|
movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
|
|
|
|
psrldq xmm5, 1
|
|
lea rsp, [rsp + 16] ; next line
|
|
|
|
punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
|
|
pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
|
|
|
|
paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
|
|
psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
|
|
|
|
movdqa xmm7, xmm3
|
|
packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
|
|
|
|
.next_row:
|
|
movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
|
|
lea rsp, [rsp + 16] ; next line
|
|
|
|
movdqa xmm5, xmm6
|
|
|
|
psrldq xmm5, 1
|
|
|
|
punpcklbw xmm6, xmm5
|
|
pmaddubsw xmm6, xmm0
|
|
|
|
paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
|
|
psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
|
|
|
|
packuswb xmm6, xmm6
|
|
|
|
punpcklbw xmm7, xmm6
|
|
pmaddubsw xmm7, xmm1
|
|
|
|
paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
|
|
psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
|
|
|
|
packuswb xmm7, xmm7
|
|
|
|
movq [rdi], xmm7 ; store the results in the destination
|
|
lea rdi, [rdi + rdx]
|
|
|
|
movdqa xmm7, xmm6
|
|
|
|
cmp rdi, rcx
|
|
jne .next_row
|
|
|
|
jmp .done8x8
|
|
|
|
.b8x8_sp_only:
|
|
movsxd rax, dword ptr arg(3) ; yoffset
|
|
shl rax, 4
|
|
lea rax, [rax + rcx] ; VFilter
|
|
|
|
mov rdi, arg(4) ;dst_ptr
|
|
movsxd rdx, dword ptr arg(5) ; dst_pitch
|
|
|
|
movdqa xmm0, [rax] ; VFilter
|
|
|
|
movq xmm1, XMMWORD PTR [rsp]
|
|
movq xmm2, XMMWORD PTR [rsp+16]
|
|
|
|
movq xmm3, XMMWORD PTR [rsp+32]
|
|
punpcklbw xmm1, xmm2
|
|
|
|
movq xmm4, XMMWORD PTR [rsp+48]
|
|
punpcklbw xmm2, xmm3
|
|
|
|
movq xmm5, XMMWORD PTR [rsp+64]
|
|
punpcklbw xmm3, xmm4
|
|
|
|
movq xmm6, XMMWORD PTR [rsp+80]
|
|
punpcklbw xmm4, xmm5
|
|
|
|
movq xmm7, XMMWORD PTR [rsp+96]
|
|
punpcklbw xmm5, xmm6
|
|
|
|
pmaddubsw xmm1, xmm0
|
|
pmaddubsw xmm2, xmm0
|
|
|
|
pmaddubsw xmm3, xmm0
|
|
pmaddubsw xmm4, xmm0
|
|
|
|
pmaddubsw xmm5, xmm0
|
|
punpcklbw xmm6, xmm7
|
|
|
|
pmaddubsw xmm6, xmm0
|
|
paddw xmm1, [GLOBAL(rd)]
|
|
|
|
paddw xmm2, [GLOBAL(rd)]
|
|
psraw xmm1, VP8_FILTER_SHIFT
|
|
|
|
paddw xmm3, [GLOBAL(rd)]
|
|
psraw xmm2, VP8_FILTER_SHIFT
|
|
|
|
paddw xmm4, [GLOBAL(rd)]
|
|
psraw xmm3, VP8_FILTER_SHIFT
|
|
|
|
paddw xmm5, [GLOBAL(rd)]
|
|
psraw xmm4, VP8_FILTER_SHIFT
|
|
|
|
paddw xmm6, [GLOBAL(rd)]
|
|
psraw xmm5, VP8_FILTER_SHIFT
|
|
|
|
psraw xmm6, VP8_FILTER_SHIFT
|
|
packuswb xmm1, xmm1
|
|
|
|
packuswb xmm2, xmm2
|
|
movq [rdi], xmm1
|
|
|
|
packuswb xmm3, xmm3
|
|
movq [rdi+rdx], xmm2
|
|
|
|
packuswb xmm4, xmm4
|
|
movq xmm1, XMMWORD PTR [rsp+112]
|
|
|
|
lea rdi, [rdi + 2*rdx]
|
|
movq xmm2, XMMWORD PTR [rsp+128]
|
|
|
|
packuswb xmm5, xmm5
|
|
movq [rdi], xmm3
|
|
|
|
packuswb xmm6, xmm6
|
|
movq [rdi+rdx], xmm4
|
|
|
|
lea rdi, [rdi + 2*rdx]
|
|
punpcklbw xmm7, xmm1
|
|
|
|
movq [rdi], xmm5
|
|
pmaddubsw xmm7, xmm0
|
|
|
|
movq [rdi+rdx], xmm6
|
|
punpcklbw xmm1, xmm2
|
|
|
|
pmaddubsw xmm1, xmm0
|
|
paddw xmm7, [GLOBAL(rd)]
|
|
|
|
psraw xmm7, VP8_FILTER_SHIFT
|
|
paddw xmm1, [GLOBAL(rd)]
|
|
|
|
psraw xmm1, VP8_FILTER_SHIFT
|
|
packuswb xmm7, xmm7
|
|
|
|
packuswb xmm1, xmm1
|
|
lea rdi, [rdi + 2*rdx]
|
|
|
|
movq [rdi], xmm7
|
|
|
|
movq [rdi+rdx], xmm1
|
|
lea rsp, [rsp + 144]
|
|
|
|
jmp .done8x8
|
|
|
|
.b8x8_fp_only:
|
|
lea rcx, [rdi+rdx*8]
|
|
|
|
.next_row_fp:
|
|
movdqa xmm1, XMMWORD PTR [rsp]
|
|
movdqa xmm3, XMMWORD PTR [rsp+16]
|
|
|
|
movdqa xmm2, xmm1
|
|
movdqa xmm5, XMMWORD PTR [rsp+32]
|
|
|
|
psrldq xmm2, 1
|
|
movdqa xmm7, XMMWORD PTR [rsp+48]
|
|
|
|
movdqa xmm4, xmm3
|
|
psrldq xmm4, 1
|
|
|
|
movdqa xmm6, xmm5
|
|
psrldq xmm6, 1
|
|
|
|
punpcklbw xmm1, xmm2
|
|
pmaddubsw xmm1, xmm0
|
|
|
|
punpcklbw xmm3, xmm4
|
|
pmaddubsw xmm3, xmm0
|
|
|
|
punpcklbw xmm5, xmm6
|
|
pmaddubsw xmm5, xmm0
|
|
|
|
movdqa xmm2, xmm7
|
|
psrldq xmm2, 1
|
|
|
|
punpcklbw xmm7, xmm2
|
|
pmaddubsw xmm7, xmm0
|
|
|
|
paddw xmm1, [GLOBAL(rd)]
|
|
psraw xmm1, VP8_FILTER_SHIFT
|
|
|
|
paddw xmm3, [GLOBAL(rd)]
|
|
psraw xmm3, VP8_FILTER_SHIFT
|
|
|
|
paddw xmm5, [GLOBAL(rd)]
|
|
psraw xmm5, VP8_FILTER_SHIFT
|
|
|
|
paddw xmm7, [GLOBAL(rd)]
|
|
psraw xmm7, VP8_FILTER_SHIFT
|
|
|
|
packuswb xmm1, xmm1
|
|
packuswb xmm3, xmm3
|
|
|
|
packuswb xmm5, xmm5
|
|
movq [rdi], xmm1
|
|
|
|
packuswb xmm7, xmm7
|
|
movq [rdi+rdx], xmm3
|
|
|
|
lea rdi, [rdi + 2*rdx]
|
|
movq [rdi], xmm5
|
|
|
|
lea rsp, [rsp + 4*16]
|
|
movq [rdi+rdx], xmm7
|
|
|
|
lea rdi, [rdi + 2*rdx]
|
|
cmp rdi, rcx
|
|
|
|
jne .next_row_fp
|
|
|
|
lea rsp, [rsp + 16]
|
|
|
|
.done8x8:
|
|
;add rsp, 144
|
|
pop rsp
|
|
; begin epilog
|
|
pop rdi
|
|
pop rsi
|
|
RESTORE_GOT
|
|
RESTORE_XMM
|
|
UNSHADOW_ARGS
|
|
pop rbp
|
|
ret
|
|
|
|
SECTION_RODATA

;-----------------------------------------------------------------------
; Byte-index tables, 16 bytes each, stored back to back.
; Only shuf1b carries an explicit alignment directive; the two tables
; behind it stay 16-byte aligned because each table is exactly 16 bytes.
;
; Every table lists eight (a, b) index pairs for i = 0..7:
;   shuf1b : (i,     i + 5)
;   shuf2b : (i + 2, i + 4)
;   shuf3b : (i + 1, i + 3)
;
; NOTE(review): the code consuming these tables is outside this view;
; presumably they are byte-shuffle control masks that pair up the pixels
; multiplied by the k0_k5 / k2_k4 / k1_k3 coefficient rows - confirm
; against the six-tap routines.
;-----------------------------------------------------------------------
align 16
shuf1b:
    db  0,  5,  1,  6,  2,  7,  3,  8
    db  4,  9,  5, 10,  6, 11,  7, 12
shuf2b:
    db  2,  4,  3,  5,  4,  6,  5,  7
    db  6,  8,  7,  9,  8, 10,  9, 11
shuf3b:
    db  1,  3,  2,  4,  3,  5,  4,  6
    db  5,  7,  6,  8,  7,  9,  8, 10
|
|
|
|
;-----------------------------------------------------------------------
; Byte-index tables that address data which has ALREADY been rearranged
; by shuf1b.  Composing the tables shows the relationship:
;   shuf1b[shuf2bfrom1[n]] == shuf2b[n]   for n = 0..15
;   shuf1b[shuf3bfrom1[n]] == shuf3b[n]   for n = 0..15
; so each one yields the shuf2b / shuf3b arrangement starting from a
; shuf1b-ordered register instead of from the original byte order.
;
; NOTE(review): the instructions that use these tables are outside this
; view - confirm the intended usage at the call sites.
;-----------------------------------------------------------------------
align 16
shuf2bfrom1:
    db  4,  8,  6,  1,  8,  3,  1,  5
    db  3,  7,  5,  9,  7, 11,  9, 13

align 16
shuf3bfrom1:
    db  2,  6,  4,  8,  6,  1,  8,  3
    db  1,  5,  3,  7,  5,  9,  7, 11
|
|
|
|
;-----------------------------------------------------------------------
; Rounding constant: eight 16-bit lanes, each holding 64 (0x40).
; The filter code in this file adds it with paddw immediately before the
; arithmetic right shift by VP8_FILTER_SHIFT; the comments at those
; sites describe that shift as a division by 128, and 64 is half of 128.
; It is read as a memory operand of paddw, hence the 16-byte alignment.
;-----------------------------------------------------------------------
align 16
rd:
    times 8 dw 64                       ; 0x0040 in every word lane
|
|
|
|
;-----------------------------------------------------------------------
; k0_k5: eight rows of 16 bytes.  Each row is one (first, second) byte
; pair repeated 8 times.  The table is 128 bytes long, so the tables
; placed directly behind it (k1_k3, k2_k4) remain 16-byte aligned
; without alignment directives of their own.
;
; NOTE(review): judging by the label, the pair holds taps 0 and 5 of a
; six-tap filter, with the row selected by the sub-pixel offset; the
; code reading this table is outside this view - confirm.
;-----------------------------------------------------------------------
align 16
k0_k5:
    times 8 db   0,   0                 ; row 0 - placeholder
    times 8 db   0,   0                 ; row 1
    times 8 db   2,   1                 ; row 2
    times 8 db   0,   0                 ; row 3
    times 8 db   3,   3                 ; row 4
    times 8 db   0,   0                 ; row 5
    times 8 db   1,   2                 ; row 6
    times 8 db   0,   0                 ; row 7
|
|
;-----------------------------------------------------------------------
; k1_k3: eight rows of 16 bytes, each row one (first, second) byte pair
; repeated 8 times.  The first value of every non-placeholder row is
; negative, so the bytes are meant to be read as signed.
; There is deliberately no alignment directive here: the table sits
; directly behind k0_k5, which is aligned and 128 bytes long.
;
; NOTE(review): judging by the label, the pair holds taps 1 and 3 of a
; six-tap filter; the code reading this table is outside this view -
; confirm.
;-----------------------------------------------------------------------
k1_k3:
    times 8 db   0,   0                 ; row 0 - placeholder
    times 8 db  -6,  12                 ; row 1
    times 8 db -11,  36                 ; row 2
    times 8 db  -9,  50                 ; row 3
    times 8 db -16,  77                 ; row 4
    times 8 db  -6,  93                 ; row 5
    times 8 db  -8, 108                 ; row 6
    times 8 db  -1, 123                 ; row 7
|
|
;-----------------------------------------------------------------------
; k2_k4: eight rows of 16 bytes, each row one (first, second) byte pair
; repeated 8 times.  No alignment directive of its own: it follows the
; 128-byte k0_k5 and k1_k3 tables, the first of which is aligned.
;
; For every row index, the six values taken from the same row of k0_k5,
; k1_k3 and k2_k4 add up to 128 (e.g. row 2: 2 + 1 - 11 + 36 + 108 - 8).
;
; NOTE(review): judging by the label, the pair holds taps 2 and 4 of a
; six-tap filter; the code reading this table is outside this view -
; confirm.
;-----------------------------------------------------------------------
k2_k4:
    times 8 db 128,   0                 ; row 0 - placeholder
    times 8 db 123,  -1                 ; row 1
    times 8 db 108,  -8                 ; row 2
    times 8 db  93,  -6                 ; row 3
    times 8 db  77, -16                 ; row 4
    times 8 db  50,  -9                 ; row 5
    times 8 db  36, -11                 ; row 6
    times 8 db  12,  -6                 ; row 7
|
|
;-----------------------------------------------------------------------
; vp8_bilinear_filters_ssse3: two-tap weight table.
; Each row is 16 bytes: one (w0, w1) byte pair repeated 8 times, with
; w0 + w1 == 128 in every row.
;
;   CONFIG_SIXTEENTH_SUBPEL_UV != 0 : 16 rows, row n = (128 - 8n,  8n)
;   otherwise                       :  8 rows, row n = (128 - 16n, 16n)
;
; The label is emitted once, ahead of the conditional, so it marks the
; start of whichever variant is assembled.
;
; NOTE(review): the 16-byte row size matches a lookup of the form
; table + (offset << 4), as done by the 8x8 bilinear code in this file;
; the instruction that loads the table base is outside this view -
; confirm.
;-----------------------------------------------------------------------
align 16
vp8_bilinear_filters_ssse3:
%if CONFIG_SIXTEENTH_SUBPEL_UV
    times 8 db 128,   0                 ; row  0
    times 8 db 120,   8                 ; row  1
    times 8 db 112,  16                 ; row  2
    times 8 db 104,  24                 ; row  3
    times 8 db  96,  32                 ; row  4
    times 8 db  88,  40                 ; row  5
    times 8 db  80,  48                 ; row  6
    times 8 db  72,  56                 ; row  7
    times 8 db  64,  64                 ; row  8
    times 8 db  56,  72                 ; row  9
    times 8 db  48,  80                 ; row 10
    times 8 db  40,  88                 ; row 11
    times 8 db  32,  96                 ; row 12
    times 8 db  24, 104                 ; row 13
    times 8 db  16, 112                 ; row 14
    times 8 db   8, 120                 ; row 15
%else
    times 8 db 128,   0                 ; row 0
    times 8 db 112,  16                 ; row 1
    times 8 db  96,  32                 ; row 2
    times 8 db  80,  48                 ; row 3
    times 8 db  64,  64                 ; row 4
    times 8 db  48,  80                 ; row 5
    times 8 db  32,  96                 ; row 6
    times 8 db  16, 112                 ; row 7
%endif
|
|
|