879cb7d962
Incorporate vp9-preview changes by merging master branch into experimental. Conflicts: test/test.mk vp9/common/vp9_filter.c vp9/common/vp9_idctllm.c vp9/common/vp9_invtrans.h vp9/common/vp9_mbpitch.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_systemdependent.h vp9/common/vp9_type_aliases.h vp9/common/x86/vp9_asm_stubs.c vp9/common/x86/vp9_subpixel_mmx.asm vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_dequantize.c vp9/decoder/vp9_dequantize.h vp9/decoder/vp9_onyxd_int.h vp9/encoder/vp9_bitstream.c vp9/encoder/vp9_encodeframe.c vp9/encoder/vp9_rdopt.c Change-Id: I17f51c3666d1b59cf1a699f87607cbc5d30a87c5
269 lines
8.1 KiB
NASM
269 lines
8.1 KiB
NASM
;
|
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
|
;
|
|
; Use of this source code is governed by a BSD-style license
|
|
; that can be found in the LICENSE file in the root of the source
|
|
; tree. An additional intellectual property rights grant can be found
|
|
; in the file PATENTS. All contributing project authors may
|
|
; be found in the AUTHORS file in the root of the source tree.
|
|
;
|
|
|
|
|
|
%include "vpx_ports/x86_abi_support.asm"
|
|
|
|
|
|
; ---------------------------------------------------------------------------
; Assemble-time constants.
;
; VP9_FILTER_SHIFT is the arithmetic right-shift applied (psraw) to the
; accumulated tap sum in both filter routines of this file.  Note that
; 1 << VP9_FILTER_SHIFT == vp9_filter_weight (128).
;
; BLOCK_HEIGHT_WIDTH and vp9_filter_weight are not referenced in the visible
; portion of this file; they are kept as-is for any other users.
; ---------------------------------------------------------------------------
%define BLOCK_HEIGHT_WIDTH  4
%define vp9_filter_weight   128
%define VP9_FILTER_SHIFT    7
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; void vp9_filter_block1d_h6_mmx
; (
;     unsigned char  *src_ptr,              arg(0)
;     unsigned short *output_ptr,           arg(1)
;     unsigned int    src_pixels_per_line,  arg(2)
;     unsigned int    pixel_step,           arg(3)  (never read here)
;     unsigned int    output_height,        arg(4)
;     unsigned int    output_width,         arg(5)
;     short          *vp9_filter            arg(6)
; )
;
; Horizontal six-tap filter, four output samples per row.
;
; Per row the routine reads 8 bytes at src_ptr-2 and 4 bytes at src_ptr+3,
; multiplies four neighbouring pixels by each of the six taps, accumulates
; with signed saturation, adds the rounding constant `rd`, shifts right by
; VP9_FILTER_SHIFT, clamps to 0..255 and stores the four results as 16-bit
; words (8 bytes) at output_ptr.
;
; Row stepping: src_ptr advances by src_pixels_per_line and output_ptr
; advances by output_width; both are added to the pointers as raw byte
; counts.  The loop runs output_height times (the count is tested after the
; first row, so the body always executes at least once).
;
; vp9_filter layout as used here: tap k is read with an 8-byte load at byte
; offset 16*k, i.e. four identical-purpose 16-bit coefficients per tap.
;
; Register roles inside the loop:
;   rsi = source row        rdi = destination row     rcx = rows remaining
;   rax = destination step  rdx = filter base
;   mm0 = zero              mm1/mm2/mm6/mm7 = taps 1/2/3/4
;   mm3 = accumulator       mm4, mm5 = scratch / raw pixel copy
;
; The prologue/epilogue macros (SHADOW_ARGS_TO_STACK, GET_GOT, RESTORE_GOT,
; UNSHADOW_ARGS) and arg()/sym()/GLOBAL() come from the included
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d_h6_mmx) PRIVATE
sym(vp9_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; ---- end of prologue ----

    ; Pointers, row count and destination step.
    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(1)                     ; output_ptr
    movsxd      rcx, dword ptr arg(4)           ; output_height
    movsxd      rax, dword ptr arg(5)           ; output_width (dest step)

    ; Inner four taps live in registers; taps 0 and 5 are read from memory
    ; inside the loop because every MMX register is already in use.
    mov         rdx, arg(6)                     ; vp9_filter
    movq        mm1, [rdx + 16]                 ; tap 1
    movq        mm2, [rdx + 32]                 ; tap 2
    movq        mm6, [rdx + 48]                 ; tap 3
    movq        mm7, [rdx + 64]                 ; tap 4

    pxor        mm0, mm0                        ; mm0 = 0, for byte->word unpack

.h6_row_loop:
    movq        mm3, [rsi - 2]                  ; raw bytes p[-2]..p[5]
    movq        mm4, mm3                        ; raw copy (high half -> tap 4)
    movq        mm5, mm3                        ; raw copy (kept for taps 2/3/0)

    ; NOTE: paddsw saturates, so the order in which the partial products are
    ; accumulated below is significant and must not be rearranged.

    ; tap 1 * p[-1]..p[2]
    psrlq       mm3, 8                          ; drop p[-2]
    punpcklbw   mm3, mm0                        ; low 4 bytes -> words
    pmullw      mm3, mm1

    ; + tap 4 * p[2]..p[5]
    punpckhbw   mm4, mm0                        ; high 4 bytes -> words
    pmullw      mm4, mm7
    paddsw      mm3, mm4

    ; + tap 2 * p[0]..p[3]
    movq        mm4, mm5                        ; park the raw copy in mm4
    psrlq       mm5, 16                         ; drop p[-2], p[-1]
    punpcklbw   mm5, mm0
    pmullw      mm5, mm2
    paddsw      mm3, mm5

    ; + tap 3 * p[1]..p[4]
    movq        mm5, mm4                        ; raw copy back into mm5
    psrlq       mm4, 24                         ; drop p[-2]..p[0]
    punpcklbw   mm4, mm0
    pmullw      mm4, mm6
    paddsw      mm3, mm4

    ; + tap 5 * p[3]..p[6]
    movd        mm4, [rsi + 3]                  ; 4 bytes p[3]..p[6]
    punpcklbw   mm4, mm0
    pmullw      mm4, [rdx + 80]
    paddsw      mm3, mm4

    ; + tap 0 * p[-2]..p[1]
    punpcklbw   mm5, mm0                        ; low 4 bytes of the raw copy
    pmullw      mm5, [rdx]
    paddsw      mm3, mm5

    ; Round, scale, clamp to 0..255, then widen back to 16-bit words.
    paddsw      mm3, [GLOBAL(rd)]               ; + rounding constant
    psraw       mm3, VP9_FILTER_SHIFT
    packuswb    mm3, mm0                        ; unsigned-saturate to bytes
    punpcklbw   mm3, mm0                        ; bytes -> words again

    movq        [rdi], mm3                      ; store 4 x 16-bit results

    ; Advance to the next destination and source row.
    add         rdi, rax
%if ABI_IS_32BIT
    add         rsi, dword ptr arg(2)           ; src_pixels_per_line
%else
    movsxd      r8, dword ptr arg(2)            ; src_pixels_per_line
    add         rsi, r8
%endif

    dec         rcx                             ; one row done
    jnz         .h6_row_loop

    ; ---- epilogue ----
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; void vp9_filter_block1dc_v6_mmx
; (
;     short         *src_ptr,          arg(0)
;     unsigned char *output_ptr,       arg(1)
;     int            output_pitch,     arg(2)
;     unsigned int   pixels_per_line,  arg(3)
;     unsigned int   pixel_step,       arg(4)  (never read here)
;     unsigned int   output_height,    arg(5)
;     unsigned int   output_width,     arg(6)  (never read here)
;     short         *vp9_filter        arg(7)
; )
;
; Vertical six-tap filter over 16-bit input samples, four output bytes per
; row.
;
; For each output row, six 8-byte loads (four 16-bit samples each) are taken
; from source rows -2..+3 relative to the current row, multiplied by taps
; 0..5, accumulated with signed saturation, rounded with `rd`, shifted right
; by VP9_FILTER_SHIFT, clamped to 0..255 and stored as 4 bytes at output_ptr.
;
; pixels_per_line is added to the source pointer as a raw byte count;
; output_pitch is added to output_ptr after every row.  The loop runs
; output_height times (tested after the first row).
;
; vp9_filter layout as used here: tap k is an 8-byte load at byte offset
; 16*k.
;
; Register roles inside the loop:
;   rsi = source row -2     rdx = source row step     rdi = destination row
;   rax = destination step  rcx = rows remaining      rbx = filter base
;   mm0 = zero              mm1/mm2/mm6/mm7 = taps 1/2/3/4
;   mm5 = rounding constant mm3 = accumulator         mm4 = scratch
;
; The prologue/epilogue macros and arg()/sym()/GLOBAL() come from the
; included vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
sym(vp9_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; ---- end of prologue ----

    ; Fetch the rounding constant BEFORE rbx is repurposed below: rbx is the
    ; register handed to GET_GOT, so GLOBAL() presumably depends on it in
    ; position-independent builds (confirm against x86_abi_support.asm).
    movq        mm5, [GLOBAL(rd)]

    push        rbx                             ; rbx becomes the filter base
    mov         rbx, arg(7)                     ; vp9_filter
    movq        mm1, [rbx + 16]                 ; tap 1
    movq        mm2, [rbx + 32]                 ; tap 2
    movq        mm6, [rbx + 48]                 ; tap 3
    movq        mm7, [rbx + 64]                 ; tap 4

    pxor        mm0, mm0                        ; mm0 = 0, pack partner

    ; Source pointer is moved back two rows so that [rsi] is row -2.
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, dword ptr arg(3)           ; pixels_per_line (row step)
    sub         rsi, rdx
    sub         rsi, rdx

    mov         rdi, arg(1)                     ; output_ptr
    movsxd      rax, DWORD PTR arg(2)           ; output_pitch
    movsxd      rcx, DWORD PTR arg(5)           ; output_height

.v6_row_loop:
    ; NOTE: paddsw saturates, so the accumulation order below is significant
    ; and must not be rearranged.

    ; tap 1 * row -1
    movq        mm3, [rsi + rdx]
    pmullw      mm3, mm1

    ; + tap 4 * row +2
    movq        mm4, [rsi + 4*rdx]
    pmullw      mm4, mm7
    paddsw      mm3, mm4

    ; + tap 2 * row 0
    movq        mm4, [rsi + 2*rdx]
    pmullw      mm4, mm2
    paddsw      mm3, mm4

    ; + tap 0 * row -2
    movq        mm4, [rsi]
    pmullw      mm4, [rbx]
    paddsw      mm3, mm4

    ; Step the source down one row.  This is both the per-iteration advance
    ; and the trick that lets rows +1 and +3 be addressed with scale factors
    ; 2 and 4 instead of the unencodable 3 and 5.
    add         rsi, rdx

    ; + tap 3 * row +1
    movq        mm4, [rsi + 2*rdx]
    pmullw      mm4, mm6
    paddsw      mm3, mm4

    ; + tap 5 * row +3
    movq        mm4, [rsi + 4*rdx]
    pmullw      mm4, [rbx + 80]
    paddsw      mm3, mm4

    ; Round, scale and clamp to unsigned bytes.
    paddsw      mm3, mm5                        ; + rounding constant
    psraw       mm3, VP9_FILTER_SHIFT
    packuswb    mm3, mm0                        ; unsigned-saturate to bytes

    movd        [rdi], mm3                      ; store 4 output bytes

    ; Each iteration re-reads five of the six rows fetched by the previous
    ; one; the original author accepted that cost on the assumption that the
    ; data is already cached.
    add         rdi, rax                        ; next destination row
    dec         rcx                             ; one row done
    jnz         .v6_row_loop

    pop         rbx                             ; undo the filter-base push

    ; ---- epilogue ----
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
|
|
|
|
SECTION_RODATA

; Rounding constant added to the tap sum before the VP9_FILTER_SHIFT shift in
; both filter routines: four 16-bit words of 0x40 (64), i.e. half of 128.
align 16
rd:
    dw 0x40, 0x40, 0x40, 0x40
|
|
|
|
; ---------------------------------------------------------------------------
; vp9_six_tap_mmx: eight six-tap coefficient sets.
;
; Every tap is replicated into eight consecutive 16-bit words (16 bytes), so
; one coefficient set occupies 6 * 16 = 96 bytes and tap k of a set sits at
; byte offset 16*k -- the layout the filter routines in this file index with
; [filter + 16*k].  The six taps of each set sum to 128.
; ---------------------------------------------------------------------------

; Emit one coefficient set: each of the six arguments becomes 8 words.
%macro SIX_TAP_MMX_ROW 6
    times 8 dw %1
    times 8 dw %2
    times 8 dw %3
    times 8 dw %4
    times 8 dw %5
    times 8 dw %6
%endmacro

align 16
global HIDDEN_DATA(sym(vp9_six_tap_mmx))
sym(vp9_six_tap_mmx):
    ;               tap0  tap1  tap2  tap3  tap4  tap5
    SIX_TAP_MMX_ROW    0,    0,  128,    0,    0,    0      ; set 0 (pass-through)
    SIX_TAP_MMX_ROW    0,   -6,  123,   12,   -1,    0      ; set 1
    SIX_TAP_MMX_ROW    2,  -11,  108,   36,   -8,    1      ; set 2
    SIX_TAP_MMX_ROW    0,   -9,   93,   50,   -6,    0      ; set 3
    SIX_TAP_MMX_ROW    3,  -16,   77,   77,  -16,    3      ; set 4 (symmetric)
    SIX_TAP_MMX_ROW    0,   -6,   50,   93,   -9,    0      ; set 5 (mirror of 3)
    SIX_TAP_MMX_ROW    1,   -8,   36,  108,  -11,    2      ; set 6 (mirror of 2)
    SIX_TAP_MMX_ROW    0,   -1,   12,  123,   -6,    0      ; set 7 (mirror of 1)
|
|
|