x86 sse2 temporal_filter_apply
count can be reduced to short because the max number of filtered frames is set to 15. the max value for any frame is 32 (modifier = 16, filter_weight = 2). 15*32 = 480 which requires 9 bits this function goes from about 7000 us / 1000 iterations for the C code to < 275 us / 1000 iterations for sse2 for block_size = 16 and from about 1800 us / 1000 iters to < 100 us / 1000 iters for block_size = 8 Change-Id: I64a32607f58a2d33c39286f468b04ccd457d9e6e
This commit is contained in:
207
vp8/encoder/x86/temporal_filter_apply_sse2.asm
Normal file
207
vp8/encoder/x86/temporal_filter_apply_sse2.asm
Normal file
@@ -0,0 +1,207 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
; void vp8_temporal_filter_apply_sse2 | arg
|
||||
; (unsigned char *frame1, | 0
|
||||
; unsigned int stride, | 1
|
||||
; unsigned char *frame2, | 2
|
||||
; unsigned int block_size, | 3
|
||||
; int strength, | 4
|
||||
; int filter_weight, | 5
|
||||
; unsigned int *accumulator, | 6
|
||||
; unsigned short *count) | 7
|
||||
global sym(vp8_temporal_filter_apply_sse2)
|
||||
sym(vp8_temporal_filter_apply_sse2):
|
||||
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 8
|
||||
SAVE_XMM
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
ALIGN_STACK 16, rax
|
||||
%define block_size 0
|
||||
%define strength 16
|
||||
%define filter_weight 32
|
||||
%define rounding_bit 48
|
||||
%define rbp_backup 64
|
||||
%define stack_size 80
|
||||
sub rsp, stack_size
|
||||
mov [rsp + rbp_backup], rbp
|
||||
; end prolog
|
||||
|
||||
mov rdx, arg(3)
|
||||
mov [rsp + block_size], rdx
|
||||
movd xmm6, arg(4)
|
||||
movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
|
||||
|
||||
; calculate the rounding bit outside the loop
|
||||
; 0x8000 >> (16 - strength)
|
||||
mov rdx, 16
|
||||
sub rdx, arg(4) ; 16 - strength
|
||||
movd xmm4, rdx ; can't use rdx w/ shift
|
||||
movdqa xmm5, [GLOBAL(_const_top_bit)]
|
||||
psrlw xmm5, xmm4
|
||||
movdqa [rsp + rounding_bit], xmm5
|
||||
|
||||
mov rsi, arg(0) ; src/frame1
|
||||
mov rdx, arg(2) ; predictor frame
|
||||
mov rdi, arg(6) ; accumulator
|
||||
mov rax, arg(7) ; count
|
||||
|
||||
; dup the filter weight and store for later
|
||||
movd xmm0, arg(5) ; filter_weight
|
||||
pshuflw xmm0, xmm0, 0
|
||||
punpcklwd xmm0, xmm0
|
||||
movdqa [rsp + filter_weight], xmm0
|
||||
|
||||
mov rbp, arg(1) ; stride
|
||||
pxor xmm7, xmm7 ; zero for extraction
|
||||
|
||||
lea rcx, [rdx + 16*16*1]
|
||||
cmp dword ptr [rsp + block_size], 8
|
||||
jne temporal_filter_apply_load_16
|
||||
lea rcx, [rdx + 8*8*1]
|
||||
|
||||
temporal_filter_apply_load_8:
|
||||
movq xmm0, [rsi] ; first row
|
||||
lea rsi, [rsi + rbp] ; += stride
|
||||
punpcklbw xmm0, xmm7 ; src[ 0- 7]
|
||||
movq xmm1, [rsi] ; second row
|
||||
lea rsi, [rsi + rbp] ; += stride
|
||||
punpcklbw xmm1, xmm7 ; src[ 8-15]
|
||||
jmp temporal_filter_apply_load_finished
|
||||
|
||||
temporal_filter_apply_load_16:
|
||||
movdqa xmm0, [rsi] ; src (frame1)
|
||||
lea rsi, [rsi + rbp] ; += stride
|
||||
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm7 ; src[ 0- 7]
|
||||
punpckhbw xmm1, xmm7 ; src[ 8-15]
|
||||
|
||||
temporal_filter_apply_load_finished:
|
||||
movdqa xmm2, [rdx] ; predictor (frame2)
|
||||
movdqa xmm3, xmm2
|
||||
punpcklbw xmm2, xmm7 ; pred[ 0- 7]
|
||||
punpckhbw xmm3, xmm7 ; pred[ 8-15]
|
||||
|
||||
; modifier = src_byte - pixel_value
|
||||
psubw xmm0, xmm2 ; src - pred[ 0- 7]
|
||||
psubw xmm1, xmm3 ; src - pred[ 8-15]
|
||||
|
||||
; modifier *= modifier
|
||||
pmullw xmm0, xmm0 ; modifer[ 0- 7]^2
|
||||
pmullw xmm1, xmm1 ; modifer[ 8-15]^2
|
||||
|
||||
; modifier *= 3
|
||||
pmullw xmm0, [GLOBAL(_const_3w)]
|
||||
pmullw xmm1, [GLOBAL(_const_3w)]
|
||||
|
||||
; modifer += 0x8000 >> (16 - strength)
|
||||
paddw xmm0, [rsp + rounding_bit]
|
||||
paddw xmm1, [rsp + rounding_bit]
|
||||
|
||||
; modifier >>= strength
|
||||
psrlw xmm0, [rsp + strength]
|
||||
psrlw xmm1, [rsp + strength]
|
||||
|
||||
; modifier = 16 - modifier
|
||||
; saturation takes care of modifier > 16
|
||||
movdqa xmm3, [GLOBAL(_const_16w)]
|
||||
movdqa xmm2, [GLOBAL(_const_16w)]
|
||||
psubusw xmm3, xmm1
|
||||
psubusw xmm2, xmm0
|
||||
|
||||
; modifier *= filter_weight
|
||||
pmullw xmm2, [rsp + filter_weight]
|
||||
pmullw xmm3, [rsp + filter_weight]
|
||||
|
||||
; count
|
||||
movdqa xmm4, [rax]
|
||||
movdqa xmm5, [rax+16]
|
||||
; += modifier
|
||||
paddw xmm4, xmm2
|
||||
paddw xmm5, xmm3
|
||||
; write back
|
||||
movdqa [rax], xmm4
|
||||
movdqa [rax+16], xmm5
|
||||
lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
|
||||
|
||||
; load and extract the predictor up to shorts
|
||||
pxor xmm7, xmm7
|
||||
movdqa xmm0, [rdx]
|
||||
lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
|
||||
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm7 ; pred[ 0- 7]
|
||||
punpckhbw xmm1, xmm7 ; pred[ 8-15]
|
||||
|
||||
; modifier *= pixel_value
|
||||
pmullw xmm0, xmm2
|
||||
pmullw xmm1, xmm3
|
||||
|
||||
; expand to double words
|
||||
movdqa xmm2, xmm0
|
||||
punpcklwd xmm0, xmm7 ; [ 0- 3]
|
||||
punpckhwd xmm2, xmm7 ; [ 4- 7]
|
||||
movdqa xmm3, xmm1
|
||||
punpcklwd xmm1, xmm7 ; [ 8-11]
|
||||
punpckhwd xmm3, xmm7 ; [12-15]
|
||||
|
||||
; accumulator
|
||||
movdqa xmm4, [rdi]
|
||||
movdqa xmm5, [rdi+16]
|
||||
movdqa xmm6, [rdi+32]
|
||||
movdqa xmm7, [rdi+48]
|
||||
; += modifier
|
||||
paddw xmm4, xmm0
|
||||
paddw xmm5, xmm2
|
||||
paddw xmm6, xmm1
|
||||
paddw xmm7, xmm3
|
||||
; write back
|
||||
movdqa [rdi], xmm4
|
||||
movdqa [rdi+16], xmm5
|
||||
movdqa [rdi+32], xmm6
|
||||
movdqa [rdi+48], xmm7
|
||||
lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
|
||||
|
||||
cmp rdx, rcx
|
||||
je temporal_filter_apply_epilog
|
||||
pxor xmm7, xmm7 ; zero for extraction
|
||||
cmp dword ptr [rsp + block_size], 16
|
||||
je temporal_filter_apply_load_16
|
||||
jmp temporal_filter_apply_load_8
|
||||
|
||||
temporal_filter_apply_epilog:
|
||||
; begin epilog
|
||||
mov rbp, [rsp + rbp_backup]
|
||||
add rsp, stack_size
|
||||
pop rsp
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
_const_3w:
|
||||
times 8 dw 3
|
||||
align 16
|
||||
_const_top_bit:
|
||||
times 8 dw 1<<15
|
||||
align 16
|
||||
_const_16w
|
||||
times 8 dw 16
|
||||
27
vp8/encoder/x86/temporal_filter_x86.h
Normal file
27
vp8/encoder/x86/temporal_filter_x86.h
Normal file
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H
|
||||
#define __INC_VP8_TEMPORAL_FILTER_X86_H
|
||||
|
||||
#if HAVE_SSE2
|
||||
extern prototype_apply(vp8_temporal_filter_apply_sse2);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
|
||||
#undef vp8_temporal_filter_apply
|
||||
#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif // __INC_VP8_TEMPORAL_FILTER_X86_H
|
||||
@@ -309,6 +309,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||
|
||||
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
|
||||
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
|
||||
|
||||
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
Reference in New Issue
Block a user