diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 2fffaa95f..396e3390d 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -36,36 +36,9 @@
 #define ALT_REF_MC_ENABLED 1        // dis/enable MC in AltRef filtering
 #define ALT_REF_SUBPEL_ENABLED 1    // dis/enable subpel in MC AltRef filtering
-#define USE_FILTER_LUT 0            // use lookup table to improve filter
 
 #if VP8_TEMPORAL_ALT_REF
 
-#if USE_FILTER_LUT
-// for (strength = 0; strength <= 6; strength++) {
-//     for (delta = 0; delta <= 18; delta++) {
-//         float coeff = (3.0 * delta * delta) / pow(2, strength);
-//         printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));
-//     }
-//     printf("\n");
-// }
-static int modifier_lut[7][19] =
-{
-    // Strength=0
-    {16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=1
-    {16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=2
-    {16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=3
-    {16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=4
-    {16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    // Strength=5
-    {16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
-    // Strength=6
-    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
-};
-#endif
 
 static void vp8_temporal_filter_predictors_mb_c
 (
     MACROBLOCKD *x,
@@ -86,14 +59,11 @@ static void vp8_temporal_filter_predictors_mb_c
 
     if ((mv_row | mv_col) & 7)
     {
-//        vp8_sixtap_predict16x16_c(yptr, stride,
-//                                  mv_col & 7, mv_row & 7, &pred[0], 16);
         x->subpixel_predict16x16(yptr, stride,
                                  mv_col & 7, mv_row & 7, &pred[0], 16);
     }
     else
     {
-        //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
         RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
     }
 
@@ -127,17 +97,13 @@ void vp8_temporal_filter_apply_c
     int strength,
     int filter_weight,
     unsigned int *accumulator,
-    unsigned int *count
+    unsigned short *count
 )
 {
     int i, j, k;
     int modifier;
     int byte = 0;
 
-#if USE_FILTER_LUT
-    int *lut = modifier_lut[strength];
-#endif
-
     for (i = 0,k = 0; i < block_size; i++)
     {
         for (j = 0; j < block_size; j++, k++)
@@ -146,11 +112,10 @@ void vp8_temporal_filter_apply_c
             int src_byte = frame1[byte];
             int pixel_value = *frame2++;
 
-#if USE_FILTER_LUT
-            modifier = abs(src_byte-pixel_value);
-            modifier = modifier>18 ? 0 : lut[modifier];
-#else
             modifier   = src_byte - pixel_value;
+            // This is an integer approximation of:
+            // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+            // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
             modifier  *= modifier;
             modifier  *= 3;
             modifier  += 1 << (strength - 1);
@@ -160,7 +125,6 @@ void vp8_temporal_filter_apply_c
                 modifier = 16;
 
             modifier = 16 - modifier;
-#endif
             modifier *= filter_weight;
 
             count[k] += modifier;
@@ -331,12 +295,12 @@ static void vp8_temporal_filter_iterate_c
     int MBs = cpi->common.MBs;
     int mb_y_offset = 0;
     int mb_uv_offset = 0;
-    unsigned int accumulator[384];
-    unsigned int count[384];
+    DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
     MACROBLOCKD *mbd = &cpi->mb.e_mbd;
     YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
     unsigned char *dst1, *dst2;
-    DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16*16 + 8*8 + 8*8);
 
     // Save input state
     unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -366,7 +330,7 @@ static void vp8_temporal_filter_iterate_c
         int stride;
 
         vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
-        vpx_memset(count, 0, 384*sizeof(unsigned int));
+        vpx_memset(count, 0, 384*sizeof(unsigned short));
 
 #if ALT_REF_MC_ENABLED
         // Reduced search extent by 3 for 6-tap filter & smaller UMV border
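
The integer expression kept in vp8_temporal_filter_apply_c can be checked against the float formula quoted in the new comment with a small standalone program (a sketch, not part of the patch). The two agree everywhere except on exact .5 ties, for example strength=1, delta=3, where the integer path comes out one lower because the float reference rounds after the 16 - coeff subtraction while the integer code rounds the coefficient first:

#include <math.h>
#include <stdio.h>

int main(void)
{
    int strength, delta;

    /* strength 0 is skipped: 1 << (strength - 1) would shift by -1 */
    for (strength = 1; strength <= 6; strength++)
        for (delta = 0; delta <= 255; delta++)   /* symmetric in delta */
        {
            /* float reference from the comment */
            float coeff = (3.0f * delta * delta) / (float)(1 << strength);
            int ref = (int)roundf(coeff > 16 ? 0 : 16 - coeff);

            /* integer version from vp8_temporal_filter_apply_c */
            int mod = delta * delta * 3;
            mod += 1 << (strength - 1);          /* round to nearest */
            mod >>= strength;
            if (mod > 16)
                mod = 16;
            mod = 16 - mod;

            if (mod != ref)
                printf("strength=%d delta=%3d int=%2d float=%2d\n",
                       strength, delta, mod, ref);
        }
    return 0;
}
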
diff --git a/vp8/encoder/temporal_filter.h b/vp8/encoder/temporal_filter.h
index 7b8c21c04..740037a85 100644
--- a/vp8/encoder/temporal_filter.h
+++ b/vp8/encoder/temporal_filter.h
@@ -22,9 +22,13 @@
      int strength, \
      int filter_weight, \
      unsigned int *accumulator, \
-     unsigned int *count \
+     unsigned short *count \
     )
 
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
 #ifndef vp8_temporal_filter_apply
 #define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
 #endif
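
Narrowing count to unsigned short is what lets the SSE2 code below update eight count entries per paddw instead of four per paddd, and it is safe by a wide margin: after the 16 - modifier step the per-pixel weight is at most 16, and filter_weight is at most 2 in this encoder, so each filtered frame adds at most 32 to a count entry. A back-of-envelope sketch (the 100-frame figure is a deliberately generous assumption, not a limit taken from the source):

#include <limits.h>
#include <stdio.h>

int main(void)
{
    const int max_modifier = 16;   /* after the "16 - modifier" clamp */
    const int max_weight   = 2;    /* largest per-frame filter_weight */
    const int max_frames   = 100;  /* far beyond any real alt-ref group */

    long max_count = (long)max_modifier * max_weight * max_frames;
    long max_accum = 255L * max_modifier * max_weight * max_frames;

    printf("count worst case %ld vs USHRT_MAX %d\n", max_count, USHRT_MAX);
    printf("accumulator worst case %ld, still needs 32 bits\n", max_accum);
    return 0;
}
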
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 000000000..0127b012e
--- /dev/null
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_size,       |  3
+;   int             strength,         |  4
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6
+;   unsigned short *count)            |  7
+global sym(vp8_temporal_filter_apply_sse2)
+sym(vp8_temporal_filter_apply_sse2):
+
+    push        rbp
+    mov         rbp,        rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define     block_size      0
+    %define     strength        16
+    %define     filter_weight   32
+    %define     rounding_bit    48
+    %define     rbp_backup      64
+    %define     stack_size      80
+    sub         rsp,        stack_size
+    mov         [rsp + rbp_backup], rbp
+    ; end prolog
+
+    mov         rdx,        arg(3)
+    mov         [rsp + block_size], rdx
+    movd        xmm6,       arg(4)
+    movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+    ; calculate the rounding bit outside the loop
+    ; 0x8000 >> (16 - strength)
+    mov         rdx,        16
+    sub         rdx,        arg(4)      ; 16 - strength
+    movd        xmm4,       rdx         ; can't use rdx w/ shift
+    movdqa      xmm5,       [GLOBAL(_const_top_bit)]
+    psrlw       xmm5,       xmm4
+    movdqa      [rsp + rounding_bit], xmm5
+
+    mov         rsi,        arg(0)      ; src/frame1
+    mov         rdx,        arg(2)      ; predictor frame
+    mov         rdi,        arg(6)      ; accumulator
+    mov         rax,        arg(7)      ; count
+
+    ; dup the filter weight and store for later
+    movd        xmm0,       arg(5)      ; filter_weight
+    pshuflw     xmm0,       xmm0, 0
+    punpcklwd   xmm0,       xmm0
+    movdqa      [rsp + filter_weight], xmm0
+
+    mov         rbp,        arg(1)      ; stride
+    pxor        xmm7,       xmm7        ; zero for extraction
+
+    lea         rcx,        [rdx + 16*16*1]
+    cmp dword ptr [rsp + block_size], 8
+    jne         temporal_filter_apply_load_16
+    lea         rcx,        [rdx + 8*8*1]
+
+temporal_filter_apply_load_8:
+    movq        xmm0,       [rsi]       ; first row
+    lea         rsi,        [rsi + rbp] ; += stride
+    punpcklbw   xmm0,       xmm7        ; src[ 0- 7]
+    movq        xmm1,       [rsi]       ; second row
+    lea         rsi,        [rsi + rbp] ; += stride
+    punpcklbw   xmm1,       xmm7        ; src[ 8-15]
+    jmp         temporal_filter_apply_load_finished
+
+temporal_filter_apply_load_16:
+    movdqa      xmm0,       [rsi]       ; src (frame1)
+    lea         rsi,        [rsi + rbp] ; += stride
+    movdqa      xmm1,       xmm0
+    punpcklbw   xmm0,       xmm7        ; src[ 0- 7]
+    punpckhbw   xmm1,       xmm7        ; src[ 8-15]
+
+temporal_filter_apply_load_finished:
+    movdqa      xmm2,       [rdx]       ; predictor (frame2)
+    movdqa      xmm3,       xmm2
+    punpcklbw   xmm2,       xmm7        ; pred[ 0- 7]
+    punpckhbw   xmm3,       xmm7        ; pred[ 8-15]
+
+    ; modifier = src_byte - pixel_value
+    psubw       xmm0,       xmm2        ; src - pred[ 0- 7]
+    psubw       xmm1,       xmm3        ; src - pred[ 8-15]
+
+    ; modifier *= modifier
+    pmullw      xmm0,       xmm0        ; modifier[ 0- 7]^2
+    pmullw      xmm1,       xmm1        ; modifier[ 8-15]^2
+
+    ; modifier *= 3
+    pmullw      xmm0,       [GLOBAL(_const_3w)]
+    pmullw      xmm1,       [GLOBAL(_const_3w)]
+
+    ; modifier += 0x8000 >> (16 - strength)
+    paddw       xmm0,       [rsp + rounding_bit]
+    paddw       xmm1,       [rsp + rounding_bit]
+
+    ; modifier >>= strength
+    psrlw       xmm0,       [rsp + strength]
+    psrlw       xmm1,       [rsp + strength]
+
+    ; modifier = 16 - modifier
+    ; saturation takes care of modifier > 16
+    movdqa      xmm3,       [GLOBAL(_const_16w)]
+    movdqa      xmm2,       [GLOBAL(_const_16w)]
+    psubusw     xmm3,       xmm1
+    psubusw     xmm2,       xmm0
+
+    ; modifier *= filter_weight
+    pmullw      xmm2,       [rsp + filter_weight]
+    pmullw      xmm3,       [rsp + filter_weight]
+
+    ; count
+    movdqa      xmm4,       [rax]
+    movdqa      xmm5,       [rax+16]
+    ; += modifier
+    paddw       xmm4,       xmm2
+    paddw       xmm5,       xmm3
+    ; write back
+    movdqa      [rax],      xmm4
+    movdqa      [rax+16],   xmm5
+    lea         rax,        [rax + 16*2] ; count += 16*(sizeof(short))
+
+    ; load and extract the predictor up to shorts
+    pxor        xmm7,       xmm7
+    movdqa      xmm0,       [rdx]
+    lea         rdx,        [rdx + 16*1] ; pred += 16*(sizeof(char))
+    movdqa      xmm1,       xmm0
+    punpcklbw   xmm0,       xmm7        ; pred[ 0- 7]
+    punpckhbw   xmm1,       xmm7        ; pred[ 8-15]
+
+    ; modifier *= pixel_value
+    pmullw      xmm0,       xmm2
+    pmullw      xmm1,       xmm3
+
+    ; expand to double words
+    movdqa      xmm2,       xmm0
+    punpcklwd   xmm0,       xmm7        ; [ 0- 3]
+    punpckhwd   xmm2,       xmm7        ; [ 4- 7]
+    movdqa      xmm3,       xmm1
+    punpcklwd   xmm1,       xmm7        ; [ 8-11]
+    punpckhwd   xmm3,       xmm7        ; [12-15]
+
+    ; accumulator
+    movdqa      xmm4,       [rdi]
+    movdqa      xmm5,       [rdi+16]
+    movdqa      xmm6,       [rdi+32]
+    movdqa      xmm7,       [rdi+48]
+    ; += modifier (double-word lanes)
+    paddd       xmm4,       xmm0
+    paddd       xmm5,       xmm2
+    paddd       xmm6,       xmm1
+    paddd       xmm7,       xmm3
+    ; write back
+    movdqa      [rdi],      xmm4
+    movdqa      [rdi+16],   xmm5
+    movdqa      [rdi+32],   xmm6
+    movdqa      [rdi+48],   xmm7
+    lea         rdi,        [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+    cmp         rdx,        rcx
+    je          temporal_filter_apply_epilog
+    pxor        xmm7,       xmm7        ; zero for extraction
+    cmp dword ptr [rsp + block_size], 16
+    je          temporal_filter_apply_load_16
+    jmp         temporal_filter_apply_load_8
+
+temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,        [rsp + rbp_backup]
+    add         rsp,        stack_size
+    pop         rsp
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+    times 8 dw 3
+align 16
+_const_top_bit:
+    times 8 dw 1<<15
+align 16
+_const_16w:
+    times 8 dw 16
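
For readers who prefer intrinsics to raw assembly, here is a rough C sketch of one 16-pixel iteration of the kernel above (a hypothetical helper, names invented; the real code also handles the 8x8 two-rows-per-load path and pointer advancement). It shows the two tricks the assembly leans on: the whole calculation stays in 16-bit lanes, with psubusw saturation standing in for the modifier > 16 clamp and a precomputed rounding bit giving round-to-nearest on the shift. One caveat: 3*(src-pred)^2 wraps in 16-bit lanes once |src - pred| >= 148, where the scalar C version clamps, so the vector and scalar paths can disagree for very large pixel differences.

#include <emmintrin.h>

static void filter_row_16_sse2(const unsigned char *src,
                               const unsigned char *pred,
                               int strength,        /* 1..6 */
                               int filter_weight,   /* 0..2 */
                               unsigned int *accumulator,
                               unsigned short *count)
{
    const __m128i zero    = _mm_setzero_si128();
    const __m128i three   = _mm_set1_epi16(3);
    const __m128i sixteen = _mm_set1_epi16(16);
    /* rounding bit, computed once as in the asm: 0x8000 >> (16 - strength) */
    const __m128i round   = _mm_set1_epi16((short)(0x8000 >> (16 - strength)));
    const __m128i weight  = _mm_set1_epi16((short)filter_weight);

    /* widen 16 bytes of src and pred to 8+8 words */
    __m128i s = _mm_loadu_si128((const __m128i *)src);
    __m128i p = _mm_loadu_si128((const __m128i *)pred);
    __m128i s_lo = _mm_unpacklo_epi8(s, zero), s_hi = _mm_unpackhi_epi8(s, zero);
    __m128i p_lo = _mm_unpacklo_epi8(p, zero), p_hi = _mm_unpackhi_epi8(p, zero);

    /* modifier = ((src - pred)^2 * 3 + rounding) >> strength */
    __m128i m_lo = _mm_sub_epi16(s_lo, p_lo);
    __m128i m_hi = _mm_sub_epi16(s_hi, p_hi);
    m_lo = _mm_mullo_epi16(m_lo, m_lo);       /* NB: 3*d^2 wraps in 16-bit */
    m_hi = _mm_mullo_epi16(m_hi, m_hi);       /* lanes once |d| >= 148     */
    m_lo = _mm_mullo_epi16(m_lo, three);
    m_hi = _mm_mullo_epi16(m_hi, three);
    m_lo = _mm_srli_epi16(_mm_add_epi16(m_lo, round), strength);
    m_hi = _mm_srli_epi16(_mm_add_epi16(m_hi, round), strength);

    /* modifier = 16 - modifier; unsigned saturation handles modifier > 16 */
    m_lo = _mm_subs_epu16(sixteen, m_lo);
    m_hi = _mm_subs_epu16(sixteen, m_hi);

    /* modifier *= filter_weight (result <= 32, still fits in a word) */
    m_lo = _mm_mullo_epi16(m_lo, weight);
    m_hi = _mm_mullo_epi16(m_hi, weight);

    /* count[k] += modifier, in 16-bit lanes */
    __m128i c_lo = _mm_loadu_si128((const __m128i *)count);
    __m128i c_hi = _mm_loadu_si128((const __m128i *)(count + 8));
    _mm_storeu_si128((__m128i *)count,       _mm_add_epi16(c_lo, m_lo));
    _mm_storeu_si128((__m128i *)(count + 8), _mm_add_epi16(c_hi, m_hi));

    /* accumulator[k] += modifier * pixel_value; the product is at most
       32 * 255 = 8160, so pmullw is exact, then widen to 32-bit lanes */
    __m128i a_lo = _mm_mullo_epi16(m_lo, p_lo);
    __m128i a_hi = _mm_mullo_epi16(m_hi, p_hi);
    __m128i a0 = _mm_unpacklo_epi16(a_lo, zero);  /* pixels  0- 3 */
    __m128i a1 = _mm_unpackhi_epi16(a_lo, zero);  /* pixels  4- 7 */
    __m128i a2 = _mm_unpacklo_epi16(a_hi, zero);  /* pixels  8-11 */
    __m128i a3 = _mm_unpackhi_epi16(a_hi, zero);  /* pixels 12-15 */
    __m128i *acc = (__m128i *)accumulator;
    _mm_storeu_si128(acc + 0, _mm_add_epi32(_mm_loadu_si128(acc + 0), a0));
    _mm_storeu_si128(acc + 1, _mm_add_epi32(_mm_loadu_si128(acc + 1), a1));
    _mm_storeu_si128(acc + 2, _mm_add_epi32(_mm_loadu_si128(acc + 2), a2));
    _mm_storeu_si128(acc + 3, _mm_add_epi32(_mm_loadu_si128(acc + 3), a3));
}
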
diff --git a/vp8/encoder/x86/temporal_filter_x86.h b/vp8/encoder/x86/temporal_filter_x86.h
new file mode 100644
index 000000000..2daa14018
--- /dev/null
+++ b/vp8/encoder/x86/temporal_filter_x86.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H
+#define __INC_VP8_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2
+extern prototype_apply(vp8_temporal_filter_apply_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_temporal_filter_apply
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2
+
+#endif
+
+#endif
+
+#endif // __INC_VP8_TEMPORAL_FILTER_X86_H
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 6e317e2a2..c7dffc443 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -309,6 +309,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
         cpi->rtcd.quantize.fastquantb          = vp8_fast_quantize_b_sse2;
+
+        cpi->rtcd.temporal.apply               = vp8_temporal_filter_apply_sse2;
     }
 #endif
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 683d785e6..932f145e6 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -94,6 +94,7 @@ VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/dct_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/variance_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
@@ -107,6 +108,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
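
Since the RTCD hookup makes the SSE2 path a drop-in replacement for the C one, a quick equivalence check is easy to wire up (a sketch, only meaningful inside a libvpx build where both symbols link; ALIGN16 is a gcc/clang stand-in for the DECLARE_ALIGNED_ARRAY machinery, needed because the assembly uses movdqa on every buffer). Pixel deltas are kept well below 148 so the 16-bit wrap discussed above cannot trigger; widen them to watch the two paths diverge:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ALIGN16 __attribute__((aligned(16)))

extern void vp8_temporal_filter_apply_c(unsigned char *, unsigned int,
                                        unsigned char *, unsigned int,
                                        int, int, unsigned int *,
                                        unsigned short *);
extern void vp8_temporal_filter_apply_sse2(unsigned char *, unsigned int,
                                           unsigned char *, unsigned int,
                                           int, int, unsigned int *,
                                           unsigned short *);

static unsigned char  src[16 * 16]  ALIGN16;
static unsigned char  pred[16 * 16] ALIGN16;
static unsigned int   acc_c[256] ALIGN16, acc_s[256] ALIGN16;
static unsigned short cnt_c[256] ALIGN16, cnt_s[256] ALIGN16;

int main(void)
{
    int i, strength, d, p;

    srand(0);
    for (i = 0; i < 16 * 16; i++)
    {
        src[i] = rand() & 0xff;
        d = (rand() % 129) - 64;       /* |src - pred| <= 64 */
        p = src[i] + d;
        pred[i] = (unsigned char)(p < 0 ? 0 : p > 255 ? 255 : p);
    }

    /* strength 0 would shift by -1 in the C version, so start at 1 */
    for (strength = 1; strength <= 6; strength++)
    {
        memset(acc_c, 0, sizeof(acc_c));
        memset(acc_s, 0, sizeof(acc_s));
        memset(cnt_c, 0, sizeof(cnt_c));
        memset(cnt_s, 0, sizeof(cnt_s));

        vp8_temporal_filter_apply_c(src, 16, pred, 16, strength, 2,
                                    acc_c, cnt_c);
        vp8_temporal_filter_apply_sse2(src, 16, pred, 16, strength, 2,
                                       acc_s, cnt_s);

        if (memcmp(acc_c, acc_s, sizeof(acc_c)) ||
            memcmp(cnt_c, cnt_s, sizeof(cnt_c)))
            printf("strength %d: C and SSE2 outputs differ\n", strength);
    }
    return 0;
}
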