From 0cdfef1e227206dc0f6ec3cf2a529dd63e271121 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Fri, 21 Jan 2011 17:52:00 +0000 Subject: [PATCH 1/2] Modified static scene check. Added code to scan ahead a few frames when we see what we think is a static scene in the two pass GF loop to see if the conditions persist. Moved calculation of decay rate out into a function. Change-Id: I6e9c67e01ec9f555144deafc8ae67ef25bffb449 --- vp8/encoder/firstpass.c | 103 ++++++++++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 30 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index b45a229a9..a77ced78c 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -1316,6 +1316,43 @@ void vp8_end_second_pass(VP8_COMP *cpi) { } +// This function gives an estimate of how badly we believe +// the prediction quality is decaying from frame to frame. +double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame) +{ + double prediction_decay_rate; + double motion_decay; + double motion_pct = next_frame->pcnt_motion; + + + // Initial basis is the % mbs inter coded + prediction_decay_rate = next_frame->pcnt_inter; + + // High % motion -> somewhat higher decay rate + motion_decay = (1.0 - (motion_pct / 20.0)); + if (motion_decay < prediction_decay_rate) + prediction_decay_rate = motion_decay; + + // Adjustment to decay rate based on speed of motion + { + double this_mv_rabs; + double this_mv_cabs; + double distance_factor; + + this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct); + this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct); + + distance_factor = sqrt((this_mv_rabs * this_mv_rabs) + + (this_mv_cabs * this_mv_cabs)) / 250.0; + distance_factor = ((distance_factor > 1.0) + ? 0.0 : (1.0 - distance_factor)); + if (distance_factor < prediction_decay_rate) + prediction_decay_rate = distance_factor; + } + + return prediction_decay_rate; +} + // Analyse and define a gf/arf group . 
static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { @@ -1468,36 +1505,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (r > GF_RMAX) r = GF_RMAX; - // Adjust loop decay rate - //if ( next_frame.pcnt_inter < loop_decay_rate ) - loop_decay_rate = next_frame.pcnt_inter; - - // High % motion -> somewhat higher decay rate - motion_decay = (1.0 - (motion_pct / 20.0)); - if (motion_decay < loop_decay_rate) - loop_decay_rate = motion_decay; - - // Adjustment to decay rate based on speed of motion - { - double this_mv_rabs; - double this_mv_cabs; - double distance_factor; - - this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct); - this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct); - - distance_factor = sqrt((this_mv_rabs * this_mv_rabs) + - (this_mv_cabs * this_mv_cabs)) / 250.0; - distance_factor = ((distance_factor > 1.0) - ? 0.0 : (1.0 - distance_factor)); - if (distance_factor < loop_decay_rate) - loop_decay_rate = distance_factor; - } + loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame); // Cumulative effect of decay decay_accumulator = decay_accumulator * loop_decay_rate; decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; - //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator; boost_score += (decay_accumulator * r); @@ -1508,11 +1520,42 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) (loop_decay_rate >= 0.999) && (decay_accumulator < 0.9) ) { - // Force GF not alt ref - allow_alt_ref = FALSE; + int j; + FIRSTPASS_STATS * position = cpi->stats_in; + FIRSTPASS_STATS tmp_next_frame; + double decay_rate; - boost_score = old_boost_score; - break; + // Look ahead a few frames to see if static condition + // persists... 
+ for ( j = 0; j < 4; j++ ) + { + if (EOF == vp8_input_stats(cpi, &tmp_next_frame)) + break; + + decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame); + if ( decay_rate < 0.999 ) + break; + } + reset_fpf_position(cpi, position); // Reset file position + + // Force GF not alt ref + if ( j == 4 ) + { + if (0) + { + FILE *f = fopen("fadegf.stt", "a"); + fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n", + cpi->common.current_video_frame+i, i, + loop_decay_rate, decay_accumulator, + boost_score ); + fclose(f); + } + + allow_alt_ref = FALSE; + + boost_score = old_boost_score; + break; + } } // Break out conditions. From 0822a62f4051289fb3853c997b797ae3b6a006f5 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 20 Jan 2011 13:01:30 -0500 Subject: [PATCH 2/2] Modify sub-pixel filters to eliminate unnecessary calculations In sub-pixel calculation, xoffset and yoffset mostly take some specific values. Modified sub-pixel filter functions according to these possible values to improve performance. 
Change-Id: I83083570af8b00ff65093467914fbb97a4e9ea21 --- vp8/encoder/x86/variance_impl_sse2.asm | 183 +++++++++++++++++++------ vp8/encoder/x86/variance_sse2.c | 155 +++++++++++++++------ 2 files changed, 255 insertions(+), 83 deletions(-) diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index cefa0a956..7178e7e31 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2): ; unsigned char *src_ptr, ; int src_pixels_per_line, ; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, +; int xoffset, +; int yoffset, ; int *sum, ; unsigned int *sumsquared;; ; @@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM GET_GOT rbx push rsi push rdi - sub rsp, 16 + push rbx ; end prolog pxor xmm6, xmm6 ; pxor xmm7, xmm7 ; - mov rax, arg(5) ;HFilter ; - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; + lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding + movdqa xmm4, XMMWORD PTR [rsi] - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; + lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)] + movsxd rax, dword ptr arg(5) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je filter_block2d_bil_var_sse2_sp_only + + shl rax, 5 ; point to filter coeff with xoffset + lea rax, [rax + rcx] ; HFilter + + movsxd rdx, dword ptr arg(6) ; yoffset + + cmp rdx, 0 ; skip second_pass filter if yoffset=0 + je filter_block2d_bil_var_sse2_fp_only + + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; - movq xmm3, QWORD PTR [rsi+1] ; punpcklbw xmm1, xmm0 ; - - pmullw xmm1, [rax] ; + pmullw xmm1, [rax] ; punpcklbw xmm3, xmm0 - ; pmullw xmm3, [rax+16] ; + paddw xmm1, xmm3 ; - - 
paddw xmm1, [GLOBAL(xmm_bi_rd)] ; - psraw xmm1, xmm_filter_shift ; - + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; movdqa xmm5, xmm1 -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 -%endif -filter_block2d_bil_var_sse2_loop: + movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line + lea rsi, [rsi + rbx] +%if ABI_IS_32BIT=0 + movsxd r9, dword ptr arg(3) ;src_pixels_per_line +%endif + +filter_block2d_bil_var_sse2_loop: movq xmm1, QWORD PTR [rsi] ; movq xmm3, QWORD PTR [rsi+1] ; punpcklbw xmm1, xmm0 ; pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; pmullw xmm3, [rax+16] ; paddw xmm1, xmm3 ; - paddw xmm1, [GLOBAL(xmm_bi_rd)] ; - + paddw xmm1, xmm4 ; psraw xmm1, xmm_filter_shift ; + movdqa xmm3, xmm5 ; - movdqa xmm5, xmm1 ; - pmullw xmm3, [rdx] ; + pmullw xmm3, [rdx] ; pmullw xmm1, [rdx+16] ; paddw xmm1, xmm3 ; - - paddw xmm1, [GLOBAL(xmm_bi_rd)] ; + paddw xmm1, xmm4 ; psraw xmm1, xmm_filter_shift ; movq xmm3, QWORD PTR [rdi] ; @@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop: pmaddwd xmm1, xmm1 ; paddd xmm7, xmm1 ; + lea rsi, [rsi + rbx] ;ref_pixels_per_line %if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line %else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 + lea rdi, [rdi + r9] %endif sub rcx, 1 ; jnz filter_block2d_bil_var_sse2_loop ; + jmp filter_block2d_bil_variance +filter_block2d_bil_var_sse2_sp_only: + movsxd rdx, dword ptr arg(6) ; yoffset + shl rdx, 5 + lea rdx, [rdx + rcx] ; VFilter + + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rax, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movq xmm1, QWORD PTR [rsi] ; + punpcklbw xmm1, xmm0 ; + + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line 
+ lea rsi, [rsi + rax] + +filter_block2d_bil_sp_only_loop: + movq xmm3, QWORD PTR [rsi] ; + punpcklbw xmm3, xmm0 ; + movdqa xmm5, xmm3 + + pmullw xmm1, [rdx] ; + pmullw xmm3, [rdx+16] ; + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + + movdqa xmm1, xmm5 ; + lea rsi, [rsi + rax] ;ref_pixels_per_line + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_sp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_var_sse2_fp_only: + mov rsi, arg(0) ;ref_ptr + mov rdi, arg(2) ;src_ptr + movsxd rcx, dword ptr arg(4) ;Height + movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line + + pxor xmm0, xmm0 ; + movsxd rbx, dword ptr arg(3) ;src_pixels_per_line + +filter_block2d_bil_fp_only_loop: + movq xmm1, QWORD PTR [rsi] ; + movq xmm3, QWORD PTR [rsi+1] ; + + punpcklbw xmm1, xmm0 ; + pmullw xmm1, [rax] ; + punpcklbw xmm3, xmm0 ; + pmullw xmm3, [rax+16] ; + + paddw xmm1, xmm3 ; + paddw xmm1, xmm4 ; + psraw xmm1, xmm_filter_shift ; + + movq xmm3, QWORD PTR [rdi] ; + punpcklbw xmm3, xmm0 ; + + psubw xmm1, xmm3 ; + paddw xmm6, xmm1 ; + + pmaddwd xmm1, xmm1 ; + paddd xmm7, xmm1 ; + lea rsi, [rsi + rdx] + lea rdi, [rdi + rbx] ;src_pixels_per_line + + sub rcx, 1 ; + jnz filter_block2d_bil_fp_only_loop ; + + jmp filter_block2d_bil_variance + +filter_block2d_bil_variance: movdq2q mm6, xmm6 ; movdq2q mm7, xmm7 ; @@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop: movd [rsi], mm2 ; xsum movd [rdi], mm4 ; xxsum - ; begin epilog - add rsp, 16 + pop rbx pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -974,3 +1069,13 @@ SECTION_RODATA align 16 xmm_bi_rd: times 8 dw 64 +align 16 +vp8_bilinear_filters_sse2: + dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 + dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 + 
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 + dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 + dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 + dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 + dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c index 006e0a24a..6f79f0d23 100644 --- a/vp8/encoder/x86/variance_sse2.c +++ b/vp8/encoder/x86/variance_sse2.c @@ -76,8 +76,8 @@ void vp8_filter_block2d_bil_var_sse2 const unsigned char *src_ptr, int src_pixels_per_line, unsigned int Height, - const short *HFilter, - const short *VFilter, + int xoffset, + int yoffset, int *sum, unsigned int *sumsquared ); @@ -222,21 +222,6 @@ unsigned int vp8_variance8x16_wmt } -/////////////////////////////////////////////////////////////////////////// -// the mmx function that does the bilinear filtering and var calculation // -// int one pass // -/////////////////////////////////////////////////////////////////////////// -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) = -{ - { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } -}; unsigned int vp8_sub_pixel_variance4x4_wmt ( const unsigned char *src_ptr, @@ -272,15 +257,38 @@ unsigned int vp8_sub_pixel_variance8x8_wmt unsigned int *sse ) { - int xsum; unsigned int 
xxsum; - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], - &xsum, &xxsum - ); + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum, &xxsum); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum, &xxsum); + } *sse = xxsum; return (xxsum - ((xsum * xsum) >> 6)); @@ -344,7 +352,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt vp8_filter_block2d_bil_var_sse2( src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, 16, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + xoffset, yoffset, &xsum0, &xxsum0 ); @@ -352,7 +360,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt vp8_filter_block2d_bil_var_sse2( src_ptr + 8, src_pixels_per_line, dst_ptr + 8, dst_pixels_per_line, 16, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], + xoffset, yoffset, &xsum1, &xxsum1 ); } @@ -392,21 +400,56 @@ unsigned int vp8_sub_pixel_variance16x8_wmt int xsum0, xsum1; unsigned int xxsum0, xxsum1; + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], - &xsum0, &xxsum0 - ); + vp8_half_horiz_variance16x_h_sse2( + 
src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + &xsum1, &xxsum1); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); + vp8_half_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + &xsum1, &xxsum1); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + &xsum0, &xxsum0); - vp8_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], - &xsum1, &xxsum1 - ); + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + &xsum1, &xxsum1); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum0, &xxsum0); + + vp8_filter_block2d_bil_var_sse2( + src_ptr + 8, src_pixels_per_line, + dst_ptr + 8, dst_pixels_per_line, 8, + xoffset, yoffset, + &xsum1, &xxsum1); + } xsum0 += xsum1; xxsum0 += xxsum1; @@ -428,12 +471,36 @@ unsigned int vp8_sub_pixel_variance8x16_wmt { int xsum; unsigned int xxsum; - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset], - &xsum, &xxsum - ); + + if (xoffset == 4 && yoffset == 0) + { + vp8_half_horiz_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else if (xoffset == 0 && yoffset == 4) + { + vp8_half_vert_variance16x_h_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else if (xoffset == 4 && yoffset == 4) + { + vp8_half_horiz_vert_variance16x_h_sse2( + src_ptr, 
src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + &xsum, &xxsum); + } + else + { + vp8_filter_block2d_bil_var_sse2( + src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, 16, + xoffset, yoffset, + &xsum, &xxsum); + } *sse = xxsum; return (xxsum - ((xsum * xsum) >> 7));