Merge remote branch 'origin/master' into experimental

Change-Id: I2c3326f7e4d9e901f098e499973586e973e1b8fb
This commit is contained in:
John Koleszar 2011-01-22 00:05:13 -05:00
commit d4797aa8fd
3 changed files with 328 additions and 113 deletions

View File

@ -1316,6 +1316,43 @@ void vp8_end_second_pass(VP8_COMP *cpi)
{
}
// This function gives and estimate of how badly we believe
// the predicition quality is decaying from frame to frame.
double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
{
double prediction_decay_rate;
double motion_decay;
double motion_pct = next_frame->pcnt_motion;
// Initial basis is the % mbs inter coded
prediction_decay_rate = next_frame->pcnt_inter;
// High % motion -> somewhat higher decay rate
motion_decay = (1.0 - (motion_pct / 20.0));
if (motion_decay < prediction_decay_rate)
prediction_decay_rate = motion_decay;
// Adjustment to decay rate based on speed of motion
{
double this_mv_rabs;
double this_mv_cabs;
double distance_factor;
this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct);
this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct);
distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
(this_mv_cabs * this_mv_cabs)) / 250.0;
distance_factor = ((distance_factor > 1.0)
? 0.0 : (1.0 - distance_factor));
if (distance_factor < prediction_decay_rate)
prediction_decay_rate = distance_factor;
}
return prediction_decay_rate;
}
// Analyse and define a gf/arf group .
static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
@ -1468,36 +1505,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > GF_RMAX)
r = GF_RMAX;
// Adjust loop decay rate
//if ( next_frame.pcnt_inter < loop_decay_rate )
loop_decay_rate = next_frame.pcnt_inter;
// High % motion -> somewhat higher decay rate
motion_decay = (1.0 - (motion_pct / 20.0));
if (motion_decay < loop_decay_rate)
loop_decay_rate = motion_decay;
// Adjustment to decay rate based on speed of motion
{
double this_mv_rabs;
double this_mv_cabs;
double distance_factor;
this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
(this_mv_cabs * this_mv_cabs)) / 250.0;
distance_factor = ((distance_factor > 1.0)
? 0.0 : (1.0 - distance_factor));
if (distance_factor < loop_decay_rate)
loop_decay_rate = distance_factor;
}
loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
// Cumulative effect of decay
decay_accumulator = decay_accumulator * loop_decay_rate;
decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
//decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator;
boost_score += (decay_accumulator * r);
@ -1508,11 +1520,42 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
(loop_decay_rate >= 0.999) &&
(decay_accumulator < 0.9) )
{
// Force GF not alt ref
allow_alt_ref = FALSE;
int j;
FIRSTPASS_STATS * position = cpi->stats_in;
FIRSTPASS_STATS tmp_next_frame;
double decay_rate;
boost_score = old_boost_score;
break;
// Look ahead a few frames to see if static condition
// persists...
for ( j = 0; j < 4; j++ )
{
if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
break;
decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
if ( decay_rate < 0.999 )
break;
}
reset_fpf_position(cpi, position); // Reset file position
// Force GF not alt ref
if ( j == 4 )
{
if (0)
{
FILE *f = fopen("fadegf.stt", "a");
fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
cpi->common.current_video_frame+i, i,
loop_decay_rate, decay_accumulator,
boost_score );
fclose(f);
}
allow_alt_ref = FALSE;
boost_score = old_boost_score;
break;
}
}
// Break out conditions.

View File

@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2):
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
push rbx
; end prolog
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
mov rax, arg(5) ;HFilter ;
mov rdx, arg(6) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
movdqa xmm4, XMMWORD PTR [rsi]
mov rdi, arg(2) ;src_ptr ;
movsxd rcx, dword ptr arg(4) ;Height ;
lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je filter_block2d_bil_var_sse2_sp_only
shl rax, 5 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je filter_block2d_bil_var_sse2_fp_only
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
psraw xmm1, xmm_filter_shift ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
add rsi, r8
%endif
filter_block2d_bil_var_sse2_loop:
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
lea rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ;
movdqa xmm5, xmm1 ;
pmullw xmm3, [rdx] ;
pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, [GLOBAL(xmm_bi_rd)] ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop:
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
add rsi, r8
add rdi, r9
lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
movq xmm3, QWORD PTR [rsi] ;
punpcklbw xmm3, xmm0 ;
movdqa xmm5, xmm3
pmullw xmm1, [rdx] ;
pmullw xmm3, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
movdqa xmm1, xmm5 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_sp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rdx]
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_fp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop:
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
; begin epilog
add rsp, 16
pop rbx
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -974,3 +1069,13 @@ SECTION_RODATA
align 16
xmm_bi_rd:
times 8 dw 64
align 16
vp8_bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112

View File

@ -76,8 +76,8 @@ void vp8_filter_block2d_bil_var_sse2
const unsigned char *src_ptr,
int src_pixels_per_line,
unsigned int Height,
const short *HFilter,
const short *VFilter,
int xoffset,
int yoffset,
int *sum,
unsigned int *sumsquared
);
@ -222,21 +222,6 @@ unsigned int vp8_variance8x16_wmt
}
///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// int one pass //
///////////////////////////////////////////////////////////////////////////
DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
{
{ 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
{ 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
{ 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
{ 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
{ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
{ 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
{ 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
{ 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
};
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
@ -272,15 +257,38 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
unsigned int *sse
)
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
&xsum, &xxsum
);
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - ((xsum * xsum) >> 6));
@ -344,7 +352,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
xoffset, yoffset,
&xsum0, &xxsum0
);
@ -352,7 +360,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
xoffset, yoffset,
&xsum1, &xxsum1
);
}
@ -392,21 +400,56 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
int xsum0, xsum1;
unsigned int xxsum0, xxsum1;
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
&xsum0, &xxsum0
);
vp8_half_horiz_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
vp8_half_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
&xsum1, &xxsum1
);
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
&xsum1, &xxsum1);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum0, &xxsum0);
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum1, &xxsum1);
}
xsum0 += xsum1;
xxsum0 += xxsum1;
@ -428,12 +471,36 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
{
int xsum;
unsigned int xxsum;
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
&xsum, &xxsum
);
if (xoffset == 4 && yoffset == 0)
{
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else
{
vp8_filter_block2d_bil_var_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum, &xxsum);
}
*sse = xxsum;
return (xxsum - ((xsum * xsum) >> 7));