Merge remote branch 'internal/upstream' into HEAD
This commit is contained in:
		@@ -1316,6 +1316,43 @@ void vp8_end_second_pass(VP8_COMP *cpi)
 | 
			
		||||
{
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// This function gives and estimate of how badly we believe
 | 
			
		||||
// the predicition quality is decaying from frame to frame.
 | 
			
		||||
double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
 | 
			
		||||
{
 | 
			
		||||
    double prediction_decay_rate;
 | 
			
		||||
    double motion_decay;
 | 
			
		||||
    double motion_pct = next_frame->pcnt_motion;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Initial basis is the % mbs inter coded
 | 
			
		||||
    prediction_decay_rate = next_frame->pcnt_inter;
 | 
			
		||||
 | 
			
		||||
    // High % motion -> somewhat higher decay rate
 | 
			
		||||
    motion_decay = (1.0 - (motion_pct / 20.0));
 | 
			
		||||
    if (motion_decay < prediction_decay_rate)
 | 
			
		||||
        prediction_decay_rate = motion_decay;
 | 
			
		||||
 | 
			
		||||
    // Adjustment to decay rate based on speed of motion
 | 
			
		||||
    {
 | 
			
		||||
        double this_mv_rabs;
 | 
			
		||||
        double this_mv_cabs;
 | 
			
		||||
        double distance_factor;
 | 
			
		||||
 | 
			
		||||
        this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct);
 | 
			
		||||
        this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct);
 | 
			
		||||
 | 
			
		||||
        distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
 | 
			
		||||
                               (this_mv_cabs * this_mv_cabs)) / 250.0;
 | 
			
		||||
        distance_factor = ((distance_factor > 1.0)
 | 
			
		||||
                                ? 0.0 : (1.0 - distance_factor));
 | 
			
		||||
        if (distance_factor < prediction_decay_rate)
 | 
			
		||||
            prediction_decay_rate = distance_factor;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return prediction_decay_rate;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Analyse and define a gf/arf group.
 | 
			
		||||
static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 | 
			
		||||
{
 | 
			
		||||
@@ -1468,36 +1505,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 | 
			
		||||
        if (r > GF_RMAX)
 | 
			
		||||
            r = GF_RMAX;
 | 
			
		||||
 | 
			
		||||
        // Adjust loop decay rate
 | 
			
		||||
        //if ( next_frame.pcnt_inter < loop_decay_rate )
 | 
			
		||||
        loop_decay_rate = next_frame.pcnt_inter;
 | 
			
		||||
 | 
			
		||||
        // High % motion -> somewhat higher decay rate
 | 
			
		||||
        motion_decay = (1.0 - (motion_pct / 20.0));
 | 
			
		||||
        if (motion_decay < loop_decay_rate)
 | 
			
		||||
            loop_decay_rate = motion_decay;
 | 
			
		||||
 | 
			
		||||
        // Adjustment to decay rate based on speed of motion
 | 
			
		||||
        {
 | 
			
		||||
            double this_mv_rabs;
 | 
			
		||||
            double this_mv_cabs;
 | 
			
		||||
            double distance_factor;
 | 
			
		||||
 | 
			
		||||
            this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
 | 
			
		||||
            this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
 | 
			
		||||
 | 
			
		||||
            distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
 | 
			
		||||
                                   (this_mv_cabs * this_mv_cabs)) / 250.0;
 | 
			
		||||
            distance_factor = ((distance_factor > 1.0)
 | 
			
		||||
                                    ? 0.0 : (1.0 - distance_factor));
 | 
			
		||||
            if (distance_factor < loop_decay_rate)
 | 
			
		||||
                loop_decay_rate = distance_factor;
 | 
			
		||||
        }
 | 
			
		||||
        loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
 | 
			
		||||
 | 
			
		||||
        // Cumulative effect of decay
 | 
			
		||||
        decay_accumulator = decay_accumulator * loop_decay_rate;
 | 
			
		||||
        decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
 | 
			
		||||
        //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator;
 | 
			
		||||
 | 
			
		||||
        boost_score += (decay_accumulator * r);
 | 
			
		||||
 | 
			
		||||
@@ -1508,12 +1520,43 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 | 
			
		||||
             (loop_decay_rate >= 0.999) &&
 | 
			
		||||
             (decay_accumulator < 0.9) )
 | 
			
		||||
        {
 | 
			
		||||
            int j;
 | 
			
		||||
            FIRSTPASS_STATS * position = cpi->stats_in;
 | 
			
		||||
            FIRSTPASS_STATS tmp_next_frame;
 | 
			
		||||
            double decay_rate;
 | 
			
		||||
 | 
			
		||||
            // Look ahead a few frames to see if static condition
 | 
			
		||||
            // persists...
 | 
			
		||||
            for ( j = 0; j < 4; j++ )
 | 
			
		||||
            {
 | 
			
		||||
                if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
 | 
			
		||||
                    break;
 | 
			
		||||
 | 
			
		||||
                decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
 | 
			
		||||
                if ( decay_rate < 0.999 )
 | 
			
		||||
                    break;
 | 
			
		||||
            }
 | 
			
		||||
            reset_fpf_position(cpi, position);            // Reset file position
 | 
			
		||||
 | 
			
		||||
            // Force GF not alt ref
 | 
			
		||||
            if ( j == 4 )
 | 
			
		||||
            {
 | 
			
		||||
                if (0)
 | 
			
		||||
                {
 | 
			
		||||
                    FILE *f = fopen("fadegf.stt", "a");
 | 
			
		||||
                    fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
 | 
			
		||||
                         cpi->common.current_video_frame+i, i,
 | 
			
		||||
                         loop_decay_rate, decay_accumulator,
 | 
			
		||||
                         boost_score );
 | 
			
		||||
                    fclose(f);
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                allow_alt_ref = FALSE;
 | 
			
		||||
 | 
			
		||||
                boost_score = old_boost_score;
 | 
			
		||||
                break;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Break out conditions.
 | 
			
		||||
        if  (   /* i>4 || */
 | 
			
		||||
 
 | 
			
		||||
@@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2):
 | 
			
		||||
;    unsigned char *src_ptr,
 | 
			
		||||
;    int src_pixels_per_line,
 | 
			
		||||
;    unsigned int Height,
 | 
			
		||||
;    unsigned short *HFilter,
 | 
			
		||||
;    unsigned short *VFilter,
 | 
			
		||||
;    int  xoffset,
 | 
			
		||||
;    int  yoffset,
 | 
			
		||||
;    int *sum,
 | 
			
		||||
;    unsigned int *sumsquared;;
 | 
			
		||||
;
 | 
			
		||||
@@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2):
 | 
			
		||||
    push        rbp
 | 
			
		||||
    mov         rbp, rsp
 | 
			
		||||
    SHADOW_ARGS_TO_STACK 9
 | 
			
		||||
    SAVE_XMM
 | 
			
		||||
    GET_GOT     rbx
 | 
			
		||||
    push rsi
 | 
			
		||||
    push rdi
 | 
			
		||||
    sub         rsp, 16
 | 
			
		||||
    push rbx
 | 
			
		||||
    ; end prolog
 | 
			
		||||
 | 
			
		||||
        pxor            xmm6,           xmm6                 ;
 | 
			
		||||
        pxor            xmm7,           xmm7                 ;
 | 
			
		||||
        mov             rax,            arg(5) ;HFilter             ;
 | 
			
		||||
 | 
			
		||||
        mov             rdx,            arg(6) ;VFilter             ;
 | 
			
		||||
        mov             rsi,            arg(0) ;ref_ptr              ;
 | 
			
		||||
        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
 | 
			
		||||
        movdqa          xmm4,           XMMWORD PTR [rsi]
 | 
			
		||||
 | 
			
		||||
        mov             rdi,            arg(2) ;src_ptr              ;
 | 
			
		||||
        movsxd          rcx,            dword ptr arg(4) ;Height              ;
 | 
			
		||||
        lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
 | 
			
		||||
        movsxd          rax,            dword ptr arg(5)     ; xoffset
 | 
			
		||||
 | 
			
		||||
        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
 | 
			
		||||
        je              filter_block2d_bil_var_sse2_sp_only
 | 
			
		||||
 | 
			
		||||
        shl             rax,            5                    ; point to filter coeff with xoffset
 | 
			
		||||
        lea             rax,            [rax + rcx]          ; HFilter
 | 
			
		||||
 | 
			
		||||
        movsxd          rdx,            dword ptr arg(6)     ; yoffset
 | 
			
		||||
 | 
			
		||||
        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
 | 
			
		||||
        je              filter_block2d_bil_var_sse2_fp_only
 | 
			
		||||
 | 
			
		||||
        shl             rdx,            5
 | 
			
		||||
        lea             rdx,            [rdx + rcx]          ; VFilter
 | 
			
		||||
 | 
			
		||||
        mov             rsi,            arg(0)               ;ref_ptr
 | 
			
		||||
        mov             rdi,            arg(2)               ;src_ptr
 | 
			
		||||
        movsxd          rcx,            dword ptr arg(4)     ;Height
 | 
			
		||||
 | 
			
		||||
        pxor            xmm0,           xmm0                 ;
 | 
			
		||||
        movq            xmm1,           QWORD PTR [rsi]      ;
 | 
			
		||||
 | 
			
		||||
        movq            xmm3,           QWORD PTR [rsi+1]    ;
 | 
			
		||||
        punpcklbw       xmm1,           xmm0                 ;
 | 
			
		||||
 | 
			
		||||
        punpcklbw       xmm1,           xmm0                 ;
 | 
			
		||||
        pmullw          xmm1,           [rax]                ;
 | 
			
		||||
        punpcklbw       xmm3,           xmm0
 | 
			
		||||
            ;
 | 
			
		||||
        pmullw          xmm3,           [rax+16]             ;
 | 
			
		||||
 | 
			
		||||
        paddw           xmm1,           xmm3                 ;
 | 
			
		||||
 | 
			
		||||
        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
 | 
			
		||||
        paddw           xmm1,           xmm4                 ;
 | 
			
		||||
        psraw           xmm1,           xmm_filter_shift     ;
 | 
			
		||||
 | 
			
		||||
        movdqa          xmm5,           xmm1
 | 
			
		||||
%if ABI_IS_32BIT
 | 
			
		||||
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
 | 
			
		||||
%else
 | 
			
		||||
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
 | 
			
		||||
        add             rsi,            r8
 | 
			
		||||
%endif
 | 
			
		||||
filter_block2d_bil_var_sse2_loop:
 | 
			
		||||
 | 
			
		||||
        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
 | 
			
		||||
        lea             rsi,            [rsi + rbx]
 | 
			
		||||
%if ABI_IS_32BIT=0
 | 
			
		||||
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
 | 
			
		||||
%endif
 | 
			
		||||
 | 
			
		||||
filter_block2d_bil_var_sse2_loop:
 | 
			
		||||
        movq            xmm1,           QWORD PTR [rsi]               ;
 | 
			
		||||
        movq            xmm3,           QWORD PTR [rsi+1]             ;
 | 
			
		||||
 | 
			
		||||
        punpcklbw       xmm1,           xmm0                 ;
 | 
			
		||||
        pmullw          xmm1,           [rax]               ;
 | 
			
		||||
 | 
			
		||||
        punpcklbw       xmm3,           xmm0                 ;
 | 
			
		||||
        pmullw          xmm3,           [rax+16]             ;
 | 
			
		||||
 | 
			
		||||
        paddw           xmm1,           xmm3                 ;
 | 
			
		||||
        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
 | 
			
		||||
 | 
			
		||||
        paddw           xmm1,           xmm4               ;
 | 
			
		||||
        psraw           xmm1,           xmm_filter_shift    ;
 | 
			
		||||
 | 
			
		||||
        movdqa          xmm3,           xmm5                 ;
 | 
			
		||||
 | 
			
		||||
        movdqa          xmm5,           xmm1                 ;
 | 
			
		||||
        pmullw          xmm3,           [rdx]               ;
 | 
			
		||||
 | 
			
		||||
        pmullw          xmm3,           [rdx]               ;
 | 
			
		||||
        pmullw          xmm1,           [rdx+16]             ;
 | 
			
		||||
        paddw           xmm1,           xmm3                 ;
 | 
			
		||||
 | 
			
		||||
        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]  ;
 | 
			
		||||
        paddw           xmm1,           xmm4                 ;
 | 
			
		||||
        psraw           xmm1,           xmm_filter_shift    ;
 | 
			
		||||
 | 
			
		||||
        movq            xmm3,           QWORD PTR [rdi]               ;
 | 
			
		||||
@@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop:
 | 
			
		||||
        pmaddwd         xmm1,           xmm1                 ;
 | 
			
		||||
        paddd           xmm7,           xmm1                 ;
 | 
			
		||||
 | 
			
		||||
        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
 | 
			
		||||
%if ABI_IS_32BIT
 | 
			
		||||
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
 | 
			
		||||
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
 | 
			
		||||
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
 | 
			
		||||
%else
 | 
			
		||||
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
 | 
			
		||||
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
 | 
			
		||||
        add             rsi,            r8
 | 
			
		||||
        add             rdi,            r9
 | 
			
		||||
        lea             rdi,            [rdi + r9]
 | 
			
		||||
%endif
 | 
			
		||||
 | 
			
		||||
        sub             rcx,            1                   ;
 | 
			
		||||
        jnz             filter_block2d_bil_var_sse2_loop       ;
 | 
			
		||||
 | 
			
		||||
        jmp             filter_block2d_bil_variance
 | 
			
		||||
 | 
			
		||||
filter_block2d_bil_var_sse2_sp_only:
 | 
			
		||||
        movsxd          rdx,            dword ptr arg(6)     ; yoffset
 | 
			
		||||
        shl             rdx,            5
 | 
			
		||||
        lea             rdx,            [rdx + rcx]          ; VFilter
 | 
			
		||||
 | 
			
		||||
        mov             rsi,            arg(0)               ;ref_ptr
 | 
			
		||||
        mov             rdi,            arg(2)               ;src_ptr
 | 
			
		||||
        movsxd          rcx,            dword ptr arg(4)     ;Height
 | 
			
		||||
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
 | 
			
		||||
 | 
			
		||||
        pxor            xmm0,           xmm0                 ;
 | 
			
		||||
        movq            xmm1,           QWORD PTR [rsi]      ;
 | 
			
		||||
        punpcklbw       xmm1,           xmm0                 ;
 | 
			
		||||
 | 
			
		||||
        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
 | 
			
		||||
        lea             rsi,            [rsi + rax]
 | 
			
		||||
 | 
			
		||||
filter_block2d_bil_sp_only_loop:
 | 
			
		||||
        movq            xmm3,           QWORD PTR [rsi]             ;
 | 
			
		||||
        punpcklbw       xmm3,           xmm0                 ;
 | 
			
		||||
        movdqa          xmm5,           xmm3
 | 
			
		||||
 | 
			
		||||
        pmullw          xmm1,           [rdx]               ;
 | 
			
		||||
        pmullw          xmm3,           [rdx+16]             ;
 | 
			
		||||
        paddw           xmm1,           xmm3                 ;
 | 
			
		||||
        paddw           xmm1,           xmm4                 ;
 | 
			
		||||
        psraw           xmm1,           xmm_filter_shift    ;
 | 
			
		||||
 | 
			
		||||
        movq            xmm3,           QWORD PTR [rdi]               ;
 | 
			
		||||
        punpcklbw       xmm3,           xmm0                 ;
 | 
			
		||||
 | 
			
		||||
        psubw           xmm1,           xmm3                 ;
 | 
			
		||||
        paddw           xmm6,           xmm1                 ;
 | 
			
		||||
 | 
			
		||||
        pmaddwd         xmm1,           xmm1                 ;
 | 
			
		||||
        paddd           xmm7,           xmm1                 ;
 | 
			
		||||
 | 
			
		||||
        movdqa          xmm1,           xmm5                 ;
 | 
			
		||||
        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
 | 
			
		||||
        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
 | 
			
		||||
 | 
			
		||||
        sub             rcx,            1                   ;
 | 
			
		||||
        jnz             filter_block2d_bil_sp_only_loop       ;
 | 
			
		||||
 | 
			
		||||
        jmp             filter_block2d_bil_variance
 | 
			
		||||
 | 
			
		||||
filter_block2d_bil_var_sse2_fp_only:
 | 
			
		||||
        mov             rsi,            arg(0)               ;ref_ptr
 | 
			
		||||
        mov             rdi,            arg(2)               ;src_ptr
 | 
			
		||||
        movsxd          rcx,            dword ptr arg(4)     ;Height
 | 
			
		||||
        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
 | 
			
		||||
 | 
			
		||||
        pxor            xmm0,           xmm0                 ;
 | 
			
		||||
        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
 | 
			
		||||
 | 
			
		||||
filter_block2d_bil_fp_only_loop:
 | 
			
		||||
        movq            xmm1,           QWORD PTR [rsi]       ;
 | 
			
		||||
        movq            xmm3,           QWORD PTR [rsi+1]     ;
 | 
			
		||||
 | 
			
		||||
        punpcklbw       xmm1,           xmm0                 ;
 | 
			
		||||
        pmullw          xmm1,           [rax]               ;
 | 
			
		||||
        punpcklbw       xmm3,           xmm0                 ;
 | 
			
		||||
        pmullw          xmm3,           [rax+16]             ;
 | 
			
		||||
 | 
			
		||||
        paddw           xmm1,           xmm3                 ;
 | 
			
		||||
        paddw           xmm1,           xmm4  ;
 | 
			
		||||
        psraw           xmm1,           xmm_filter_shift    ;
 | 
			
		||||
 | 
			
		||||
        movq            xmm3,           QWORD PTR [rdi]     ;
 | 
			
		||||
        punpcklbw       xmm3,           xmm0                 ;
 | 
			
		||||
 | 
			
		||||
        psubw           xmm1,           xmm3                 ;
 | 
			
		||||
        paddw           xmm6,           xmm1                 ;
 | 
			
		||||
 | 
			
		||||
        pmaddwd         xmm1,           xmm1                 ;
 | 
			
		||||
        paddd           xmm7,           xmm1                 ;
 | 
			
		||||
        lea             rsi,            [rsi + rdx]
 | 
			
		||||
        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
 | 
			
		||||
 | 
			
		||||
        sub             rcx,            1                   ;
 | 
			
		||||
        jnz             filter_block2d_bil_fp_only_loop       ;
 | 
			
		||||
 | 
			
		||||
        jmp             filter_block2d_bil_variance
 | 
			
		||||
 | 
			
		||||
filter_block2d_bil_variance:
 | 
			
		||||
        movdq2q         mm6,            xmm6                ;
 | 
			
		||||
        movdq2q         mm7,            xmm7                ;
 | 
			
		||||
 | 
			
		||||
@@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop:
 | 
			
		||||
        movd            [rsi],          mm2    ; xsum
 | 
			
		||||
        movd            [rdi],          mm4    ; xxsum
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ; begin epilog
 | 
			
		||||
    add rsp, 16
 | 
			
		||||
    pop rbx
 | 
			
		||||
    pop rdi
 | 
			
		||||
    pop rsi
 | 
			
		||||
    RESTORE_GOT
 | 
			
		||||
    RESTORE_XMM
 | 
			
		||||
    UNSHADOW_ARGS
 | 
			
		||||
    pop         rbp
 | 
			
		||||
    ret
 | 
			
		||||
@@ -974,3 +1069,13 @@ SECTION_RODATA
 | 
			
		||||
align 16
; Rounding term (eight words of 64) added to the filtered sums before the
; arithmetic shift by xmm_filter_shift.
xmm_bi_rd:
    times 8 dw 64

align 16
; Bilinear filter taps, one 32-byte row per sub-pixel offset (the code
; indexes this table with offset << 5). Each row is eight words of the
; first tap followed by eight words of the second tap; the loops read
; them as [row] and [row+16].
vp8_bilinear_filters_sse2:
    ; offset 0
    times 8 dw 128
    times 8 dw 0
    ; offset 1
    times 8 dw 112
    times 8 dw 16
    ; offset 2
    times 8 dw 96
    times 8 dw 32
    ; offset 3
    times 8 dw 80
    times 8 dw 48
    ; offset 4
    times 8 dw 64
    times 8 dw 64
    ; offset 5
    times 8 dw 48
    times 8 dw 80
    ; offset 6
    times 8 dw 32
    times 8 dw 96
    ; offset 7
    times 8 dw 16
    times 8 dw 112
 | 
			
		||||
 
 | 
			
		||||
@@ -76,8 +76,8 @@ void vp8_filter_block2d_bil_var_sse2
 | 
			
		||||
    const unsigned char *src_ptr,
 | 
			
		||||
    int src_pixels_per_line,
 | 
			
		||||
    unsigned int Height,
 | 
			
		||||
    const short *HFilter,
 | 
			
		||||
    const short *VFilter,
 | 
			
		||||
    int  xoffset,
 | 
			
		||||
    int  yoffset,
 | 
			
		||||
    int *sum,
 | 
			
		||||
    unsigned int *sumsquared
 | 
			
		||||
);
 | 
			
		||||
@@ -222,21 +222,6 @@ unsigned int vp8_variance8x16_wmt
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// the mmx function that does the bilinear filtering and var calculation //
 | 
			
		||||
// in one pass                                                           //
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Bilinear filter taps laid out for 8-lane word multiplies. The table is
// indexed by xoffset / yoffset at the call sites. Row k holds eight copies
// of the first tap (128 - 16 * k) followed by eight copies of the second
// tap (16 * k), so the two taps of every row sum to 128.
DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
{
    /* 0 */ { 128, 128, 128, 128, 128, 128, 128, 128,   0,   0,   0,   0,   0,   0,   0,   0 },
    /* 1 */ { 112, 112, 112, 112, 112, 112, 112, 112,  16,  16,  16,  16,  16,  16,  16,  16 },
    /* 2 */ {  96,  96,  96,  96,  96,  96,  96,  96,  32,  32,  32,  32,  32,  32,  32,  32 },
    /* 3 */ {  80,  80,  80,  80,  80,  80,  80,  80,  48,  48,  48,  48,  48,  48,  48,  48 },
    /* 4 */ {  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
    /* 5 */ {  48,  48,  48,  48,  48,  48,  48,  48,  80,  80,  80,  80,  80,  80,  80,  80 },
    /* 6 */ {  32,  32,  32,  32,  32,  32,  32,  32,  96,  96,  96,  96,  96,  96,  96,  96 },
    /* 7 */ {  16,  16,  16,  16,  16,  16,  16,  16, 112, 112, 112, 112, 112, 112, 112, 112 }
};
 | 
			
		||||
unsigned int vp8_sub_pixel_variance4x4_wmt
 | 
			
		||||
(
 | 
			
		||||
    const unsigned char  *src_ptr,
 | 
			
		||||
@@ -272,15 +257,38 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
 | 
			
		||||
    unsigned int *sse
 | 
			
		||||
)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
    int xsum;
 | 
			
		||||
    unsigned int xxsum;
 | 
			
		||||
 | 
			
		||||
    if (xoffset == 4 && yoffset == 0)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_horiz_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum, &xxsum);
 | 
			
		||||
    }
 | 
			
		||||
    else if (xoffset == 0 && yoffset == 4)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_vert_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum, &xxsum);
 | 
			
		||||
    }
 | 
			
		||||
    else if (xoffset == 4 && yoffset == 4)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_horiz_vert_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum, &xxsum);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        vp8_filter_block2d_bil_var_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 8,
 | 
			
		||||
        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
 | 
			
		||||
        &xsum, &xxsum
 | 
			
		||||
    );
 | 
			
		||||
            xoffset, yoffset,
 | 
			
		||||
            &xsum, &xxsum);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    *sse = xxsum;
 | 
			
		||||
    return (xxsum - ((xsum * xsum) >> 6));
 | 
			
		||||
@@ -344,7 +352,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
 | 
			
		||||
        vp8_filter_block2d_bil_var_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 16,
 | 
			
		||||
            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
 | 
			
		||||
            xoffset, yoffset,
 | 
			
		||||
            &xsum0, &xxsum0
 | 
			
		||||
        );
 | 
			
		||||
 | 
			
		||||
@@ -352,7 +360,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
 | 
			
		||||
        vp8_filter_block2d_bil_var_sse2(
 | 
			
		||||
            src_ptr + 8, src_pixels_per_line,
 | 
			
		||||
            dst_ptr + 8, dst_pixels_per_line, 16,
 | 
			
		||||
            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
 | 
			
		||||
            xoffset, yoffset,
 | 
			
		||||
            &xsum1, &xxsum1
 | 
			
		||||
        );
 | 
			
		||||
    }
 | 
			
		||||
@@ -392,21 +400,56 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
 | 
			
		||||
    int xsum0, xsum1;
 | 
			
		||||
    unsigned int xxsum0, xxsum1;
 | 
			
		||||
 | 
			
		||||
    if (xoffset == 4 && yoffset == 0)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_horiz_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum0, &xxsum0);
 | 
			
		||||
 | 
			
		||||
        vp8_half_horiz_variance16x_h_sse2(
 | 
			
		||||
            src_ptr + 8, src_pixels_per_line,
 | 
			
		||||
            dst_ptr + 8, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum1, &xxsum1);
 | 
			
		||||
    }
 | 
			
		||||
    else if (xoffset == 0 && yoffset == 4)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_vert_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum0, &xxsum0);
 | 
			
		||||
 | 
			
		||||
        vp8_half_vert_variance16x_h_sse2(
 | 
			
		||||
            src_ptr + 8, src_pixels_per_line,
 | 
			
		||||
            dst_ptr + 8, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum1, &xxsum1);
 | 
			
		||||
    }
 | 
			
		||||
    else if (xoffset == 4 && yoffset == 4)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_horiz_vert_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum0, &xxsum0);
 | 
			
		||||
 | 
			
		||||
        vp8_half_horiz_vert_variance16x_h_sse2(
 | 
			
		||||
            src_ptr + 8, src_pixels_per_line,
 | 
			
		||||
            dst_ptr + 8, dst_pixels_per_line, 8,
 | 
			
		||||
            &xsum1, &xxsum1);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        vp8_filter_block2d_bil_var_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 8,
 | 
			
		||||
        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
 | 
			
		||||
        &xsum0, &xxsum0
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
            xoffset, yoffset,
 | 
			
		||||
            &xsum0, &xxsum0);
 | 
			
		||||
 | 
			
		||||
        vp8_filter_block2d_bil_var_sse2(
 | 
			
		||||
            src_ptr + 8, src_pixels_per_line,
 | 
			
		||||
            dst_ptr + 8, dst_pixels_per_line, 8,
 | 
			
		||||
        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
 | 
			
		||||
        &xsum1, &xxsum1
 | 
			
		||||
    );
 | 
			
		||||
            xoffset, yoffset,
 | 
			
		||||
            &xsum1, &xxsum1);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    xsum0 += xsum1;
 | 
			
		||||
    xxsum0 += xxsum1;
 | 
			
		||||
@@ -428,12 +471,36 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
 | 
			
		||||
{
 | 
			
		||||
    int xsum;
 | 
			
		||||
    unsigned int xxsum;
 | 
			
		||||
 | 
			
		||||
    if (xoffset == 4 && yoffset == 0)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_horiz_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 16,
 | 
			
		||||
            &xsum, &xxsum);
 | 
			
		||||
    }
 | 
			
		||||
    else if (xoffset == 0 && yoffset == 4)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_vert_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 16,
 | 
			
		||||
            &xsum, &xxsum);
 | 
			
		||||
    }
 | 
			
		||||
    else if (xoffset == 4 && yoffset == 4)
 | 
			
		||||
    {
 | 
			
		||||
        vp8_half_horiz_vert_variance16x_h_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 16,
 | 
			
		||||
            &xsum, &xxsum);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        vp8_filter_block2d_bil_var_sse2(
 | 
			
		||||
            src_ptr, src_pixels_per_line,
 | 
			
		||||
            dst_ptr, dst_pixels_per_line, 16,
 | 
			
		||||
        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
 | 
			
		||||
        &xsum, &xxsum
 | 
			
		||||
    );
 | 
			
		||||
            xoffset, yoffset,
 | 
			
		||||
            &xsum, &xxsum);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    *sse = xxsum;
 | 
			
		||||
    return (xxsum - ((xsum * xsum) >> 7));
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user