remove mmx variance functions

There are SSE2 equivalents of these functions, and SSE2 is a reasonable modern baseline.

Removed MMX variance functions:
  vpx_get_mb_ss_mmx()
  vpx_get8x8var_mmx()
  vpx_get4x4var_mmx()
  vpx_variance4x4_mmx()
  vpx_variance8x8_mmx()
  vpx_mse16x16_mmx()
  vpx_variance16x16_mmx()
  vpx_variance16x8_mmx()
  vpx_variance8x16_mmx()

Change-Id: Iffaf85344c6676a3dd337c0645a2dd5deb2f86a1
2016-05-11 12:39:42 -07:00
parent 57566ff24a
commit d0ffae825d
4 changed files with 8 additions and 519 deletions
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -977,20 +977,6 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_mmx)));
 INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
                        ::testing::Values(vpx_get_mb_ss_mmx));
 INSTANTIATE_TEST_CASE_P(
    MMX, VpxVarianceTest,
    ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_mmx, 0),
                      make_tuple(4, 3, &vpx_variance16x8_mmx, 0),
                      make_tuple(3, 4, &vpx_variance8x16_mmx, 0),
                      make_tuple(3, 3, &vpx_variance8x8_mmx, 0),
                      make_tuple(2, 2, &vpx_variance4x4_mmx, 0)));
 INSTANTIATE_TEST_CASE_P(
    MMX, VpxSubpelVarianceTest,
    ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1407,16 +1407,16 @@ add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int sourc
  specialize qw/vpx_variance16x32 sse2 msa/;
 add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
+  specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/;
 add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
+  specialize qw/vpx_variance16x8 sse2 neon msa/;
 add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
+  specialize qw/vpx_variance8x16 sse2 neon msa/;
 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
+  specialize qw/vpx_variance8x8 sse2 media neon msa/;
 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
  specialize qw/vpx_variance8x4 sse2 msa/;
@@ -1425,7 +1425,7 @@ add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_
  specialize qw/vpx_variance4x8 sse2 msa/;
 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x4 mmx sse2 msa/;
+  specialize qw/vpx_variance4x4 sse2 msa/;
 #
 # Specialty Variance
@@ -1434,10 +1434,10 @@ add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride,
  specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var mmx sse2 neon msa/;
+  specialize qw/vpx_get8x8var sse2 neon msa/;
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
+  specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/;
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
  specialize qw/vpx_mse16x8 sse2 msa/;
@@ -1449,7 +1449,7 @@ add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stri
  specialize qw/vpx_mse8x8 sse2 msa/;
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
-  specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+  specialize qw/vpx_get_mb_ss sse2 msa/;
 add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
  specialize qw/vpx_get4x4sse_cs neon msa/;
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ b/vpx_dsp/x86/variance_impl_mmx.asm
@@ -13,407 +13,6 @@
 %define mmx_filter_shift            7
 ;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
 ;
 ; Returns in rax the sum of the squares of the 256 signed 16-bit values
 ; starting at src_ptr (16 loop iterations, 16 words per iteration).
 ; Only arg(0) is read by the body.
 global sym(vpx_get_mb_ss_mmx) PRIVATE
 sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 8                          ; 8-byte scratch slot for the mm4 spill
    ; end prolog
        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16                     ; loop counter: 16 iterations
        pxor        mm4, mm4                    ; mm4 = two 32-bit running sums, start at zero
 .NEXTROW:
        ; Load 32 bytes (16 words) from the source.
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        ; Square each word and add adjacent products into dwords.
        pmaddwd     mm0, mm0
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3
        ; Accumulate the partial sums into mm4.
        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3
        add         rax, 32                     ; advance to the next 16 words
        dec         rcx
        ; NOTE(review): dec leaves CF unchanged, so ja (taken when CF=0 and
        ; ZF=0) depends on the CF value produced by the add above; jnz would
        ; test the counter alone.
        ja          .NEXTROW
        movq        QWORD PTR [rsp], mm4        ; spill both dword sums to the stack
        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx
    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vpx_get8x8var_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int  source_stride,
 ;    unsigned char *ref_ptr,
 ;    int  recon_stride,
 ;    unsigned int *SSE,
 ;    int *Sum
 ;)
 ;
 ; For an 8x8 block of bytes, stores the sum of squared differences
 ; (src - ref) to *SSE and the sum of the differences to *Sum.
 ; The eight rows are fully unrolled; the ref row for the next iteration is
 ; loaded into mm1 at the end of the previous one. rax is zeroed on return.
 global sym(vpx_get8x8var_mmx) PRIVATE
 sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ; end prolog
        pxor        mm5, mm5                    ; Clear mm5: 16-bit difference accumulator
        pxor        mm6, mm6                    ; Clear mm6: zero source for byte->word unpacking
        pxor        mm7, mm7                    ; Clear mm7: 32-bit squared-difference accumulator
        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
        ; Row 1
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2
        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7
        ; Row 2
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2
        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7
        ; Row 3
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2
        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7
        ; Row 4
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2
        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7
        ; Row 5
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2
        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
        ;              movq        mm4, [rbx + rdx]
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7
        ; Row 6
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2
        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7
        ; Row 7
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2
        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7
        ; Row 8
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2
        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7
        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
        ; Sign-extend and add the four 16-bit difference lanes.
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        ; Add the two 32-bit squared-difference lanes.
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0
    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void
 ;vpx_get4x4var_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int  source_stride,
 ;    unsigned char *ref_ptr,
 ;    int  recon_stride,
 ;    unsigned int *SSE,
 ;    int *Sum
 ;)
 ;
 ; For a 4x4 block of bytes, stores the sum of squared differences
 ; (src - ref) to *SSE and the sum of the differences to *Sum.
 ; The four rows are fully unrolled; the ref row for the next iteration is
 ; loaded into mm1 at the end of the previous one. rax is zeroed on return.
 global sym(vpx_get4x4var_mmx) PRIVATE
 sym(vpx_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ; end prolog
        pxor        mm5, mm5                    ; Clear mm5: 16-bit difference accumulator
        pxor        mm6, mm6                    ; Clear mm6: zero source for byte->word unpacking
        pxor        mm7, mm7                    ; Clear mm7: 32-bit squared-difference accumulator
        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
        ; Row 1
        movd        mm0, [rax]                  ; Copy four bytes to mm0
        movd        mm1, [rbx]                  ; Copy four bytes to mm1
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        paddw       mm5, mm0                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        ; Row 2
        movd        mm0, [rax]                  ; Copy four bytes to mm0
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        paddw       mm5, mm0                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        ; Row 3
        movd        mm0, [rax]                  ; Copy four bytes to mm0
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        paddw       mm5, mm0                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the src data
        movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        ; Row 4
        movd        mm0, [rax]                  ; Copy four bytes to mm0
        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        paddw       mm5, mm0                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and accumulate
        paddd       mm7, mm0                    ; accumulate in mm7
        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
        ; Sign-extend and add the four 16-bit difference lanes.
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        ; Add the two 32-bit squared-difference lanes.
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0
    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
 ;void vpx_filter_block2d_bil4x4_var_mmx
 ;(
 ;    unsigned char *ref_ptr,
--- a/vpx_dsp/x86/variance_mmx.c
+++ b/vpx_dsp/x86/variance_mmx.c
@@ -23,10 +23,6 @@ DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
  {  16,  16,  16,  16, 112, 112, 112, 112 }
 };
 extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
                              unsigned int *sse, int *sum);
 extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
@@ -47,98 +43,6 @@ extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
                                           unsigned int *sumsquared);
 // 4x4 variance: stores the sum of squared differences between blocks a and b
 // in *sse and returns that value minus ((sum of differences)^2 >> 4).
 unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
     unsigned int sq_diff_total;
     unsigned int mean_term;
     int diff_total;
     vpx_get4x4var_mmx(a, a_stride, b, b_stride, &sq_diff_total, &diff_total);
     *sse = sq_diff_total;
     // 16 pixels in the block, hence the shift by 4.
     mean_term = ((unsigned int)diff_total * diff_total) >> 4;
     return sq_diff_total - mean_term;
 }
 // 8x8 variance: stores the sum of squared differences between blocks a and b
 // in *sse and returns that value minus ((sum of differences)^2 >> 6).
 unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
     unsigned int sq_diff_total;
     unsigned int mean_term;
     int diff_total;
     vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sq_diff_total, &diff_total);
     *sse = sq_diff_total;
     // 64 pixels in the block, hence the shift by 6.
     mean_term = ((unsigned int)diff_total * diff_total) >> 6;
     return sq_diff_total - mean_term;
 }
 // 16x16 sum of squared differences, built from four 8x8 quadrant calls
 // (top-left, top-right, bottom-left, bottom-right). The total is stored in
 // *sse and also returned; the per-quadrant difference sums are not used.
 unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
                               const unsigned char *b, int b_stride,
                               unsigned int *sse) {
     unsigned int total_sse = 0;
     int quad;
     for (quad = 0; quad < 4; ++quad) {
         const int col = (quad & 1) * 8;
         const int row = (quad >> 1) * 8;
         unsigned int part_sse;
         int part_sum;
         vpx_get8x8var_mmx(a + row * a_stride + col, a_stride,
                           b + row * b_stride + col, b_stride,
                           &part_sse, &part_sum);
         total_sse += part_sse;
     }
     *sse = total_sse;
     return total_sse;
 }
 // 16x16 variance, built from four 8x8 quadrant calls (top-left, top-right,
 // bottom-left, bottom-right). Stores the summed squared differences in *sse
 // and returns that value minus ((summed differences)^2 >> 8).
 unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
                                    const unsigned char *b, int b_stride,
                                    unsigned int *sse) {
     unsigned int total_sse = 0;
     int total_sum = 0;
     int quad;
     for (quad = 0; quad < 4; ++quad) {
         const int col = (quad & 1) * 8;
         const int row = (quad >> 1) * 8;
         unsigned int part_sse;
         int part_sum;
         vpx_get8x8var_mmx(a + row * a_stride + col, a_stride,
                           b + row * b_stride + col, b_stride,
                           &part_sse, &part_sum);
         total_sse += part_sse;
         total_sum += part_sum;
     }
     *sse = total_sse;
     // 256 pixels in the block, hence the shift by 8.
     return total_sse - (((unsigned int)total_sum * total_sum) >> 8);
 }
 // 16x8 variance, built from two side-by-side 8x8 calls (left, then right).
 // Stores the summed squared differences in *sse and returns that value
 // minus ((summed differences)^2 >> 7).
 unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
                                   const unsigned char *b, int b_stride,
                                   unsigned int *sse) {
     unsigned int total_sse = 0;
     int total_sum = 0;
     int col;
     for (col = 0; col < 16; col += 8) {
         unsigned int part_sse;
         int part_sum;
         vpx_get8x8var_mmx(a + col, a_stride, b + col, b_stride,
                           &part_sse, &part_sum);
         total_sse += part_sse;
         total_sum += part_sum;
     }
     *sse = total_sse;
     // 128 pixels in the block, hence the shift by 7.
     return total_sse - (((unsigned int)total_sum * total_sum) >> 7);
 }
 // 8x16 variance, built from two stacked 8x8 calls (top, then bottom).
 // Stores the summed squared differences in *sse and returns that value
 // minus ((summed differences)^2 >> 7).
 unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
                                   const unsigned char *b, int b_stride,
                                   unsigned int *sse) {
     unsigned int total_sse = 0;
     int total_sum = 0;
     int row;
     for (row = 0; row < 16; row += 8) {
         unsigned int part_sse;
         int part_sum;
         vpx_get8x8var_mmx(a + row * a_stride, a_stride,
                           b + row * b_stride, b_stride,
                           &part_sse, &part_sum);
         total_sse += part_sse;
         total_sum += part_sum;
     }
     *sse = total_sse;
     // 128 pixels in the block, hence the shift by 7.
     return total_sse - (((unsigned int)total_sum * total_sum) >> 7);
 }
 uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
                                       int xoffset, int yoffset,
                                       const uint8_t *b, int b_stride,