Move variance functions to vpx_dsp
Subpel variance functions will be moved in another patch.

Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce
This commit is contained in:
parent 976f7f42c1
commit c3bdffb0a5
File diff suppressed because it is too large
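For orientation: every function touched by this commit computes the same whole-pixel block variance, the sum of squared pixel differences (sse) minus the squared pixel sum divided by the block area. Below is a minimal C reference sketch of that computation; it is not code from the diff, and the helper name block_variance_ref is made up. The final division corresponds to the ">> 8" (16x16), ">> 7" (16x8 / 8x16) and ">> 6" (8x8) shifts seen in the assembly and C code that follow.

#include <stdint.h>

/* Illustrative sketch only (hypothetical helper, not part of this commit):
 * plain-C whole-pixel variance over a w x h block.
 * Returns sse - (sum * sum) / (w * h); when w * h is a power of two this
 * division is the right shift used by the optimized versions below. */
static unsigned int block_variance_ref(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];   /* signed pixel difference */
      sum += diff;                        /* accumulate sum of differences */
      sse64 += (unsigned int)(diff * diff); /* accumulate sum of squares */
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sse64;
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}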
@@ -1,154 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance16x16_armv6|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_armv6| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)

loop
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r5, [r2, #4]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r5, [r2, #8]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r5, [r2, #12]       ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)


    subs    r12, r12, #1

    bne     loop

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

    END
@@ -1,101 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance8x8_armv6|

    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_armv6| PROC

    push    {r4-r10, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #8             ; set loop counter to 8 (=block height)
    mov     r4, #0              ; initialize sum = 0
    mov     r5, #0              ; initialize sse = 0

loop
    ; 1st 4 pixels
    ldr     r6, [r0, #0x0]      ; load 4 src pixels
    ldr     r7, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r8, r6, r7          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; differences of all 4 pixels
    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r6, [r0, #0x4]      ; load 4 src pixels
    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    usub8   r8, r6, r7          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; differences of all 4 pixels

    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
    subs    r12, r12, #1        ; next row
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    bne     loop

    ; return stuff
    ldr     r8, [sp, #32]       ; get address of sse
    mul     r1, r4, r4          ; sum * sum
    str     r5, [r8]            ; store sse
    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))

    pop     {r4-r10, pc}

    ENDP

    END
@ -1,320 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <arm_neon.h>
|
|
||||||
#include "vpx_ports/mem.h"
|
|
||||||
|
|
||||||
unsigned int vp8_variance16x16_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
|
||||||
uint32x2_t d0u32, d10u32;
|
|
||||||
int64x1_t d0s64, d1s64;
|
|
||||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int32x4_t q8s32, q9s32, q10s32;
|
|
||||||
int64x2_t q0s64, q1s64, q5s64;
|
|
||||||
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
q0u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
q1u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
__builtin_prefetch(src_ptr);
|
|
||||||
|
|
||||||
q2u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
q3u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
__builtin_prefetch(ref_ptr);
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
|
||||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
|
||||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
|
||||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
|
|
||||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
|
||||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
|
||||||
|
|
||||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
|
||||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
|
||||||
q0s64 = vpaddlq_s32(q8s32);
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
|
||||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
|
||||||
vreinterpret_s32_s64(d0s64));
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
|
||||||
|
|
||||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
|
|
||||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
|
||||||
|
|
||||||
return vget_lane_u32(d0u32, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp8_variance16x8_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
|
||||||
uint32x2_t d0u32, d10u32;
|
|
||||||
int64x1_t d0s64, d1s64;
|
|
||||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int32x4_t q8s32, q9s32, q10s32;
|
|
||||||
int64x2_t q0s64, q1s64, q5s64;
|
|
||||||
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 4; i++) { // variance16x8_neon_loop
|
|
||||||
q0u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
q1u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
__builtin_prefetch(src_ptr);
|
|
||||||
|
|
||||||
q2u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
q3u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
__builtin_prefetch(ref_ptr);
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
|
||||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
|
||||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
|
||||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
|
|
||||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
|
||||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
|
||||||
|
|
||||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
|
||||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
|
||||||
q0s64 = vpaddlq_s32(q8s32);
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
|
||||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
|
||||||
vreinterpret_s32_s64(d0s64));
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
|
||||||
|
|
||||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
|
||||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
|
||||||
|
|
||||||
return vget_lane_u32(d0u32, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp8_variance8x16_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
uint8x8_t d0u8, d2u8, d4u8, d6u8;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16;
|
|
||||||
uint32x2_t d0u32, d10u32;
|
|
||||||
int64x1_t d0s64, d1s64;
|
|
||||||
uint16x8_t q11u16, q12u16;
|
|
||||||
int32x4_t q8s32, q9s32, q10s32;
|
|
||||||
int64x2_t q0s64, q1s64, q5s64;
|
|
||||||
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 8; i++) { // variance8x16_neon_loop
|
|
||||||
d0u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d2u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
__builtin_prefetch(src_ptr);
|
|
||||||
|
|
||||||
d4u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d6u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
__builtin_prefetch(ref_ptr);
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
|
||||||
q12u16 = vsubl_u8(d2u8, d6u8);
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
|
||||||
q0s64 = vpaddlq_s32(q8s32);
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
|
||||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
|
||||||
vreinterpret_s32_s64(d0s64));
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
|
||||||
|
|
||||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
|
||||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
|
||||||
|
|
||||||
return vget_lane_u32(d0u32, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp8_variance8x8_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
|
||||||
uint32x2_t d0u32, d10u32;
|
|
||||||
int64x1_t d0s64, d1s64;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int32x4_t q8s32, q9s32, q10s32;
|
|
||||||
int64x2_t q0s64, q1s64, q5s64;
|
|
||||||
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 2; i++) { // variance8x8_neon_loop
|
|
||||||
d0u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d1u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d2u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d3u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
|
|
||||||
d4u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d5u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d6u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d7u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
|
||||||
q12u16 = vsubl_u8(d1u8, d5u8);
|
|
||||||
q13u16 = vsubl_u8(d2u8, d6u8);
|
|
||||||
q14u16 = vsubl_u8(d3u8, d7u8);
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
|
|
||||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
|
||||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
|
||||||
|
|
||||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
|
||||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
|
||||||
q0s64 = vpaddlq_s32(q8s32);
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
|
||||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
|
||||||
vreinterpret_s32_s64(d0s64));
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
|
||||||
|
|
||||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
|
|
||||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
|
||||||
|
|
||||||
return vget_lane_u32(d0u32, 0);
|
|
||||||
}
|
|
@@ -9,10 +9,14 @@
 */

#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"

+// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
+#if CONFIG_VP8_ENCODER
+
#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"

@@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6
    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                             8, 8, 8, VFilter);

-    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+    return vpx_variance8x8_media(second_pass, 8, dst_ptr,
                                  dst_pixels_per_line, sse);
}

unsigned int vp8_sub_pixel_variance16x16_armv6
@@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                                 16, 16, 16, VFilter);

-        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+        var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
                                       dst_pixels_per_line, sse);
    }
    return var;
}

-#endif /* HAVE_MEDIA */
+#endif  // HAVE_MEDIA


#if HAVE_NEON
@@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon
    return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}

-#endif
+#endif  // HAVE_NEON
+#endif  // CONFIG_VP8_ENCODER
@@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block

    if (blksize == 16)
    {
-        actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
-        vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
+        vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
        sad = (sse + 128)>>8;
-        vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
+        vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
        usad = (sse + 32)>>6;
-        vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
+        vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
        vsad = (sse + 32)>>6;
#else
        sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block
    }
    else /* if (blksize == 8) */
    {
-        actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
-        vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
+        vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
        sad = (sse + 32)>>6;
-        vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
+        vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
        usad = (sse + 8)>>4;
-        vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
+        vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
        vsad = (sse + 8)>>4;
#else
        sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
@@ -236,31 +236,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;

-#
-# Whole-pixel Variance
-#
-add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance4x4 mmx sse2/;
-$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x8 mmx sse2 media neon/;
-$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
-$vp8_variance8x8_media=vp8_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x16 mmx sse2 neon/;
-$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x8 mmx sse2 neon/;
-$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x16 mmx sse2 media neon/;
-$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
-$vp8_variance16x16_media=vp8_variance16x16_armv6;
-
#
# Sub-pixel Variance
#
@@ -308,12 +283,6 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {

-#
-# Sum of squares (vector)
-#
-add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
-specialize qw/vp8_get_mb_ss mmx sse2/;
-
#
# SSE (Sum Squared Error)
#
@@ -321,14 +290,6 @@ add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_pt
specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;

-add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_mse16x16 mmx sse2 media neon/;
-$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
-$vp8_mse16x16_media=vp8_mse16x16_armv6;
-
-add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
-specialize qw/vp8_get4x4sse_cs mmx neon/;
-
#
# Block copy
#
@@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)(
    const unsigned char *ref_array,
    int ref_stride,
    unsigned int *sad_array);

typedef void (*vpx_sad_multi_d_fn_t)
(
    const unsigned char *src_ptr,
@@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t)
    unsigned int *sad_array
);

-typedef unsigned int (*vp8_variance_fn_t)
+typedef unsigned int (*vpx_variance_fn_t)
(
    const unsigned char *src_ptr,
    int source_stride,
@@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t)
    unsigned int *sse
);

-typedef void (*vp8_ssimpf_fn_t)
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-);
-
-typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp8_get16x16prederror_fn_t)
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int ref_stride
-);
-
typedef struct variance_vtable
{
    vpx_sad_fn_t sdf;
-    vp8_variance_fn_t vf;
+    vpx_variance_fn_t vf;
    vp8_subpixvariance_fn_t svf;
-    vp8_variance_fn_t svf_halfpix_h;
-    vp8_variance_fn_t svf_halfpix_v;
-    vp8_variance_fn_t svf_halfpix_hv;
+    vpx_variance_fn_t svf_halfpix_h;
+    vpx_variance_fn_t svf_halfpix_v;
+    vpx_variance_fn_t svf_halfpix_hv;
    vpx_sad_multi_fn_t sdx3f;
    vpx_sad_multi_fn_t sdx8f;
    vpx_sad_multi_d_fn_t sdx4df;
@@ -8,44 +8,34 @@
 * be found in the AUTHORS file in the root of the source tree.
 */


#include "./vp8_rtcd.h"
#include "filter.h"
#include "variance.h"

-unsigned int vp8_get_mb_ss_c
-(
-    const short *src_ptr
-)
-{
-    unsigned int i = 0, sum = 0;
-
-    do
-    {
-        sum += (src_ptr[i] * src_ptr[i]);
-        i++;
-    }
-    while (i < 256);
-
-    return sum;
+/* This is a bad idea.
+ * ctz = count trailing zeros */
+static int ctz(int a) {
+  int b = 0;
+  while (a != 1) {
+    a >>= 1;
+    b++;
+  }
+  return b;
}

-static void variance(
+static unsigned int variance(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    int w,
    int h,
-    unsigned int *sse,
-    int *sum)
+    unsigned int *sse)
{
    int i, j;
-    int diff;
+    int diff, sum;

-    *sum = 0;
+    sum = 0;
    *sse = 0;

    for (i = 0; i < h; i++)
@@ -53,114 +43,17 @@ static void variance(
        for (j = 0; j < w; j++)
        {
            diff = src_ptr[j] - ref_ptr[j];
-            *sum += diff;
+            sum += diff;
            *sse += diff * diff;
        }

        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

+    return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
}

-
-unsigned int vp8_variance16x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance8x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vp8_variance16x8_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-
-unsigned int vp8_variance8x8_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vp8_variance4x4_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-
-unsigned int vp8_mse16x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return var;
-}
-
-
/****************************************************************************
 *
 *  ROUTINE       : filter_block2d_bil_first_pass
@@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c
    /* Now filter Verticaly */
    var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);

-    return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
}


@@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c
    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);

-    return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
}

unsigned int vp8_sub_pixel_variance16x16_c
@@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c
    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);

-    return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
}


@@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c
    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);

-    return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
}

unsigned int vp8_sub_pixel_variance8x16_c
@@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c
    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);

-    return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
}
@ -11,504 +11,6 @@
|
|||||||
|
|
||||||
%include "vpx_ports/x86_abi_support.asm"
|
%include "vpx_ports/x86_abi_support.asm"
|
||||||
|
|
||||||
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
|
|
||||||
global sym(vp8_get_mb_ss_mmx) PRIVATE
|
|
||||||
sym(vp8_get_mb_ss_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 7
|
|
||||||
GET_GOT rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
sub rsp, 8
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
mov rax, arg(0) ;src_ptr
|
|
||||||
mov rcx, 16
|
|
||||||
pxor mm4, mm4
|
|
||||||
|
|
||||||
.NEXTROW:
|
|
||||||
movq mm0, [rax]
|
|
||||||
movq mm1, [rax+8]
|
|
||||||
movq mm2, [rax+16]
|
|
||||||
movq mm3, [rax+24]
|
|
||||||
pmaddwd mm0, mm0
|
|
||||||
pmaddwd mm1, mm1
|
|
||||||
pmaddwd mm2, mm2
|
|
||||||
pmaddwd mm3, mm3
|
|
||||||
|
|
||||||
paddd mm4, mm0
|
|
||||||
paddd mm4, mm1
|
|
||||||
paddd mm4, mm2
|
|
||||||
paddd mm4, mm3
|
|
||||||
|
|
||||||
add rax, 32
|
|
||||||
dec rcx
|
|
||||||
ja .NEXTROW
|
|
||||||
movq QWORD PTR [rsp], mm4
|
|
||||||
|
|
||||||
;return sum[0]+sum[1];
|
|
||||||
movsxd rax, dword ptr [rsp]
|
|
||||||
movsxd rcx, dword ptr [rsp+4]
|
|
||||||
add rax, rcx
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 8
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
RESTORE_GOT
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int vp8_get8x8var_mmx
|
|
||||||
;(
|
|
||||||
; unsigned char *src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char *ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int *SSE,
|
|
||||||
; int *Sum
|
|
||||||
;)
|
|
||||||
global sym(vp8_get8x8var_mmx) PRIVATE
|
|
||||||
sym(vp8_get8x8var_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push rbx
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
pxor mm5, mm5 ; Blank mmx6
|
|
||||||
pxor mm6, mm6 ; Blank mmx7
|
|
||||||
pxor mm7, mm7 ; Blank mmx7
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
|
||||||
mov rbx, arg(2) ;[ref_ptr]
|
|
||||||
movsxd rcx, dword ptr arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
; Row 1
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
|
|
||||||
; Row 2
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 3
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 4
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 5
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
; movq mm4, [rbx + rdx]
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 6
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 7
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 8
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Now accumulate the final results.
|
|
||||||
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
|
|
||||||
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
|
|
||||||
movsx rdx, WORD PTR [rsp+8]
|
|
||||||
movsx rcx, WORD PTR [rsp+10]
|
|
||||||
movsx rbx, WORD PTR [rsp+12]
|
|
||||||
movsx rax, WORD PTR [rsp+14]
|
|
||||||
add rdx, rcx
|
|
||||||
add rbx, rax
|
|
||||||
add rdx, rbx ;XSum
|
|
||||||
movsxd rax, DWORD PTR [rsp]
|
|
||||||
movsxd rcx, DWORD PTR [rsp+4]
|
|
||||||
add rax, rcx ;XXSum
|
|
||||||
mov rsi, arg(4) ;SSE
|
|
||||||
mov rdi, arg(5) ;Sum
|
|
||||||
mov dword ptr [rsi], eax
|
|
||||||
mov dword ptr [rdi], edx
|
|
||||||
xor rax, rax ; return 0
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rbx
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int
|
|
||||||
;vp8_get4x4var_mmx
|
|
||||||
;(
|
|
||||||
; unsigned char *src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char *ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int *SSE,
|
|
||||||
; int *Sum
|
|
||||||
;)
|
|
||||||
global sym(vp8_get4x4var_mmx) PRIVATE
|
|
||||||
sym(vp8_get4x4var_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push rbx
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
pxor mm5, mm5 ; Blank mmx6
|
|
||||||
pxor mm6, mm6 ; Blank mmx7
|
|
||||||
pxor mm7, mm7 ; Blank mmx7
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
|
||||||
mov rbx, arg(2) ;[ref_ptr]
|
|
||||||
movsxd rcx, dword ptr arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
; Row 1
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
|
|
||||||
; Row 2
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 3
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 4
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
|
|
||||||
; Now accumulate the final results.
|
|
||||||
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
|
|
||||||
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
|
|
||||||
movsx rdx, WORD PTR [rsp+8]
|
|
||||||
movsx rcx, WORD PTR [rsp+10]
|
|
||||||
movsx rbx, WORD PTR [rsp+12]
|
|
||||||
movsx rax, WORD PTR [rsp+14]
|
|
||||||
add rdx, rcx
|
|
||||||
add rbx, rax
|
|
||||||
add rdx, rbx ;XSum
|
|
||||||
movsxd rax, DWORD PTR [rsp]
|
|
||||||
movsxd rcx, DWORD PTR [rsp+4]
|
|
||||||
add rax, rcx ;XXSum
|
|
||||||
mov rsi, arg(4) ;SSE
|
|
||||||
mov rdi, arg(5) ;Sum
|
|
||||||
mov dword ptr [rsi], eax
|
|
||||||
mov dword ptr [rdi], edx
|
|
||||||
xor rax, rax ; return 0
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rbx
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
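
For reference, the MMX routine above walks the 4x4 block one row at a time and accumulates two values: the plain sum of the src/ref differences (written to *Sum) and the sum of their squares (written to *SSE), then returns 0. A minimal plain-C sketch of the same computation; the helper name get4x4var_ref is illustrative, not part of this change:

/* Plain-C sketch of what the 4x4 MMX variance helper computes.
 * The name get4x4var_ref is illustrative only. */
static int get4x4var_ref(const unsigned char *src_ptr, int source_stride,
                         const unsigned char *ref_ptr, int recon_stride,
                         unsigned int *SSE, int *Sum)
{
    unsigned int sse = 0;
    int sum = 0;
    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            const int diff = src_ptr[c] - ref_ptr[c];
            sum += diff;           /* running sum of differences   */
            sse += diff * diff;    /* running sum of squared diffs */
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *SSE = sse;
    *Sum = sum;
    return 0;
}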
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int
|
|
||||||
;vp8_get4x4sse_cs_mmx
|
|
||||||
;(
|
|
||||||
; unsigned char *src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char *ref_ptr,
|
|
||||||
; int recon_stride
|
|
||||||
;)
|
|
||||||
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
|
|
||||||
sym(vp8_get4x4sse_cs_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 4
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push rbx
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
pxor mm6, mm6 ; Blank mm6
|
|
||||||
pxor mm7, mm7 ; Blank mm7
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
|
||||||
mov rbx, arg(2) ;[ref_ptr]
|
|
||||||
movsxd rcx, dword ptr arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
|
||||||
; Row 1
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 2
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 3
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 4
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
movq mm0, mm7 ;
|
|
||||||
psrlq mm7, 32
|
|
||||||
|
|
||||||
paddd mm0, mm7
|
|
||||||
movq rax, mm0
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
pop rbx
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
%define mmx_filter_shift 7

;void vp8_filter_block2d_bil4x4_var_mmx
@ -13,393 +13,6 @@
|
|||||||
|
|
||||||
%define xmm_filter_shift 7
|
%define xmm_filter_shift 7
|
||||||
|
|
||||||
;unsigned int vp8_get_mb_ss_sse2
|
|
||||||
;(
|
|
||||||
; short *src_ptr
|
|
||||||
;)
|
|
||||||
global sym(vp8_get_mb_ss_sse2) PRIVATE
|
|
||||||
sym(vp8_get_mb_ss_sse2):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 1
|
|
||||||
GET_GOT rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr]
|
|
||||||
mov rcx, 8
|
|
||||||
pxor xmm4, xmm4
|
|
||||||
|
|
||||||
.NEXTROW:
|
|
||||||
movdqa xmm0, [rax]
|
|
||||||
movdqa xmm1, [rax+16]
|
|
||||||
movdqa xmm2, [rax+32]
|
|
||||||
movdqa xmm3, [rax+48]
|
|
||||||
pmaddwd xmm0, xmm0
|
|
||||||
pmaddwd xmm1, xmm1
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
pmaddwd xmm3, xmm3
|
|
||||||
|
|
||||||
paddd xmm0, xmm1
|
|
||||||
paddd xmm2, xmm3
|
|
||||||
paddd xmm4, xmm0
|
|
||||||
paddd xmm4, xmm2
|
|
||||||
|
|
||||||
add rax, 0x40
|
|
||||||
dec rcx
|
|
||||||
ja .NEXTROW
|
|
||||||
|
|
||||||
movdqa xmm3,xmm4
|
|
||||||
psrldq xmm4,8
|
|
||||||
paddd xmm4,xmm3
|
|
||||||
movdqa xmm3,xmm4
|
|
||||||
psrldq xmm4,4
|
|
||||||
paddd xmm4,xmm3
|
|
||||||
movq rax,xmm4
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
RESTORE_GOT
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
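
The loop above runs eight iterations of 64 bytes over the 16-bit source, squaring with pmaddwd and accumulating, i.e. it produces the sum of squares of the 256 short values in a 16x16 block. A plain-C sketch of that quantity; the name mb_ss_ref is illustrative only:

/* Plain-C sketch of the macroblock sum-of-squares computed above.
 * The name mb_ss_ref is illustrative only. */
static unsigned int mb_ss_ref(const short *src_ptr)
{
    unsigned int i, sum = 0;

    for (i = 0; i < 256; i++)          /* 16x16 block of 16-bit values */
    {
        sum += src_ptr[i] * src_ptr[i];
    }

    return sum;
}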
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int vp8_get16x16var_sse2
|
|
||||||
;(
|
|
||||||
; unsigned char * src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char * ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int * SSE,
|
|
||||||
; int * Sum
|
|
||||||
;)
|
|
||||||
global sym(vp8_get16x16var_sse2) PRIVATE
|
|
||||||
sym(vp8_get16x16var_sse2):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
SAVE_XMM 7
|
|
||||||
push rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
mov rsi, arg(0) ;[src_ptr]
|
|
||||||
mov rdi, arg(2) ;[ref_ptr]
|
|
||||||
|
|
||||||
movsxd rax, DWORD PTR arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
; Prefetch data
|
|
||||||
lea rcx, [rax+rax*2]
|
|
||||||
prefetcht0 [rsi]
|
|
||||||
prefetcht0 [rsi+rax]
|
|
||||||
prefetcht0 [rsi+rax*2]
|
|
||||||
prefetcht0 [rsi+rcx]
|
|
||||||
lea rbx, [rsi+rax*4]
|
|
||||||
prefetcht0 [rbx]
|
|
||||||
prefetcht0 [rbx+rax]
|
|
||||||
prefetcht0 [rbx+rax*2]
|
|
||||||
prefetcht0 [rbx+rcx]
|
|
||||||
|
|
||||||
lea rcx, [rdx+rdx*2]
|
|
||||||
prefetcht0 [rdi]
|
|
||||||
prefetcht0 [rdi+rdx]
|
|
||||||
prefetcht0 [rdi+rdx*2]
|
|
||||||
prefetcht0 [rdi+rcx]
|
|
||||||
lea rbx, [rdi+rdx*4]
|
|
||||||
prefetcht0 [rbx]
|
|
||||||
prefetcht0 [rbx+rdx]
|
|
||||||
prefetcht0 [rbx+rdx*2]
|
|
||||||
prefetcht0 [rbx+rcx]
|
|
||||||
|
|
||||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
|
||||||
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
|
|
||||||
|
|
||||||
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
|
|
||||||
mov rcx, 16
|
|
||||||
|
|
||||||
.var16loop:
|
|
||||||
movdqu xmm1, XMMWORD PTR [rsi]
|
|
||||||
movdqu xmm2, XMMWORD PTR [rdi]
|
|
||||||
|
|
||||||
prefetcht0 [rsi+rax*8]
|
|
||||||
prefetcht0 [rdi+rdx*8]
|
|
||||||
|
|
||||||
movdqa xmm3, xmm1
|
|
||||||
movdqa xmm4, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
punpcklbw xmm1, xmm0
|
|
||||||
punpckhbw xmm3, xmm0
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpckhbw xmm4, xmm0
|
|
||||||
|
|
||||||
|
|
||||||
psubw xmm1, xmm2
|
|
||||||
psubw xmm3, xmm4
|
|
||||||
|
|
||||||
paddw xmm7, xmm1
|
|
||||||
pmaddwd xmm1, xmm1
|
|
||||||
|
|
||||||
paddw xmm7, xmm3
|
|
||||||
pmaddwd xmm3, xmm3
|
|
||||||
|
|
||||||
paddd xmm6, xmm1
|
|
||||||
paddd xmm6, xmm3
|
|
||||||
|
|
||||||
add rsi, rax
|
|
||||||
add rdi, rdx
|
|
||||||
|
|
||||||
sub rcx, 1
|
|
||||||
jnz .var16loop
|
|
||||||
|
|
||||||
|
|
||||||
movdqa xmm1, xmm6
|
|
||||||
pxor xmm6, xmm6
|
|
||||||
|
|
||||||
pxor xmm5, xmm5
|
|
||||||
punpcklwd xmm6, xmm7
|
|
||||||
|
|
||||||
punpckhwd xmm5, xmm7
|
|
||||||
psrad xmm5, 16
|
|
||||||
|
|
||||||
psrad xmm6, 16
|
|
||||||
paddd xmm6, xmm5
|
|
||||||
|
|
||||||
movdqa xmm2, xmm1
|
|
||||||
punpckldq xmm1, xmm0
|
|
||||||
|
|
||||||
punpckhdq xmm2, xmm0
|
|
||||||
movdqa xmm7, xmm6
|
|
||||||
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
punpckldq xmm6, xmm0
|
|
||||||
|
|
||||||
punpckhdq xmm7, xmm0
|
|
||||||
paddd xmm6, xmm7
|
|
||||||
|
|
||||||
movdqa xmm2, xmm1
|
|
||||||
movdqa xmm7, xmm6
|
|
||||||
|
|
||||||
psrldq xmm1, 8
|
|
||||||
psrldq xmm6, 8
|
|
||||||
|
|
||||||
paddd xmm7, xmm6
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
mov rax, arg(5) ;[Sum]
|
|
||||||
mov rdi, arg(4) ;[SSE]
|
|
||||||
|
|
||||||
movd DWORD PTR [rax], xmm7
|
|
||||||
movd DWORD PTR [rdi], xmm1
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
pop rbx
|
|
||||||
RESTORE_XMM
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
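
The tail of the routine folds the eight signed 16-bit difference sums held in xmm7 and the four 32-bit squared-difference sums held in xmm6 down to single scalars before storing them through Sum and SSE. A scalar sketch of the net effect of that reduction; the lane arrays are illustrative stand-ins for the register contents:

/* Scalar equivalent of the final cross-lane reduction above.  lanes_sum[]
 * and lanes_sse[] are illustrative stand-ins for the eight 16-bit lanes of
 * xmm7 and the four 32-bit lanes of xmm6. */
static void reduce_var16x16_lanes(const short lanes_sum[8],
                                  const unsigned int lanes_sse[4],
                                  int *Sum, unsigned int *SSE)
{
    int i, sum = 0;
    unsigned int sse = 0;

    for (i = 0; i < 8; i++)
        sum += lanes_sum[i];   /* sign-extend each 16-bit partial sum and add */
    for (i = 0; i < 4; i++)
        sse += lanes_sse[i];   /* add the four partial squared-difference sums */

    *Sum = sum;
    *SSE = sse;
}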
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int vp8_get8x8var_sse2
|
|
||||||
;(
|
|
||||||
; unsigned char * src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char * ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int * SSE,
|
|
||||||
; int * Sum
|
|
||||||
;)
|
|
||||||
global sym(vp8_get8x8var_sse2) PRIVATE
|
|
||||||
sym(vp8_get8x8var_sse2):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
SAVE_XMM 7
|
|
||||||
GET_GOT rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
mov rsi, arg(0) ;[src_ptr]
|
|
||||||
mov rdi, arg(2) ;[ref_ptr]
|
|
||||||
|
|
||||||
movsxd rax, DWORD PTR arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
|
||||||
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
|
|
||||||
|
|
||||||
movq xmm1, QWORD PTR [rsi]
|
|
||||||
movq xmm2, QWORD PTR [rdi]
|
|
||||||
|
|
||||||
punpcklbw xmm1, xmm0
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
|
|
||||||
psubsw xmm1, xmm2
|
|
||||||
paddw xmm7, xmm1
|
|
||||||
|
|
||||||
pmaddwd xmm1, xmm1
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax * 2]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx * 2]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
lea rsi, [rsi + rax * 2]
|
|
||||||
lea rdi, [rdi + rdx * 2]
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax *2]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx *2]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
lea rsi, [rsi + rax * 2]
|
|
||||||
lea rdi, [rdi + rdx * 2]
|
|
||||||
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax *2]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx *2]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
lea rsi, [rsi + rax * 2]
|
|
||||||
lea rdi, [rdi + rdx * 2]
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
movdqa xmm6, xmm7
|
|
||||||
punpcklwd xmm6, xmm0
|
|
||||||
|
|
||||||
punpckhwd xmm7, xmm0
|
|
||||||
movdqa xmm2, xmm1
|
|
||||||
|
|
||||||
paddw xmm6, xmm7
|
|
||||||
punpckldq xmm1, xmm0
|
|
||||||
|
|
||||||
punpckhdq xmm2, xmm0
|
|
||||||
movdqa xmm7, xmm6
|
|
||||||
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
punpckldq xmm6, xmm0
|
|
||||||
|
|
||||||
punpckhdq xmm7, xmm0
|
|
||||||
paddw xmm6, xmm7
|
|
||||||
|
|
||||||
movdqa xmm2, xmm1
|
|
||||||
movdqa xmm7, xmm6
|
|
||||||
|
|
||||||
psrldq xmm1, 8
|
|
||||||
psrldq xmm6, 8
|
|
||||||
|
|
||||||
paddw xmm7, xmm6
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
mov rax, arg(5) ;[Sum]
|
|
||||||
mov rdi, arg(4) ;[SSE]
|
|
||||||
|
|
||||||
movq rdx, xmm7
|
|
||||||
movsx rcx, dx
|
|
||||||
|
|
||||||
mov dword ptr [rax], ecx
|
|
||||||
movd DWORD PTR [rdi], xmm1
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
RESTORE_GOT
|
|
||||||
RESTORE_XMM
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
;void vp8_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx
|
|||||||
short *filter
|
short *filter
|
||||||
);
|
);
|
||||||
|
|
||||||
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
|
|
||||||
extern unsigned int vp8_get8x8var_mmx
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
extern unsigned int vp8_get4x4var_mmx
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
extern void vp8_filter_block2d_bil4x4_var_mmx
|
extern void vp8_filter_block2d_bil4x4_var_mmx
|
||||||
(
|
(
|
||||||
const unsigned char *ref_ptr,
|
const unsigned char *ref_ptr,
|
||||||
@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx
|
|||||||
unsigned int *sumsquared
|
unsigned int *sumsquared
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
unsigned int vp8_variance4x4_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vp8_variance8x8_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vp8_mse16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    *sse = var;
    return var;
}

unsigned int vp8_variance16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vp8_variance16x8_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vp8_variance8x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}
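
All of the wrappers above finish with the same identity, variance = SSE - Sum*Sum / N, where N is the pixel count of the block, hence the >>4 (16 px), >>6 (64 px), >>7 (128 px) and >>8 (256 px) shifts. A generic form of that last step; the helper name is illustrative only:

/* Illustrative only: the final step shared by the wrappers above.
 * log2_pixels is 4 for 4x4, 6 for 8x8, 7 for 8x16/16x8 and 8 for 16x16. */
static unsigned int var_from_sse_sum(unsigned int sse, int sum, int log2_pixels)
{
    return (sse - (((unsigned int)sum * sum) >> log2_pixels));
}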
|
|
||||||
|
|
||||||
|
|
||||||
unsigned int vp8_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,
||||||
|
@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
|
|||||||
unsigned int *sumsquared
|
unsigned int *sumsquared
|
||||||
);
|
);
|
||||||
|
|
||||||
extern unsigned int vp8_get4x4var_mmx
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
|
|
||||||
unsigned int vp8_get_mb_ss_sse2
|
|
||||||
(
|
|
||||||
const short *src_ptr
|
|
||||||
);
|
|
||||||
unsigned int vp8_get16x16var_sse2
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
unsigned int vp8_get8x8var_sse2
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
void vp8_filter_block2d_bil_var_sse2
|
void vp8_filter_block2d_bil_var_sse2
|
||||||
(
|
(
|
||||||
const unsigned char *ref_ptr,
|
const unsigned char *ref_ptr,
|
||||||
@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2
|
|||||||
unsigned int *sumsquared
|
unsigned int *sumsquared
|
||||||
);
|
);
|
||||||
|
|
||||||
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vp8_variance8x8_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vp8_variance16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;
    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
}

unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;
    return sse0;
}

unsigned int vp8_variance16x8_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vp8_variance8x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}
|
|
||||||
|
|
||||||
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
|
||||||
|
@ -13,15 +13,6 @@
|
|||||||
#include "vp8/common/variance.h"
|
#include "vp8/common/variance.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
extern unsigned int vp8_get16x16var_sse2
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
extern void vp8_half_horiz_vert_variance16x_h_sse2
|
extern void vp8_half_horiz_vert_variance16x_h_sse2
|
||||||
(
|
(
|
||||||
const unsigned char *ref_ptr,
|
const unsigned char *ref_ptr,
|
||||||
|
@ -1,138 +0,0 @@
|
|||||||
;
|
|
||||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
|
||||||
;
|
|
||||||
; Use of this source code is governed by a BSD-style license
|
|
||||||
; that can be found in the LICENSE file in the root of the source
|
|
||||||
; tree. An additional intellectual property rights grant can be found
|
|
||||||
; in the file PATENTS. All contributing project authors may
|
|
||||||
; be found in the AUTHORS file in the root of the source tree.
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
EXPORT |vp8_mse16x16_armv6|
|
|
||||||
|
|
||||||
ARM
|
|
||||||
|
|
||||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
|
||||||
|
|
||||||
; r0 unsigned char *src_ptr
|
|
||||||
; r1 int source_stride
|
|
||||||
; r2 unsigned char *ref_ptr
|
|
||||||
; r3 int recon_stride
|
|
||||||
; stack unsigned int *sse
|
|
||||||
;
|
|
||||||
;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
|
|
||||||
; So, we can remove this part of calculation.
|
|
||||||
|
|
||||||
|vp8_mse16x16_armv6| PROC
|
|
||||||
|
|
||||||
push {r4-r9, lr}
|
|
||||||
|
|
||||||
pld [r0, r1, lsl #0]
|
|
||||||
pld [r2, r3, lsl #0]
|
|
||||||
|
|
||||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
|
||||||
mov r4, #0 ; initialize sse = 0
|
|
||||||
|
|
||||||
loop
|
|
||||||
; 1st 4 pixels
|
|
||||||
ldr r5, [r0, #0x0] ; load 4 src pixels
|
|
||||||
ldr r6, [r2, #0x0] ; load 4 ref pixels
|
|
||||||
|
|
||||||
mov lr, #0 ; constant zero
|
|
||||||
|
|
||||||
usub8 r8, r5, r6 ; calculate difference
|
|
||||||
pld [r0, r1, lsl #1]
|
|
||||||
sel r7, r8, lr ; select bytes with positive difference
|
|
||||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
|
||||||
pld [r2, r3, lsl #1]
|
|
||||||
sel r8, r9, lr ; select bytes with negative difference
|
|
||||||
|
|
||||||
; calculate partial sums
|
|
||||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
|
||||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
|
||||||
orr r8, r8, r7 ; differences of all 4 pixels
|
|
||||||
|
|
||||||
ldr r5, [r0, #0x4] ; load 4 src pixels
|
|
||||||
|
|
||||||
; calculate sse
|
|
||||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
|
||||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
|
||||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
|
||||||
|
|
||||||
; 2nd 4 pixels
|
|
||||||
ldr r6, [r2, #0x4] ; load 4 ref pixels
|
|
||||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
|
||||||
|
|
||||||
usub8 r8, r5, r6 ; calculate difference
|
|
||||||
sel r7, r8, lr ; select bytes with positive difference
|
|
||||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
|
||||||
sel r8, r9, lr ; select bytes with negative difference
|
|
||||||
|
|
||||||
; calculate partial sums
|
|
||||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
|
||||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
|
||||||
orr r8, r8, r7 ; differences of all 4 pixels
|
|
||||||
ldr r5, [r0, #0x8] ; load 4 src pixels
|
|
||||||
; calculate sse
|
|
||||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
|
||||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
|
||||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
|
||||||
|
|
||||||
; 3rd 4 pixels
|
|
||||||
ldr r6, [r2, #0x8] ; load 4 ref pixels
|
|
||||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
|
||||||
|
|
||||||
usub8 r8, r5, r6 ; calculate difference
|
|
||||||
sel r7, r8, lr ; select bytes with positive difference
|
|
||||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
|
||||||
sel r8, r9, lr ; select bytes with negative difference
|
|
||||||
|
|
||||||
; calculate partial sums
|
|
||||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
|
||||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
|
||||||
orr r8, r8, r7 ; differences of all 4 pixels
|
|
||||||
|
|
||||||
ldr r5, [r0, #0xc] ; load 4 src pixels
|
|
||||||
|
|
||||||
; calculate sse
|
|
||||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
|
||||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
|
||||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
|
||||||
|
|
||||||
; 4th 4 pixels
|
|
||||||
ldr r6, [r2, #0xc] ; load 4 ref pixels
|
|
||||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
|
||||||
|
|
||||||
usub8 r8, r5, r6 ; calculate difference
|
|
||||||
add r0, r0, r1 ; set src_ptr to next row
|
|
||||||
sel r7, r8, lr ; select bytes with positive difference
|
|
||||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
|
||||||
add r2, r2, r3 ; set ref_ptr to next row
|
|
||||||
sel r8, r9, lr ; select bytes with negative difference
|
|
||||||
|
|
||||||
; calculate partial sums
|
|
||||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
|
||||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
|
||||||
orr r8, r8, r7 ; differences of all 4 pixels
|
|
||||||
|
|
||||||
subs r12, r12, #1 ; next row
|
|
||||||
|
|
||||||
; calculate sse
|
|
||||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
|
||||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
|
||||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
|
||||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
|
||||||
|
|
||||||
bne loop
|
|
||||||
|
|
||||||
; return stuff
|
|
||||||
ldr r1, [sp, #28] ; get address of sse
|
|
||||||
mov r0, r4 ; return sse
|
|
||||||
str r4, [r1] ; store sse
|
|
||||||
|
|
||||||
pop {r4-r9, pc}
|
|
||||||
|
|
||||||
ENDP
|
|
||||||
|
|
||||||
END
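
As the note at the top of this file points out, MSE is the variance accumulation with the sum term dropped, so the routine only keeps the running sum of squared differences, stores it through *sse and returns it. A plain-C sketch of the quantity computed; the helper name mse16x16_ref is illustrative only:

/* Plain-C sketch of the 16x16 MSE accumulation performed above.
 * The name mse16x16_ref is illustrative only. */
static unsigned int mse16x16_ref(const unsigned char *src_ptr, int source_stride,
                                 const unsigned char *ref_ptr, int recon_stride,
                                 unsigned int *sse)
{
    unsigned int total = 0;
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            const int diff = src_ptr[c] - ref_ptr[c];
            total += diff * diff;   /* only the squared differences are kept */
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = total;
    return total;
}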
|
|
@ -1,131 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <arm_neon.h>
|
|
||||||
|
|
||||||
unsigned int vp8_mse16x16_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
|
||||||
int64x1_t d0s64;
|
|
||||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
|
||||||
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int64x2_t q1s64;
|
|
||||||
|
|
||||||
q7s32 = vdupq_n_s32(0);
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
|
|
||||||
q0u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
q1u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
q2u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
q3u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
|
||||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
|
||||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
|
||||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
|
|
||||||
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
|
|
||||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
|
||||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
|
||||||
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
|
|
||||||
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
|
|
||||||
|
|
||||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
|
||||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q7s32 = vaddq_s32(q7s32, q8s32);
|
|
||||||
q9s32 = vaddq_s32(q9s32, q10s32);
|
|
||||||
q10s32 = vaddq_s32(q7s32, q9s32);
|
|
||||||
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
|
|
||||||
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp8_get4x4sse_cs_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride) {
|
|
||||||
int16x4_t d22s16, d24s16, d26s16, d28s16;
|
|
||||||
int64x1_t d0s64;
|
|
||||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
|
||||||
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int64x2_t q1s64;
|
|
||||||
|
|
||||||
d0u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d4u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d1u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d5u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d2u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d6u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d3u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d7u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
|
||||||
q12u16 = vsubl_u8(d1u8, d5u8);
|
|
||||||
q13u16 = vsubl_u8(d2u8, d6u8);
|
|
||||||
q14u16 = vsubl_u8(d3u8, d7u8);
|
|
||||||
|
|
||||||
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
|
|
||||||
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
|
|
||||||
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
|
|
||||||
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
|
|
||||||
|
|
||||||
q7s32 = vmull_s16(d22s16, d22s16);
|
|
||||||
q8s32 = vmull_s16(d24s16, d24s16);
|
|
||||||
q9s32 = vmull_s16(d26s16, d26s16);
|
|
||||||
q10s32 = vmull_s16(d28s16, d28s16);
|
|
||||||
|
|
||||||
q7s32 = vaddq_s32(q7s32, q8s32);
|
|
||||||
q9s32 = vaddq_s32(q9s32, q10s32);
|
|
||||||
q9s32 = vaddq_s32(q7s32, q9s32);
|
|
||||||
|
|
||||||
q1s64 = vpaddlq_s32(q9s32);
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
|
||||||
}
|
|
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include "vpx_config.h"
|
#include "vpx_config.h"
|
||||||
#include "vp8_rtcd.h"
|
#include "vp8_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "encodemb.h"
|
#include "encodemb.h"
|
||||||
#include "encodemv.h"
|
#include "encodemv.h"
|
||||||
#include "vp8/common/common.h"
|
#include "vp8/common/common.h"
|
||||||
@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
|
|||||||
* lambda using a non-linear combination (e.g., the smallest, or second
|
* lambda using a non-linear combination (e.g., the smallest, or second
|
||||||
* smallest, etc.).
|
* smallest, etc.).
|
||||||
*/
|
*/
|
||||||
act = vp8_variance16x16(x->src.y_buffer,
|
act = vpx_variance16x16(x->src.y_buffer,
|
||||||
x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
|
x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
|
||||||
act = act<<4;
|
act = act<<4;
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include "vpx_config.h"
|
#include "vpx_config.h"
|
||||||
#include "vp8_rtcd.h"
|
#include "vp8_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "quantize.h"
|
#include "quantize.h"
|
||||||
#include "vp8/common/reconintra4x4.h"
|
#include "vp8/common/reconintra4x4.h"
|
||||||
#include "encodemb.h"
|
#include "encodemb.h"
|
||||||
@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
intra_pred_var = vp8_get_mb_ss(x->src_diff);
|
intra_pred_var = vpx_get_mb_ss(x->src_diff);
|
||||||
|
|
||||||
return intra_pred_var;
|
return intra_pred_var;
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "./vpx_scale_rtcd.h"
|
#include "./vpx_scale_rtcd.h"
|
||||||
#include "block.h"
|
#include "block.h"
|
||||||
#include "onyx_int.h"
|
#include "onyx_int.h"
|
||||||
@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
|
|||||||
/* Set up pointers for this macro block raw buffer */
|
/* Set up pointers for this macro block raw buffer */
|
||||||
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
|
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
|
||||||
+ d->offset);
|
+ d->offset);
|
||||||
vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
|
vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
|
||||||
(unsigned int *)(raw_motion_err));
|
(unsigned int *)(raw_motion_err));
|
||||||
|
|
||||||
/* Set up pointers for this macro block recon buffer */
|
/* Set up pointers for this macro block recon buffer */
|
||||||
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
|
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
|
||||||
ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
|
ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
|
||||||
vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
|
vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
|
||||||
(unsigned int *)(best_motion_err));
|
(unsigned int *)(best_motion_err));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
|
static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
|
||||||
@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
|
|||||||
int new_mv_mode_penalty = 256;
|
int new_mv_mode_penalty = 256;
|
||||||
|
|
||||||
/* override the default variance function to use MSE */
|
/* override the default variance function to use MSE */
|
||||||
v_fn_ptr.vf = vp8_mse16x16;
|
v_fn_ptr.vf = vpx_mse16x16;
|
||||||
|
|
||||||
/* Set up pointers for this macro block recon buffer */
|
/* Set up pointers for this macro block recon buffer */
|
||||||
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
|
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
|
||||||
|
@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
|
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
|
||||||
cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
|
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
|
||||||
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
|
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
|
||||||
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
|
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
|
||||||
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
|
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
|
||||||
@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
|
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
|
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
|
||||||
cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
|
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
|
||||||
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
|
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
|
||||||
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
|
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
|
||||||
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
|
||||||
@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
|
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
|
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
|
||||||
cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
|
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
|
||||||
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
|
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
|
||||||
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
|
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
|
||||||
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
|
||||||
@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
|
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
|
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
|
||||||
cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
|
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
|
||||||
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
|
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
|
||||||
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
|
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
|
||||||
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
|
||||||
@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
|
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
|
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
|
||||||
cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
|
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
|
||||||
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
|
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
|
||||||
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
|
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
|
||||||
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
|
||||||
@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
|
|||||||
{
|
{
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
|
|
||||||
vp8_mse16x16(orig + col, orig_stride,
|
vpx_mse16x16(orig + col, orig_stride,
|
||||||
recon + col, recon_stride,
|
recon + col, recon_stride,
|
||||||
&sse);
|
&sse);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
|
|||||||
int index = block_index_row + (j >> 4);
|
int index = block_index_row + (j >> 4);
|
||||||
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
|
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
Total += vp8_mse16x16(src + j,
|
Total += vpx_mse16x16(src + j,
|
||||||
source->y_stride,
|
source->y_stride,
|
||||||
dst + j, dest->y_stride,
|
dst + j, dest->y_stride,
|
||||||
&sse);
|
&sse);
|
||||||
@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
|
|||||||
int index = block_index_row + (j >> 4);
|
int index = block_index_row + (j >> 4);
|
||||||
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
|
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
const unsigned int var = vp8_variance16x16(src + j,
|
const unsigned int var = vpx_variance16x16(src + j,
|
||||||
ystride,
|
ystride,
|
||||||
dst + j,
|
dst + j,
|
||||||
ystride,
|
ystride,
|
||||||
@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
|
|||||||
// is small (to avoid effects from lighting change).
|
// is small (to avoid effects from lighting change).
|
||||||
if ((sse - var) < 128) {
|
if ((sse - var) < 128) {
|
||||||
unsigned int sse2;
|
unsigned int sse2;
|
||||||
const unsigned int act = vp8_variance16x16(src + j,
|
const unsigned int act = vpx_variance16x16(src + j,
|
||||||
ystride,
|
ystride,
|
||||||
const_source,
|
const_source,
|
||||||
0,
|
0,
|
||||||
@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
|
|||||||
for (j = 0; j < source->y_width; j += 16)
|
for (j = 0; j < source->y_width; j += 16)
|
||||||
{
|
{
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
|
Total += vpx_mse16x16(src + j, source->y_stride,
|
||||||
|
dst + j, dest->y_stride, &sse);
|
||||||
}
|
}
|
||||||
|
|
||||||
src += 16 * source->y_stride;
|
src += 16 * source->y_stride;
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include "vpx_config.h"
|
#include "vpx_config.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "onyx_int.h"
|
#include "onyx_int.h"
|
||||||
#include "modecosts.h"
|
#include "modecosts.h"
|
||||||
#include "encodeintra.h"
|
#include "encodeintra.h"
|
||||||
@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
unsigned int vp8_get4x4sse_cs_c
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride
|
|
||||||
)
|
|
||||||
{
|
|
||||||
int distortion = 0;
|
|
||||||
int r, c;
|
|
||||||
|
|
||||||
for (r = 0; r < 4; r++)
|
|
||||||
{
|
|
||||||
for (c = 0; c < 4; c++)
|
|
||||||
{
|
|
||||||
int diff = src_ptr[c] - ref_ptr[c];
|
|
||||||
distortion += diff * diff;
|
|
||||||
}
|
|
||||||
|
|
||||||
src_ptr += source_stride;
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
}
|
|
||||||
|
|
||||||
return distortion;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int get_prediction_error(BLOCK *be, BLOCKD *b)
|
static int get_prediction_error(BLOCK *be, BLOCKD *b)
|
||||||
{
|
{
|
||||||
unsigned char *sptr;
|
unsigned char *sptr;
|
||||||
@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b)
|
|||||||
sptr = (*(be->base_src) + be->src);
|
sptr = (*(be->base_src) + be->src);
|
||||||
dptr = b->predictor;
|
dptr = b->predictor;
|
||||||
|
|
||||||
return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
|
return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
rate2 += rate;
|
rate2 += rate;
|
||||||
distortion2 = vp8_variance16x16(
|
distortion2 = vpx_variance16x16(
|
||||||
*(b->base_src), b->src_stride,
|
*(b->base_src), b->src_stride,
|
||||||
x->e_mbd.predictor, 16, &sse);
|
x->e_mbd.predictor, 16, &sse);
|
||||||
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
|
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
|
||||||
@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
|
|||||||
xd->dst.y_stride,
|
xd->dst.y_stride,
|
||||||
xd->predictor,
|
xd->predictor,
|
||||||
16);
|
16);
|
||||||
distortion2 = vp8_variance16x16
|
distortion2 = vpx_variance16x16
|
||||||
(*(b->base_src), b->src_stride,
|
(*(b->base_src), b->src_stride,
|
||||||
x->e_mbd.predictor, 16, &sse);
|
x->e_mbd.predictor, 16, &sse);
|
||||||
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
|
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
|
||||||
@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
|
|||||||
xd->dst.y_stride,
|
xd->dst.y_stride,
|
||||||
xd->predictor,
|
xd->predictor,
|
||||||
16);
|
16);
|
||||||
distortion = vp8_variance16x16
|
distortion = vpx_variance16x16
|
||||||
(*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
|
(*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
|
||||||
rate = x->mbmode_cost[xd->frame_type][mode];
|
rate = x->mbmode_cost[xd->frame_type][mode];
|
||||||
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
|
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "./vpx_scale_rtcd.h"
|
#include "./vpx_scale_rtcd.h"
|
||||||
#include "vp8/common/onyxc_int.h"
|
#include "vp8/common/onyxc_int.h"
|
||||||
#include "onyx_int.h"
|
#include "onyx_int.h"
|
||||||
@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
|
|||||||
for (j = 0; j < source->y_width; j += 16)
|
for (j = 0; j < source->y_width; j += 16)
|
||||||
{
|
{
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
Total += vp8_mse16x16(src + j, source->y_stride,
|
Total += vpx_mse16x16(src + j, source->y_stride,
|
||||||
dst + j, dest->y_stride,
|
dst + j, dest->y_stride,
|
||||||
&sse);
|
&sse);
|
||||||
}
|
}
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include "vpx_config.h"
|
#include "vpx_config.h"
|
||||||
#include "vp8_rtcd.h"
|
#include "vp8_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "tokenize.h"
|
#include "tokenize.h"
|
||||||
#include "treewriter.h"
|
#include "treewriter.h"
|
||||||
#include "onyx_int.h"
|
#include "onyx_int.h"
|
||||||
@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
vp8_variance8x8(uptr, pre_stride,
|
vpx_variance8x8(uptr, pre_stride,
|
||||||
upred_ptr, uv_stride, &sse2);
|
upred_ptr, uv_stride, &sse2);
|
||||||
vp8_variance8x8(vptr, pre_stride,
|
vpx_variance8x8(vptr, pre_stride,
|
||||||
vpred_ptr, uv_stride, &sse1);
|
vpred_ptr, uv_stride, &sse1);
|
||||||
sse2 += sse1;
|
sse2 += sse1;
|
||||||
}
|
}
|
||||||
@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4],
|
|||||||
if(threshold < x->encode_breakout)
|
if(threshold < x->encode_breakout)
|
||||||
threshold = x->encode_breakout;
|
threshold = x->encode_breakout;
|
||||||
|
|
||||||
var = vp8_variance16x16
|
var = vpx_variance16x16
|
||||||
(*(b->base_src), b->src_stride,
|
(*(b->base_src), b->src_stride,
|
||||||
x->e_mbd.predictor, 16, &sse);
|
x->e_mbd.predictor, 16, &sse);
|
||||||
|
|
||||||
|
@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
|
|||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
|
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
|
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
|
||||||
@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
|
|||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
|
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
|
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
|
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
|
|
||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
|
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
|
||||||
|
|
||||||
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
|
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
|
||||||
|
@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
|
|||||||
#File list for media
|
#File list for media
|
||||||
# encoder
|
# encoder
|
||||||
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
|
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
|
||||||
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
|
|
||||||
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
|
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
|
||||||
|
|
||||||
#File list for neon
|
#File list for neon
|
||||||
@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
|
|||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
|
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
|
||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
|
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
|
||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
|
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
|
||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
|
|
||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
|
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
|
||||||
|
@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
   get_thr(bs, qdiff, &sad_thr, &vdiff_thr);

   if (bs == BLOCK_16X16) {
-    vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
     sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
   } else if (bs == BLOCK_32X32) {
-    vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
     sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
   } else /* if (bs == BLOCK_64X64) */ {
-    vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
     sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
   }

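Note: the renamed calls above keep the same per-pixel normalization, rounding the block metric and dividing by the pixel count (256, 1024 or 4096). A minimal sketch of that pattern for the 16x16 case, using only the vpx_variance16x16/vpx_sad16x16 signatures visible in this diff; the helper name is hypothetical:

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* Per-pixel difference metrics for a 16x16 block, mirroring mfqe_block():
 * (x + 128) >> 8 is round(x / 256), 256 being the number of pixels. */
static void mfqe_metrics_16x16(const uint8_t *y, int y_stride,
                               const uint8_t *yd, int yd_stride,
                               int *vdiff, int *sad) {
  unsigned int sse;
  *vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
  *sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
}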
@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {


 # variance
-add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4/, "$sse2_x86inc";
-
 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";

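These prototypes now live in vpx_dsp, so encoder code calls the vpx_-prefixed entry points declared by ./vpx_dsp_rtcd.h (the include added elsewhere in this commit). A minimal usage sketch based only on the signature shown above; the wrapper name and buffers are placeholders:

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* Variance of a 16x16 source block against a reference block; the raw
 * sum of squared differences is also returned through *sse. */
static unsigned int block_var_16x16(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  return vpx_variance16x16(src, src_stride, ref, ref_stride, &sse);
}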
@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";

-add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
-
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
 specialize qw/vp9_avg_8x8 sse2 neon/;

@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/;

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

-# variance
-add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance8x4/;
-
-add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance4x8/;
-
-add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance4x4/;
-
-add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
-
-add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance8x4/;
-
-add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance4x8/;
-
-add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance4x4/;
-
-add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
-
-add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance8x4/;
-
-add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance4x8/;
-
-add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance4x4/;
-
-add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
-
-add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
-
 add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";

@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;

-add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse8x16/;
-
-add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse16x8/;
-
-add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse8x16/;
-
-add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse16x8/;
-
-add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse8x16/;
-
-add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse16x8/;
-
-add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
-
 # ENCODEMB INVOKE

@ -10,6 +10,7 @@

 #include <arm_neon.h>
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"

 #include "vpx_ports/mem.h"
@ -20,82 +21,6 @@

 #include "vp9/encoder/vp9_variance.h"

-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride,
-                             const uint8_t *b, int b_stride,
-                             int w, int h, uint32_t *sse, int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo = vmlal_s16(v_sse_lo,
-                           vget_low_s16(sv_diff),
-                           vget_low_s16(sv_diff));
-      v_sse_hi = vmlal_s16(v_sse_hi,
-                           vget_high_s16(sv_diff),
-                           vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
-                        const uint8_t *ref_ptr, int ref_stride,
-                        unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
-                   8, sse, sum);
-}
-
-unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / 8 * 8
-}
-
-void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
-                   16, sse, sum);
-}
-
-unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / 16 * 16
-}
-
 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
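The NEON kernel removed above accumulates the pixel-difference sum and sum of squares in a single pass and then applies variance = sse - sum^2 / (w * h); the ">> 6" and ">> 8" shifts are that division for 8x8 and 16x16 blocks. A plain-C reference sketch of the same computation; the function name is illustrative, not part of the library:

#include <stdint.h>

/* Reference (non-SIMD) variance of a w x h block: one pass accumulating
 * sum and sum-of-squares of the differences, then var = sse - sum^2/(w*h). */
static unsigned int variance_ref(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride,
                                 int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse_acc = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      sse_acc += (uint64_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (unsigned int)sse_acc;
  return (unsigned int)(sse_acc - (uint64_t)((sum * sum) / (w * h)));
}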
@ -162,7 +87,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
                             BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
                             8, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
 }

 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
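Each sub-pixel variance in this file follows the same two-pass pattern: a horizontal 2-tap bilinear filter into a scratch buffer, a vertical pass, then the full-pel variance of the filtered block against the reference. A rough scalar sketch of that flow for the 8x8 case; the real code looks the filter pair up via BILINEAR_FILTERS_2TAP, while here the pair (128 - off, off) is derived directly from a hypothetical 7-bit offset, and src is assumed to have at least 9x9 readable pixels:

#include <stdint.h>

static unsigned int subpel_variance8x8_sketch(const uint8_t *src, int src_stride,
                                              int xoff, int yoff,
                                              const uint8_t *ref, int ref_stride,
                                              unsigned int *sse) {
  uint8_t hbuf[9 * 8];  /* horizontal pass keeps one extra row for pass two */
  uint8_t vbuf[8 * 8];
  int64_t sum = 0;
  uint64_t sq = 0;
  int r, c;

  for (r = 0; r < 9; ++r)        /* pass 1: horizontal bilinear filter */
    for (c = 0; c < 8; ++c)
      hbuf[r * 8 + c] = (uint8_t)((src[r * src_stride + c] * (128 - xoff) +
                                   src[r * src_stride + c + 1] * xoff + 64) >> 7);
  for (r = 0; r < 8; ++r)        /* pass 2: vertical bilinear filter */
    for (c = 0; c < 8; ++c)
      vbuf[r * 8 + c] = (uint8_t)((hbuf[r * 8 + c] * (128 - yoff) +
                                   hbuf[(r + 1) * 8 + c] * yoff + 64) >> 7);
  for (r = 0; r < 8; ++r)        /* pass 3: full-pel variance vs. reference */
    for (c = 0; c < 8; ++c) {
      const int d = vbuf[r * 8 + c] - ref[r * ref_stride + c];
      sum += d;
      sq += (uint64_t)(d * d);
    }
  *sse = (unsigned int)sq;
  return (unsigned int)(sq - (uint64_t)((sum * sum) >> 6));  /* / 64 pixels */
}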
@ -180,77 +105,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
                              16, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
-                   32, sse, sum);
-}
-
-unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
-}
-
-unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
-  variance_neon_w8(a + (32 * a_stride), a_stride,
-                   b + (32 * b_stride), b_stride, 32, 32,
-                   &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride,
-                   b + (16 * b_stride), b_stride, 64, 16,
-                   &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride,
-                   b + (16 * b_stride), b_stride, 64, 16,
-                   &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
-                   b + (16 * 2 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
-                   b + (16 * 3 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
 }

 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
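The removed large-block kernels work around the int16 accumulator limit noted earlier (w * h must stay below 2048): a 64x64 block is processed as four 64x16 strips whose partial sse/sum results are combined before the final subtraction. A compact sketch of that strip accumulation, with variance_strip() as an illustrative stand-in for the per-strip NEON kernel:

#include <stdint.h>

/* Per-strip kernel: accumulates sum and sse over a w x h region. */
static void variance_strip(const uint8_t *a, int a_stride,
                           const uint8_t *b, int b_stride,
                           int w, int h, uint32_t *sse, int *sum) {
  int r, c;
  *sse = 0;
  *sum = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = a[c] - b[c];
      *sum += d;
      *sse += (uint32_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
}

/* 64x64 variance built from four 64x16 strips, mirroring the removed
 * vp9_variance64x64_neon(): combine partials, then var = sse - sum^2/4096. */
static unsigned int variance64x64_sketch(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         unsigned int *sse) {
  uint32_t sse_total = 0, sse_part;
  int sum_total = 0, sum_part;
  int strip;
  for (strip = 0; strip < 4; ++strip) {
    variance_strip(a + 16 * strip * a_stride, a_stride,
                   b + 16 * strip * b_stride, b_stride, 64, 16,
                   &sse_part, &sum_part);
    sse_total += sse_part;
    sum_total += sum_part;
  }
  *sse = sse_total;
  return (unsigned int)(*sse - (((int64_t)sum_total * sum_total) >> 12));
}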
@ -268,7 +123,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
                              32, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
 }

 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
@ -286,5 +141,5 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
                              64, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
 }
@ -98,9 +98,9 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
     int avg;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+      highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
                       CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
                       &sse, &avg);
       sse >>= 2 * (xd->bd - 8);
       avg >>= (xd->bd - 8);
     } else {
@ -13,6 +13,7 @@
 #include <stdio.h>

 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"

 #include "vpx_ports/mem.h"
@ -3672,15 +3673,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
       if (cm->use_highbitdepth) {
         switch (cm->bit_depth) {
           case VPX_BITS_8:
-            vp9_highbd_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
                                    &var16->sse, &var16->sum);
             break;
           case VPX_BITS_10:
-            vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
                                       &var16->sse, &var16->sum);
             break;
           case VPX_BITS_12:
-            vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
                                       &var16->sse, &var16->sum);
             break;
           default:
@ -3689,11 +3690,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
             return -1;
         }
       } else {
-        vp9_get16x16var(src, src_stride, last_src, last_stride,
+        vpx_get16x16var(src, src_stride, last_src, last_stride,
                         &var16->sse, &var16->sum);
       }
 #else
-      vp9_get16x16var(src, src_stride, last_src, last_stride,
+      vpx_get16x16var(src, src_stride, last_src, last_stride,
                       &var16->sse, &var16->sum);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       var16->var = var16->sse -
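The trailing context line derives the block's variance from the two accumulators returned by get16x16var. A one-line sketch of that relationship for a 16x16 block, with an illustrative helper name:

/* var = E[d^2] - E[d]^2, scaled by the pixel count (256 for 16x16). */
static unsigned int var_from_sse_sum_16x16(unsigned int sse, int sum) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> 8);  /* >> 8 == / 256 */
}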
@ -998,7 +998,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits8,
                    vpx_highbd_sad32x16_avg_bits8,
-                   vp9_highbd_variance32x16,
+                   vpx_highbd_8_variance32x16,
                    vp9_highbd_sub_pixel_variance32x16,
                    vp9_highbd_sub_pixel_avg_variance32x16,
                    NULL,
@ -1008,7 +1008,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits8,
                    vpx_highbd_sad16x32_avg_bits8,
-                   vp9_highbd_variance16x32,
+                   vpx_highbd_8_variance16x32,
                    vp9_highbd_sub_pixel_variance16x32,
                    vp9_highbd_sub_pixel_avg_variance16x32,
                    NULL,
@ -1018,7 +1018,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits8,
                    vpx_highbd_sad64x32_avg_bits8,
-                   vp9_highbd_variance64x32,
+                   vpx_highbd_8_variance64x32,
                    vp9_highbd_sub_pixel_variance64x32,
                    vp9_highbd_sub_pixel_avg_variance64x32,
                    NULL,
@ -1028,7 +1028,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits8,
                    vpx_highbd_sad32x64_avg_bits8,
-                   vp9_highbd_variance32x64,
+                   vpx_highbd_8_variance32x64,
                    vp9_highbd_sub_pixel_variance32x64,
                    vp9_highbd_sub_pixel_avg_variance32x64,
                    NULL,
@ -1038,7 +1038,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits8,
                    vpx_highbd_sad32x32_avg_bits8,
-                   vp9_highbd_variance32x32,
+                   vpx_highbd_8_variance32x32,
                    vp9_highbd_sub_pixel_variance32x32,
                    vp9_highbd_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits8,
@ -1048,7 +1048,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits8,
                    vpx_highbd_sad64x64_avg_bits8,
-                   vp9_highbd_variance64x64,
+                   vpx_highbd_8_variance64x64,
                    vp9_highbd_sub_pixel_variance64x64,
                    vp9_highbd_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits8,
@ -1058,7 +1058,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits8,
                    vpx_highbd_sad16x16_avg_bits8,
-                   vp9_highbd_variance16x16,
+                   vpx_highbd_8_variance16x16,
                    vp9_highbd_sub_pixel_variance16x16,
                    vp9_highbd_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits8,
@ -1068,7 +1068,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits8,
                    vpx_highbd_sad16x8_avg_bits8,
-                   vp9_highbd_variance16x8,
+                   vpx_highbd_8_variance16x8,
                    vp9_highbd_sub_pixel_variance16x8,
                    vp9_highbd_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits8,
@ -1078,7 +1078,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits8,
                    vpx_highbd_sad8x16_avg_bits8,
-                   vp9_highbd_variance8x16,
+                   vpx_highbd_8_variance8x16,
                    vp9_highbd_sub_pixel_variance8x16,
                    vp9_highbd_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits8,
@ -1088,7 +1088,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits8,
                    vpx_highbd_sad8x8_avg_bits8,
-                   vp9_highbd_variance8x8,
+                   vpx_highbd_8_variance8x8,
                    vp9_highbd_sub_pixel_variance8x8,
                    vp9_highbd_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits8,
@ -1098,7 +1098,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits8,
                    vpx_highbd_sad8x4_avg_bits8,
-                   vp9_highbd_variance8x4,
+                   vpx_highbd_8_variance8x4,
                    vp9_highbd_sub_pixel_variance8x4,
                    vp9_highbd_sub_pixel_avg_variance8x4,
                    NULL,
@ -1108,7 +1108,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits8,
                    vpx_highbd_sad4x8_avg_bits8,
-                   vp9_highbd_variance4x8,
+                   vpx_highbd_8_variance4x8,
                    vp9_highbd_sub_pixel_variance4x8,
                    vp9_highbd_sub_pixel_avg_variance4x8,
                    NULL,
@ -1118,7 +1118,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits8,
                    vpx_highbd_sad4x4_avg_bits8,
-                   vp9_highbd_variance4x4,
+                   vpx_highbd_8_variance4x4,
                    vp9_highbd_sub_pixel_variance4x4,
                    vp9_highbd_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits8,
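Each HIGHBD_BFP entry populates the per-block-size function table with bit-depth-specific pointers; only the plain variance slot is repointed at the vpx_dsp implementation here. As a rough illustration of the dispatch-table pattern only, the struct and macro below are simplified stand-ins and not the encoder's actual definitions:

#include <stdint.h>

typedef unsigned int (*sad_fn_t)(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride);
typedef unsigned int (*var_fn_t)(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 unsigned int *sse);

typedef struct {
  sad_fn_t sdf;  /* SAD */
  var_fn_t vf;   /* variance: the slot switched to vpx_dsp in this commit */
} block_fn_ptrs;

/* One table entry per block size; BT would be an index such as BLOCK_32X16. */
#define SET_BFP(tbl, BT, SDF, VF) \
  do {                            \
    (tbl)[BT].sdf = (SDF);        \
    (tbl)[BT].vf = (VF);          \
  } while (0)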
@ -1130,7 +1130,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits10,
                    vpx_highbd_sad32x16_avg_bits10,
-                   vp9_highbd_10_variance32x16,
+                   vpx_highbd_10_variance32x16,
                    vp9_highbd_10_sub_pixel_variance32x16,
                    vp9_highbd_10_sub_pixel_avg_variance32x16,
                    NULL,
@ -1140,7 +1140,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits10,
                    vpx_highbd_sad16x32_avg_bits10,
-                   vp9_highbd_10_variance16x32,
+                   vpx_highbd_10_variance16x32,
                    vp9_highbd_10_sub_pixel_variance16x32,
                    vp9_highbd_10_sub_pixel_avg_variance16x32,
                    NULL,
@ -1150,7 +1150,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits10,
                    vpx_highbd_sad64x32_avg_bits10,
-                   vp9_highbd_10_variance64x32,
+                   vpx_highbd_10_variance64x32,
                    vp9_highbd_10_sub_pixel_variance64x32,
                    vp9_highbd_10_sub_pixel_avg_variance64x32,
                    NULL,
@ -1160,7 +1160,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits10,
                    vpx_highbd_sad32x64_avg_bits10,
-                   vp9_highbd_10_variance32x64,
+                   vpx_highbd_10_variance32x64,
                    vp9_highbd_10_sub_pixel_variance32x64,
                    vp9_highbd_10_sub_pixel_avg_variance32x64,
                    NULL,
@ -1170,7 +1170,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits10,
                    vpx_highbd_sad32x32_avg_bits10,
-                   vp9_highbd_10_variance32x32,
+                   vpx_highbd_10_variance32x32,
                    vp9_highbd_10_sub_pixel_variance32x32,
                    vp9_highbd_10_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits10,
@ -1180,7 +1180,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits10,
                    vpx_highbd_sad64x64_avg_bits10,
-                   vp9_highbd_10_variance64x64,
+                   vpx_highbd_10_variance64x64,
                    vp9_highbd_10_sub_pixel_variance64x64,
                    vp9_highbd_10_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits10,
@ -1190,7 +1190,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits10,
                    vpx_highbd_sad16x16_avg_bits10,
-                   vp9_highbd_10_variance16x16,
+                   vpx_highbd_10_variance16x16,
                    vp9_highbd_10_sub_pixel_variance16x16,
                    vp9_highbd_10_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits10,
@ -1200,7 +1200,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits10,
                    vpx_highbd_sad16x8_avg_bits10,
-                   vp9_highbd_10_variance16x8,
+                   vpx_highbd_10_variance16x8,
                    vp9_highbd_10_sub_pixel_variance16x8,
                    vp9_highbd_10_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits10,
@ -1210,7 +1210,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits10,
                    vpx_highbd_sad8x16_avg_bits10,
-                   vp9_highbd_10_variance8x16,
+                   vpx_highbd_10_variance8x16,
                    vp9_highbd_10_sub_pixel_variance8x16,
                    vp9_highbd_10_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits10,
@ -1220,7 +1220,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits10,
                    vpx_highbd_sad8x8_avg_bits10,
-                   vp9_highbd_10_variance8x8,
+                   vpx_highbd_10_variance8x8,
                    vp9_highbd_10_sub_pixel_variance8x8,
                    vp9_highbd_10_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits10,
@ -1230,7 +1230,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits10,
                    vpx_highbd_sad8x4_avg_bits10,
-                   vp9_highbd_10_variance8x4,
+                   vpx_highbd_10_variance8x4,
                    vp9_highbd_10_sub_pixel_variance8x4,
                    vp9_highbd_10_sub_pixel_avg_variance8x4,
                    NULL,
@ -1240,7 +1240,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits10,
                    vpx_highbd_sad4x8_avg_bits10,
-                   vp9_highbd_10_variance4x8,
+                   vpx_highbd_10_variance4x8,
                    vp9_highbd_10_sub_pixel_variance4x8,
                    vp9_highbd_10_sub_pixel_avg_variance4x8,
                    NULL,
@ -1250,7 +1250,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits10,
                    vpx_highbd_sad4x4_avg_bits10,
-                   vp9_highbd_10_variance4x4,
+                   vpx_highbd_10_variance4x4,
                    vp9_highbd_10_sub_pixel_variance4x4,
                    vp9_highbd_10_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits10,
@ -1262,7 +1262,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits12,
                    vpx_highbd_sad32x16_avg_bits12,
-                   vp9_highbd_12_variance32x16,
+                   vpx_highbd_12_variance32x16,
                    vp9_highbd_12_sub_pixel_variance32x16,
                    vp9_highbd_12_sub_pixel_avg_variance32x16,
                    NULL,
@ -1272,7 +1272,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits12,
                    vpx_highbd_sad16x32_avg_bits12,
-                   vp9_highbd_12_variance16x32,
+                   vpx_highbd_12_variance16x32,
                    vp9_highbd_12_sub_pixel_variance16x32,
                    vp9_highbd_12_sub_pixel_avg_variance16x32,
                    NULL,
@ -1282,7 +1282,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits12,
                    vpx_highbd_sad64x32_avg_bits12,
-                   vp9_highbd_12_variance64x32,
+                   vpx_highbd_12_variance64x32,
                    vp9_highbd_12_sub_pixel_variance64x32,
                    vp9_highbd_12_sub_pixel_avg_variance64x32,
                    NULL,
@ -1292,7 +1292,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits12,
                    vpx_highbd_sad32x64_avg_bits12,
-                   vp9_highbd_12_variance32x64,
+                   vpx_highbd_12_variance32x64,
                    vp9_highbd_12_sub_pixel_variance32x64,
                    vp9_highbd_12_sub_pixel_avg_variance32x64,
                    NULL,
@ -1302,7 +1302,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits12,
                    vpx_highbd_sad32x32_avg_bits12,
-                   vp9_highbd_12_variance32x32,
+                   vpx_highbd_12_variance32x32,
                    vp9_highbd_12_sub_pixel_variance32x32,
                    vp9_highbd_12_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits12,
@ -1312,7 +1312,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits12,
                    vpx_highbd_sad64x64_avg_bits12,
-                   vp9_highbd_12_variance64x64,
+                   vpx_highbd_12_variance64x64,
                    vp9_highbd_12_sub_pixel_variance64x64,
                    vp9_highbd_12_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits12,
@ -1322,7 +1322,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits12,
                    vpx_highbd_sad16x16_avg_bits12,
-                   vp9_highbd_12_variance16x16,
+                   vpx_highbd_12_variance16x16,
                    vp9_highbd_12_sub_pixel_variance16x16,
                    vp9_highbd_12_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits12,
@ -1332,7 +1332,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits12,
                    vpx_highbd_sad16x8_avg_bits12,
-                   vp9_highbd_12_variance16x8,
+                   vpx_highbd_12_variance16x8,
                    vp9_highbd_12_sub_pixel_variance16x8,
                    vp9_highbd_12_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits12,
@ -1342,7 +1342,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits12,
                    vpx_highbd_sad8x16_avg_bits12,
-                   vp9_highbd_12_variance8x16,
+                   vpx_highbd_12_variance8x16,
                    vp9_highbd_12_sub_pixel_variance8x16,
                    vp9_highbd_12_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits12,
@ -1352,7 +1352,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits12,
                    vpx_highbd_sad8x8_avg_bits12,
-                   vp9_highbd_12_variance8x8,
+                   vpx_highbd_12_variance8x8,
                    vp9_highbd_12_sub_pixel_variance8x8,
                    vp9_highbd_12_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits12,
@ -1362,7 +1362,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits12,
                    vpx_highbd_sad8x4_avg_bits12,
-                   vp9_highbd_12_variance8x4,
+                   vpx_highbd_12_variance8x4,
                    vp9_highbd_12_sub_pixel_variance8x4,
                    vp9_highbd_12_sub_pixel_avg_variance8x4,
                    NULL,
@ -1372,7 +1372,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits12,
                    vpx_highbd_sad4x8_avg_bits12,
-                   vp9_highbd_12_variance4x8,
+                   vpx_highbd_12_variance4x8,
                    vp9_highbd_12_sub_pixel_variance4x8,
                    vp9_highbd_12_sub_pixel_avg_variance4x8,
                    NULL,
@ -1382,7 +1382,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits12,
                    vpx_highbd_sad4x4_avg_bits12,
-                   vp9_highbd_12_variance4x4,
+                   vpx_highbd_12_variance4x4,
                    vp9_highbd_12_sub_pixel_variance4x4,
                    vp9_highbd_12_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits12,
@ -1805,61 +1805,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
|
|||||||
cpi->fn_ptr[BT].sdx4df = SDX4DF;
|
cpi->fn_ptr[BT].sdx4df = SDX4DF;
|
||||||
|
|
||||||
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
|
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
|
||||||
vp9_variance32x16, vp9_sub_pixel_variance32x16,
|
vpx_variance32x16, vp9_sub_pixel_variance32x16,
|
||||||
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
|
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
|
||||||
|
|
||||||
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
|
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
|
||||||
vp9_variance16x32, vp9_sub_pixel_variance16x32,
|
vpx_variance16x32, vp9_sub_pixel_variance16x32,
|
||||||
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
|
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
|
||||||
|
|
||||||
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
|
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
|
||||||
vp9_variance64x32, vp9_sub_pixel_variance64x32,
|
vpx_variance64x32, vp9_sub_pixel_variance64x32,
|
||||||
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
|
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
|
||||||
|
|
||||||
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
|
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
|
||||||
vp9_variance32x64, vp9_sub_pixel_variance32x64,
|
vpx_variance32x64, vp9_sub_pixel_variance32x64,
|
||||||
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
|
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
|
||||||
|
|
||||||
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
|
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
|
||||||
vp9_variance32x32, vp9_sub_pixel_variance32x32,
|
vpx_variance32x32, vp9_sub_pixel_variance32x32,
|
||||||
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
|
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
|
||||||
vpx_sad32x32x4d)
|
vpx_sad32x32x4d)
|
||||||
|
|
||||||
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
|
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
|
||||||
vp9_variance64x64, vp9_sub_pixel_variance64x64,
|
vpx_variance64x64, vp9_sub_pixel_variance64x64,
|
||||||
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
|
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
|
||||||
vpx_sad64x64x4d)
|
vpx_sad64x64x4d)
|
||||||
|
|
||||||
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
|
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
|
||||||
vp9_variance16x16, vp9_sub_pixel_variance16x16,
|
vpx_variance16x16, vp9_sub_pixel_variance16x16,
|
||||||
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
|
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
|
||||||
vpx_sad16x16x4d)
|
vpx_sad16x16x4d)
|
||||||
|
|
||||||
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
|
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
|
||||||
vp9_variance16x8, vp9_sub_pixel_variance16x8,
|
vpx_variance16x8, vp9_sub_pixel_variance16x8,
|
||||||
vp9_sub_pixel_avg_variance16x8,
|
vp9_sub_pixel_avg_variance16x8,
|
||||||
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
|
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
|
||||||
|
|
||||||
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
|
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
|
||||||
vp9_variance8x16, vp9_sub_pixel_variance8x16,
|
vpx_variance8x16, vp9_sub_pixel_variance8x16,
|
||||||
vp9_sub_pixel_avg_variance8x16,
|
vp9_sub_pixel_avg_variance8x16,
|
||||||
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
|
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
|
||||||
|
|
||||||
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
|
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
|
||||||
vp9_variance8x8, vp9_sub_pixel_variance8x8,
|
vpx_variance8x8, vp9_sub_pixel_variance8x8,
|
||||||
vp9_sub_pixel_avg_variance8x8,
|
vp9_sub_pixel_avg_variance8x8,
|
||||||
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
|
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
|
||||||
|
|
||||||
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
|
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
|
||||||
vp9_variance8x4, vp9_sub_pixel_variance8x4,
|
vpx_variance8x4, vp9_sub_pixel_variance8x4,
|
||||||
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
|
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
|
||||||
|
|
||||||
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
|
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
|
||||||
vp9_variance4x8, vp9_sub_pixel_variance4x8,
|
vpx_variance4x8, vp9_sub_pixel_variance4x8,
|
||||||
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
|
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
|
||||||
|
|
||||||
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
|
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
|
||||||
vp9_variance4x4, vp9_sub_pixel_variance4x4,
|
vpx_variance4x4, vp9_sub_pixel_variance4x4,
|
||||||
vp9_sub_pixel_avg_variance4x4,
|
vp9_sub_pixel_avg_variance4x4,
|
||||||
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
|
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
|
||||||
|
|
||||||
@ -2079,7 +2079,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
|
|||||||
const uint8_t *pa = a;
|
const uint8_t *pa = a;
|
||||||
const uint8_t *pb = b;
|
const uint8_t *pb = b;
|
||||||
for (x = 0; x < width / 16; ++x) {
|
for (x = 0; x < width / 16; ++x) {
|
||||||
vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
|
|
||||||
pa += 16;
|
pa += 16;
|
||||||
@ -2124,21 +2124,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
|
|||||||
unsigned int sse = 0;
|
unsigned int sse = 0;
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
if (dw > 0) {
|
if (dw > 0) {
|
||||||
highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
|
highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
|
||||||
dw, height, &sse, &sum);
|
dw, height, &sse, &sum);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
}
|
}
|
||||||
if (dh > 0) {
|
if (dh > 0) {
|
||||||
highbd_variance(&a[(height - dh) * a_stride], a_stride,
|
highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
|
||||||
&b[(height - dh) * b_stride], b_stride,
|
&b[(height - dh) * b_stride], b_stride,
|
||||||
width - dw, dh, &sse, &sum);
|
width - dw, dh, &sse, &sum);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
}
|
}
|
||||||
for (y = 0; y < height / 16; ++y) {
|
for (y = 0; y < height / 16; ++y) {
|
||||||
const uint8_t *pa = a;
|
const uint8_t *pa = a;
|
||||||
const uint8_t *pb = b;
|
const uint8_t *pb = b;
|
||||||
for (x = 0; x < width / 16; ++x) {
|
for (x = 0; x < width / 16; ++x) {
|
||||||
vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
pa += 16;
|
pa += 16;
|
||||||
pb += 16;
|
pb += 16;
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "./vpx_scale_rtcd.h"
|
#include "./vpx_scale_rtcd.h"
|
||||||
|
|
||||||
#include "vpx_mem/vpx_mem.h"
|
#include "vpx_mem/vpx_mem.h"
|
||||||
@ -267,13 +268,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
|
|||||||
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
|
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
|
||||||
switch (bsize) {
|
switch (bsize) {
|
||||||
case BLOCK_8X8:
|
case BLOCK_8X8:
|
||||||
return vp9_mse8x8;
|
return vpx_mse8x8;
|
||||||
case BLOCK_16X8:
|
case BLOCK_16X8:
|
||||||
return vp9_mse16x8;
|
return vpx_mse16x8;
|
||||||
case BLOCK_8X16:
|
case BLOCK_8X16:
|
||||||
return vp9_mse8x16;
|
return vpx_mse8x16;
|
||||||
default:
|
default:
|
||||||
return vp9_mse16x16;
|
return vpx_mse16x16;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -293,37 +294,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
|
|||||||
default:
|
default:
|
||||||
switch (bsize) {
|
switch (bsize) {
|
||||||
case BLOCK_8X8:
|
case BLOCK_8X8:
|
||||||
return vp9_highbd_mse8x8;
|
return vpx_highbd_8_mse8x8;
|
||||||
case BLOCK_16X8:
|
case BLOCK_16X8:
|
||||||
return vp9_highbd_mse16x8;
|
return vpx_highbd_8_mse16x8;
|
||||||
case BLOCK_8X16:
|
case BLOCK_8X16:
|
||||||
return vp9_highbd_mse8x16;
|
return vpx_highbd_8_mse8x16;
|
||||||
default:
|
default:
|
||||||
return vp9_highbd_mse16x16;
|
return vpx_highbd_8_mse16x16;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 10:
|
case 10:
|
||||||
switch (bsize) {
|
switch (bsize) {
|
||||||
case BLOCK_8X8:
|
case BLOCK_8X8:
|
||||||
return vp9_highbd_10_mse8x8;
|
return vpx_highbd_10_mse8x8;
|
||||||
case BLOCK_16X8:
|
case BLOCK_16X8:
|
||||||
return vp9_highbd_10_mse16x8;
|
return vpx_highbd_10_mse16x8;
|
||||||
case BLOCK_8X16:
|
case BLOCK_8X16:
|
||||||
return vp9_highbd_10_mse8x16;
|
return vpx_highbd_10_mse8x16;
|
||||||
default:
|
default:
|
||||||
return vp9_highbd_10_mse16x16;
|
return vpx_highbd_10_mse16x16;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 12:
|
case 12:
|
||||||
switch (bsize) {
|
switch (bsize) {
|
||||||
case BLOCK_8X8:
|
case BLOCK_8X8:
|
||||||
return vp9_highbd_12_mse8x8;
|
return vpx_highbd_12_mse8x8;
|
||||||
case BLOCK_16X8:
|
case BLOCK_16X8:
|
||||||
return vp9_highbd_12_mse16x8;
|
return vpx_highbd_12_mse16x8;
|
||||||
case BLOCK_8X16:
|
case BLOCK_8X16:
|
||||||
return vp9_highbd_12_mse8x16;
|
return vpx_highbd_12_mse8x16;
|
||||||
default:
|
default:
|
||||||
return vp9_highbd_12_mse16x16;
|
return vpx_highbd_12_mse16x16;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -634,7 +635,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
|
|||||||
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
|
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
|
||||||
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
|
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
|
||||||
vp9_encode_intra_block_plane(x, bsize, 0);
|
vp9_encode_intra_block_plane(x, bsize, 0);
|
||||||
this_error = vp9_get_mb_ss(x->plane[0].src_diff);
|
this_error = vpx_get_mb_ss(x->plane[0].src_diff);
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
if (cm->use_highbitdepth) {
|
if (cm->use_highbitdepth) {
|
||||||
switch (cm->bit_depth) {
|
switch (cm->bit_depth) {
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "./vpx_config.h"
|
#include "./vpx_config.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
#include "vpx_mem/vpx_mem.h"
|
#include "vpx_mem/vpx_mem.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
@ -303,13 +304,13 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
|
|||||||
if (second_pred != NULL) {
|
if (second_pred != NULL) {
|
||||||
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
||||||
DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
|
DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
|
||||||
vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
|
vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
|
||||||
y_stride);
|
y_stride);
|
||||||
besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
|
besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
|
||||||
sse1);
|
sse1);
|
||||||
} else {
|
} else {
|
||||||
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
|
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
|
||||||
vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
|
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
|
||||||
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
|
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -321,7 +322,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
|
|||||||
(void) xd;
|
(void) xd;
|
||||||
if (second_pred != NULL) {
|
if (second_pred != NULL) {
|
||||||
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
|
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
|
||||||
vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
|
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
|
||||||
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
|
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
|
||||||
} else {
|
} else {
|
||||||
besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
|
besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
|
||||||
|
@ -14,6 +14,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "./vp9_rtcd.h"
|
#include "./vp9_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
#include "vpx_mem/vpx_mem.h"
|
#include "vpx_mem/vpx_mem.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
@ -215,7 +216,7 @@ static void block_variance(const uint8_t *src, int src_stride,
|
|||||||
|
|
||||||
for (i = 0; i < h; i += block_size) {
|
for (i = 0; i < h; i += block_size) {
|
||||||
for (j = 0; j < w; j += block_size) {
|
for (j = 0; j < w; j += block_size) {
|
||||||
vp9_get8x8var(src + src_stride * i + j, src_stride,
|
vpx_get8x8var(src + src_stride * i + j, src_stride,
|
||||||
ref + ref_stride * i + j, ref_stride,
|
ref + ref_stride * i + j, ref_stride,
|
||||||
&sse8x8[k], &sum8x8[k]);
|
&sse8x8[k], &sum8x8[k]);
|
||||||
*sse += sse8x8[k];
|
*sse += sse8x8[k];
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "./vp9_rtcd.h"
|
#include "./vp9_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
#include "vpx/vpx_integer.h"
|
#include "vpx/vpx_integer.h"
|
||||||
@ -18,26 +19,6 @@
|
|||||||
|
|
||||||
#include "vp9/encoder/vp9_variance.h"
|
#include "vp9/encoder/vp9_variance.h"
|
||||||
|
|
||||||
void variance(const uint8_t *a, int a_stride,
|
|
||||||
const uint8_t *b, int b_stride,
|
|
||||||
int w, int h, unsigned int *sse, int *sum) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
*sum = 0;
|
|
||||||
*sse = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i++) {
|
|
||||||
for (j = 0; j < w; j++) {
|
|
||||||
const int diff = a[j] - b[j];
|
|
||||||
*sum += diff;
|
|
||||||
*sse += diff * diff;
|
|
||||||
}
|
|
||||||
|
|
||||||
a += a_stride;
|
|
||||||
b += b_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
|
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
|
||||||
// or vertical direction to produce the filtered output block. Used to implement
|
// or vertical direction to produce the filtered output block. Used to implement
|
||||||
// first-pass of 2-D separable filter.
|
// first-pass of 2-D separable filter.
|
||||||
@ -100,25 +81,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
|
|
||||||
unsigned int i, sum = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < 256; ++i) {
|
|
||||||
sum += src_ptr[i] * src_ptr[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define VAR(W, H) \
|
|
||||||
unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
|
||||||
const uint8_t *b, int b_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
|
||||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define SUBPIX_VAR(W, H) \
|
#define SUBPIX_VAR(W, H) \
|
||||||
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
|
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
|
||||||
const uint8_t *src, int src_stride, \
|
const uint8_t *src, int src_stride, \
|
||||||
@ -133,7 +95,7 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
|
|||||||
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
|
return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define SUBPIX_AVG_VAR(W, H) \
|
#define SUBPIX_AVG_VAR(W, H) \
|
||||||
@ -152,178 +114,51 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
|
|||||||
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
|
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
|
||||||
\
|
\
|
||||||
return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
|
return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
|
|
||||||
const uint8_t *ref_ptr, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum) {
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
|
|
||||||
}
|
|
||||||
|
|
||||||
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
|
|
||||||
const uint8_t *ref_ptr, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum) {
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
VAR(4, 4)
|
|
||||||
SUBPIX_VAR(4, 4)
|
SUBPIX_VAR(4, 4)
|
||||||
SUBPIX_AVG_VAR(4, 4)
|
SUBPIX_AVG_VAR(4, 4)
|
||||||
|
|
||||||
VAR(4, 8)
|
|
||||||
SUBPIX_VAR(4, 8)
|
SUBPIX_VAR(4, 8)
|
||||||
SUBPIX_AVG_VAR(4, 8)
|
SUBPIX_AVG_VAR(4, 8)
|
||||||
|
|
||||||
VAR(8, 4)
|
|
||||||
SUBPIX_VAR(8, 4)
|
SUBPIX_VAR(8, 4)
|
||||||
SUBPIX_AVG_VAR(8, 4)
|
SUBPIX_AVG_VAR(8, 4)
|
||||||
|
|
||||||
VAR(8, 8)
|
|
||||||
SUBPIX_VAR(8, 8)
|
SUBPIX_VAR(8, 8)
|
||||||
SUBPIX_AVG_VAR(8, 8)
|
SUBPIX_AVG_VAR(8, 8)
|
||||||
|
|
||||||
VAR(8, 16)
|
|
||||||
SUBPIX_VAR(8, 16)
|
SUBPIX_VAR(8, 16)
|
||||||
SUBPIX_AVG_VAR(8, 16)
|
SUBPIX_AVG_VAR(8, 16)
|
||||||
|
|
||||||
VAR(16, 8)
|
|
||||||
SUBPIX_VAR(16, 8)
|
SUBPIX_VAR(16, 8)
|
||||||
SUBPIX_AVG_VAR(16, 8)
|
SUBPIX_AVG_VAR(16, 8)
|
||||||
|
|
||||||
VAR(16, 16)
|
|
||||||
SUBPIX_VAR(16, 16)
|
SUBPIX_VAR(16, 16)
|
||||||
SUBPIX_AVG_VAR(16, 16)
|
SUBPIX_AVG_VAR(16, 16)
|
||||||
|
|
||||||
VAR(16, 32)
|
|
||||||
SUBPIX_VAR(16, 32)
|
SUBPIX_VAR(16, 32)
|
||||||
SUBPIX_AVG_VAR(16, 32)
|
SUBPIX_AVG_VAR(16, 32)
|
||||||
|
|
||||||
VAR(32, 16)
|
|
||||||
SUBPIX_VAR(32, 16)
|
SUBPIX_VAR(32, 16)
|
||||||
SUBPIX_AVG_VAR(32, 16)
|
SUBPIX_AVG_VAR(32, 16)
|
||||||
|
|
||||||
VAR(32, 32)
|
|
||||||
SUBPIX_VAR(32, 32)
|
SUBPIX_VAR(32, 32)
|
||||||
SUBPIX_AVG_VAR(32, 32)
|
SUBPIX_AVG_VAR(32, 32)
|
||||||
|
|
||||||
VAR(32, 64)
|
|
||||||
SUBPIX_VAR(32, 64)
|
SUBPIX_VAR(32, 64)
|
||||||
SUBPIX_AVG_VAR(32, 64)
|
SUBPIX_AVG_VAR(32, 64)
|
||||||
|
|
||||||
VAR(64, 32)
|
|
||||||
SUBPIX_VAR(64, 32)
|
SUBPIX_VAR(64, 32)
|
||||||
SUBPIX_AVG_VAR(64, 32)
|
SUBPIX_AVG_VAR(64, 32)
|
||||||
|
|
||||||
VAR(64, 64)
|
|
||||||
SUBPIX_VAR(64, 64)
|
SUBPIX_VAR(64, 64)
|
||||||
SUBPIX_AVG_VAR(64, 64)
|
SUBPIX_AVG_VAR(64, 64)
|
||||||
|
|
||||||
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
|
|
||||||
int height, const uint8_t *ref, int ref_stride) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
for (i = 0; i < height; i++) {
|
|
||||||
for (j = 0; j < width; j++) {
|
|
||||||
const int tmp = pred[j] + ref[j];
|
|
||||||
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
|
||||||
}
|
|
||||||
comp_pred += width;
|
|
||||||
pred += width;
|
|
||||||
ref += ref_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
void highbd_variance64(const uint8_t *a8, int a_stride,
|
|
||||||
const uint8_t *b8, int b_stride,
|
|
||||||
int w, int h, uint64_t *sse,
|
|
||||||
uint64_t *sum) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
|
|
||||||
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
|
|
||||||
*sum = 0;
|
|
||||||
*sse = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i++) {
|
|
||||||
for (j = 0; j < w; j++) {
|
|
||||||
const int diff = a[j] - b[j];
|
|
||||||
*sum += diff;
|
|
||||||
*sse += diff * diff;
|
|
||||||
}
|
|
||||||
a += a_stride;
|
|
||||||
b += b_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void highbd_variance(const uint8_t *a8, int a_stride,
|
|
||||||
const uint8_t *b8, int b_stride,
|
|
||||||
int w, int h, unsigned int *sse,
|
|
||||||
int *sum) {
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
uint64_t sum_long = 0;
|
|
||||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
|
||||||
*sse = (unsigned int)sse_long;
|
|
||||||
*sum = (int)sum_long;
|
|
||||||
}
|
|
||||||
|
|
||||||
void highbd_10_variance(const uint8_t *a8, int a_stride,
|
|
||||||
const uint8_t *b8, int b_stride,
|
|
||||||
int w, int h, unsigned int *sse,
|
|
||||||
int *sum) {
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
uint64_t sum_long = 0;
|
|
||||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
|
||||||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
|
|
||||||
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
void highbd_12_variance(const uint8_t *a8, int a_stride,
|
|
||||||
const uint8_t *b8, int b_stride,
|
|
||||||
int w, int h, unsigned int *sse,
|
|
||||||
int *sum) {
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
uint64_t sum_long = 0;
|
|
||||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
|
||||||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
|
|
||||||
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void highbd_var_filter_block2d_bil_first_pass(
|
static void highbd_var_filter_block2d_bil_first_pass(
|
||||||
const uint8_t *src_ptr8,
|
const uint8_t *src_ptr8,
|
||||||
uint16_t *output_ptr,
|
uint16_t *output_ptr,
|
||||||
@ -374,35 +209,6 @@ static void highbd_var_filter_block2d_bil_second_pass(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HIGHBD_VAR(W, H) \
|
|
||||||
unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
|
||||||
const uint8_t *b, int b_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
|
||||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
|
|
||||||
int a_stride, \
|
|
||||||
const uint8_t *b, \
|
|
||||||
int b_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
|
||||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
|
|
||||||
int a_stride, \
|
|
||||||
const uint8_t *b, \
|
|
||||||
int b_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
|
||||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define HIGHBD_SUBPIX_VAR(W, H) \
|
#define HIGHBD_SUBPIX_VAR(W, H) \
|
||||||
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
|
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
|
||||||
const uint8_t *src, int src_stride, \
|
const uint8_t *src, int src_stride, \
|
||||||
@ -417,7 +223,7 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
|
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
|
||||||
dst_stride, sse); \
|
dst_stride, sse); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
@ -434,7 +240,7 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
||||||
W, dst, dst_stride, sse); \
|
W, dst, dst_stride, sse); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
@ -451,7 +257,7 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
||||||
W, dst, dst_stride, sse); \
|
W, dst, dst_stride, sse); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -471,10 +277,10 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
|
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
|
||||||
dst_stride, sse); \
|
dst_stride, sse); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
@ -493,10 +299,10 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
||||||
W, dst, dst_stride, sse); \
|
W, dst, dst_stride, sse); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
@ -515,137 +321,49 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
||||||
W, dst, dst_stride, sse); \
|
W, dst, dst_stride, sse); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HIGHBD_GET_VAR(S) \
|
|
||||||
void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
|
||||||
const uint8_t *ref, int ref_stride, \
|
|
||||||
unsigned int *sse, int *sum) { \
|
|
||||||
highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
|
||||||
const uint8_t *ref, int ref_stride, \
|
|
||||||
unsigned int *sse, int *sum) { \
|
|
||||||
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
|
||||||
const uint8_t *ref, int ref_stride, \
|
|
||||||
unsigned int *sse, int *sum) { \
|
|
||||||
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define HIGHBD_MSE(W, H) \
|
|
||||||
unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \
|
|
||||||
int src_stride, \
|
|
||||||
const uint8_t *ref, \
|
|
||||||
int ref_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
|
||||||
return *sse; \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
|
|
||||||
int src_stride, \
|
|
||||||
const uint8_t *ref, \
|
|
||||||
int ref_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
|
||||||
return *sse; \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
|
|
||||||
int src_stride, \
|
|
||||||
const uint8_t *ref, \
|
|
||||||
int ref_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
|
||||||
return *sse; \
|
|
||||||
}
|
|
||||||
|
|
||||||
HIGHBD_GET_VAR(8)
|
|
||||||
HIGHBD_GET_VAR(16)
|
|
||||||
|
|
||||||
HIGHBD_MSE(16, 16)
|
|
||||||
HIGHBD_MSE(16, 8)
|
|
||||||
HIGHBD_MSE(8, 16)
|
|
||||||
HIGHBD_MSE(8, 8)
|
|
||||||
|
|
||||||
HIGHBD_VAR(4, 4)
|
|
||||||
HIGHBD_SUBPIX_VAR(4, 4)
|
HIGHBD_SUBPIX_VAR(4, 4)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(4, 4)
|
HIGHBD_SUBPIX_AVG_VAR(4, 4)
|
||||||
|
|
||||||
HIGHBD_VAR(4, 8)
|
|
||||||
HIGHBD_SUBPIX_VAR(4, 8)
|
HIGHBD_SUBPIX_VAR(4, 8)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(4, 8)
|
HIGHBD_SUBPIX_AVG_VAR(4, 8)
|
||||||
|
|
||||||
HIGHBD_VAR(8, 4)
|
|
||||||
HIGHBD_SUBPIX_VAR(8, 4)
|
HIGHBD_SUBPIX_VAR(8, 4)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(8, 4)
|
HIGHBD_SUBPIX_AVG_VAR(8, 4)
|
||||||
|
|
||||||
HIGHBD_VAR(8, 8)
|
|
||||||
HIGHBD_SUBPIX_VAR(8, 8)
|
HIGHBD_SUBPIX_VAR(8, 8)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(8, 8)
|
HIGHBD_SUBPIX_AVG_VAR(8, 8)
|
||||||
|
|
||||||
HIGHBD_VAR(8, 16)
|
|
||||||
HIGHBD_SUBPIX_VAR(8, 16)
|
HIGHBD_SUBPIX_VAR(8, 16)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(8, 16)
|
HIGHBD_SUBPIX_AVG_VAR(8, 16)
|
||||||
|
|
||||||
HIGHBD_VAR(16, 8)
|
|
||||||
HIGHBD_SUBPIX_VAR(16, 8)
|
HIGHBD_SUBPIX_VAR(16, 8)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(16, 8)
|
HIGHBD_SUBPIX_AVG_VAR(16, 8)
|
||||||
|
|
||||||
HIGHBD_VAR(16, 16)
|
|
||||||
HIGHBD_SUBPIX_VAR(16, 16)
|
HIGHBD_SUBPIX_VAR(16, 16)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(16, 16)
|
HIGHBD_SUBPIX_AVG_VAR(16, 16)
|
||||||
|
|
||||||
HIGHBD_VAR(16, 32)
|
|
||||||
HIGHBD_SUBPIX_VAR(16, 32)
|
HIGHBD_SUBPIX_VAR(16, 32)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(16, 32)
|
HIGHBD_SUBPIX_AVG_VAR(16, 32)
|
||||||
|
|
||||||
HIGHBD_VAR(32, 16)
|
|
||||||
HIGHBD_SUBPIX_VAR(32, 16)
|
HIGHBD_SUBPIX_VAR(32, 16)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(32, 16)
|
HIGHBD_SUBPIX_AVG_VAR(32, 16)
|
||||||
|
|
||||||
HIGHBD_VAR(32, 32)
|
|
||||||
HIGHBD_SUBPIX_VAR(32, 32)
|
HIGHBD_SUBPIX_VAR(32, 32)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(32, 32)
|
HIGHBD_SUBPIX_AVG_VAR(32, 32)
|
||||||
|
|
||||||
HIGHBD_VAR(32, 64)
|
|
||||||
HIGHBD_SUBPIX_VAR(32, 64)
|
HIGHBD_SUBPIX_VAR(32, 64)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(32, 64)
|
HIGHBD_SUBPIX_AVG_VAR(32, 64)
|
||||||
|
|
||||||
HIGHBD_VAR(64, 32)
|
|
||||||
HIGHBD_SUBPIX_VAR(64, 32)
|
HIGHBD_SUBPIX_VAR(64, 32)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(64, 32)
|
HIGHBD_SUBPIX_AVG_VAR(64, 32)
|
||||||
|
|
||||||
HIGHBD_VAR(64, 64)
|
|
||||||
HIGHBD_SUBPIX_VAR(64, 64)
|
HIGHBD_SUBPIX_VAR(64, 64)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(64, 64)
|
HIGHBD_SUBPIX_AVG_VAR(64, 64)
|
||||||
|
|
||||||
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
|
|
||||||
int width, int height, const uint8_t *ref8,
|
|
||||||
int ref_stride) {
|
|
||||||
int i, j;
|
|
||||||
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
for (i = 0; i < height; i++) {
|
|
||||||
for (j = 0; j < width; j++) {
|
|
||||||
const int tmp = pred[j] + ref[j];
|
|
||||||
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
|
||||||
}
|
|
||||||
comp_pred += width;
|
|
||||||
pred += width;
|
|
||||||
ref += ref_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
@ -12,31 +12,64 @@
|
|||||||
#define VP9_ENCODER_VP9_VARIANCE_H_
|
#define VP9_ENCODER_VP9_VARIANCE_H_
|
||||||
|
|
||||||
#include "vpx/vpx_integer.h"
|
#include "vpx/vpx_integer.h"
|
||||||
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void variance(const uint8_t *a, int a_stride,
|
// TODO(johannkoenig): All functions which depend on
|
||||||
const uint8_t *b, int b_stride,
|
// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp.
|
||||||
int w, int h,
|
static void variance(const uint8_t *a, int a_stride,
|
||||||
unsigned int *sse, int *sum);
|
const uint8_t *b, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
*sum = 0;
|
||||||
|
*sse = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < h; i++) {
|
||||||
|
for (j = 0; j < w; j++) {
|
||||||
|
const int diff = a[j] - b[j];
|
||||||
|
*sum += diff;
|
||||||
|
*sse += diff * diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
void highbd_variance(const uint8_t *a8, int a_stride,
|
static void highbd_variance64(const uint8_t *a8, int a_stride,
|
||||||
const uint8_t *b8, int b_stride,
|
const uint8_t *b8, int b_stride,
|
||||||
int w, int h,
|
int w, int h, uint64_t *sse, uint64_t *sum) {
|
||||||
unsigned int *sse, int *sum);
|
int i, j;
|
||||||
|
|
||||||
void highbd_10_variance(const uint8_t *a8, int a_stride,
|
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
|
||||||
const uint8_t *b8, int b_stride,
|
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
|
||||||
int w, int h,
|
*sum = 0;
|
||||||
unsigned int *sse, int *sum);
|
*sse = 0;
|
||||||
|
|
||||||
void highbd_12_variance(const uint8_t *a8, int a_stride,
|
for (i = 0; i < h; i++) {
|
||||||
const uint8_t *b8, int b_stride,
|
for (j = 0; j < w; j++) {
|
||||||
int w, int h,
|
const int diff = a[j] - b[j];
|
||||||
unsigned int *sse, int *sum);
|
*sum += diff;
|
||||||
|
*sse += diff * diff;
|
||||||
|
}
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
static void highbd_8_variance(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
uint64_t sse_long = 0;
|
||||||
|
uint64_t sum_long = 0;
|
||||||
|
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||||
|
*sse = (unsigned int)sse_long;
|
||||||
|
*sum = (int)sum_long;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
|
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
|
||||||
@ -95,15 +128,6 @@ typedef struct vp9_variance_vtable {
|
|||||||
vp9_sad_multi_d_fn_t sdx4df;
|
vp9_sad_multi_d_fn_t sdx4df;
|
||||||
} vp9_variance_fn_ptr_t;
|
} vp9_variance_fn_ptr_t;
|
||||||
|
|
||||||
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
|
|
||||||
int height, const uint8_t *ref, int ref_stride);
|
|
||||||
|
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
|
||||||
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred,
|
|
||||||
int width, int height,
|
|
||||||
const uint8_t *ref, int ref_stride);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
@ -13,237 +13,6 @@
|
|||||||
#include "vp9/encoder/vp9_variance.h"
|
#include "vp9/encoder/vp9_variance.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
uint32_t *sse, int *sum);
|
|
||||||
|
|
||||||
uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
uint32_t *sse, int *sum);
|
|
||||||
|
|
||||||
uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
uint32_t *sse, int *sum);
|
|
||||||
|
|
||||||
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
int w, int h, uint32_t *sse, int *sum,
|
|
||||||
high_variance_fn_t var_fn, int block_size) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
*sse = 0;
|
|
||||||
*sum = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i += block_size) {
|
|
||||||
for (j = 0; j < w; j += block_size) {
|
|
||||||
unsigned int sse0;
|
|
||||||
int sum0;
|
|
||||||
var_fn(src + src_stride * i + j, src_stride,
|
|
||||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
|
||||||
*sse += sse0;
|
|
||||||
*sum += sum0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
int w, int h, uint32_t *sse, int *sum,
|
|
||||||
high_variance_fn_t var_fn, int block_size) {
|
|
||||||
int i, j;
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
int64_t sum_long = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i += block_size) {
|
|
||||||
for (j = 0; j < w; j += block_size) {
|
|
||||||
unsigned int sse0;
|
|
||||||
int sum0;
|
|
||||||
var_fn(src + src_stride * i + j, src_stride,
|
|
||||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
|
||||||
sse_long += sse0;
|
|
||||||
sum_long += sum0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
|
|
||||||
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
int w, int h, uint32_t *sse, int *sum,
|
|
||||||
high_variance_fn_t var_fn, int block_size) {
|
|
||||||
int i, j;
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
int64_t sum_long = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i += block_size) {
|
|
||||||
for (j = 0; j < w; j += block_size) {
|
|
||||||
unsigned int sse0;
|
|
||||||
int sum0;
|
|
||||||
var_fn(src + src_stride * i + j, src_stride,
|
|
||||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
|
||||||
sse_long += sse0;
|
|
||||||
sum_long += sum0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
|
|
||||||
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define HIGH_GET_VAR(S) \
|
|
||||||
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, \
|
|
||||||
uint32_t *sse, int *sum) { \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
|
||||||
sse, sum); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, \
|
|
||||||
uint32_t *sse, int *sum) { \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
|
||||||
sse, sum); \
|
|
||||||
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
|
|
||||||
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, \
|
|
||||||
uint32_t *sse, int *sum) { \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
|
||||||
sse, sum); \
|
|
||||||
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
|
|
||||||
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
|
|
||||||
}
|
|
||||||
|
|
||||||
HIGH_GET_VAR(16);
|
|
||||||
HIGH_GET_VAR(8);
|
|
||||||
|
|
||||||
#undef HIGH_GET_VAR
|
|
||||||
|
|
||||||
#define VAR_FN(w, h, block_size, shift) \
|
|
||||||
uint32_t vp9_highbd_variance##w##x##h##_sse2( \
|
|
||||||
const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
|
|
||||||
int sum; \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
|
|
||||||
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
|
|
||||||
block_size); \
|
|
||||||
return *sse - (((int64_t)sum * sum) >> shift); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
|
|
||||||
const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
|
|
||||||
int sum; \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
highbd_10_variance_sse2( \
|
|
||||||
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
|
|
||||||
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
|
|
||||||
return *sse - (((int64_t)sum * sum) >> shift); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
|
|
||||||
const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
|
|
||||||
int sum; \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
highbd_12_variance_sse2( \
|
|
||||||
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
|
|
||||||
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
|
|
||||||
return *sse - (((int64_t)sum * sum) >> shift); \
|
|
||||||
}
|
|
||||||
|
|
||||||
VAR_FN(64, 64, 16, 12);
|
|
||||||
VAR_FN(64, 32, 16, 11);
|
|
||||||
VAR_FN(32, 64, 16, 11);
|
|
||||||
VAR_FN(32, 32, 16, 10);
|
|
||||||
VAR_FN(32, 16, 16, 9);
|
|
||||||
VAR_FN(16, 32, 16, 9);
|
|
||||||
VAR_FN(16, 16, 16, 8);
|
|
||||||
VAR_FN(16, 8, 8, 7);
|
|
||||||
VAR_FN(8, 16, 8, 7);
|
|
||||||
VAR_FN(8, 8, 8, 6);
|
|
||||||
|
|
||||||
#undef VAR_FN
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
|
|
||||||
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
|
|
||||||
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
|
|
||||||
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
|
|
||||||
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
|
|
||||||
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
|
|
||||||
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define DECL(w, opt) \
|
#define DECL(w, opt) \
|
||||||
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
|
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
|
||||||
ptrdiff_t src_stride, \
|
ptrdiff_t src_stride, \
|
||||||
|
@ -13,18 +13,6 @@
|
|||||||
#include "vp9/encoder/vp9_variance.h"
|
#include "vp9/encoder/vp9_variance.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
|
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
|
||||||
int x_offset, int y_offset,
|
int x_offset, int y_offset,
|
||||||
const uint8_t *dst, int dst_stride,
|
const uint8_t *dst, int dst_stride,
|
||||||
@ -42,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
|
|||||||
int height,
|
int height,
|
||||||
unsigned int *sseptr);
|
unsigned int *sseptr);
|
||||||
|
|
||||||
static void variance_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
int w, int h, unsigned int *sse, int *sum,
|
|
||||||
get_var_avx2 var_fn, int block_size) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
*sse = 0;
|
|
||||||
*sum = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i += 16) {
|
|
||||||
for (j = 0; j < w; j += block_size) {
|
|
||||||
unsigned int sse0;
|
|
||||||
int sum0;
|
|
||||||
var_fn(&src[src_stride * i + j], src_stride,
|
|
||||||
&ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
|
|
||||||
*sse += sse0;
|
|
||||||
*sum += sum0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
|
|
||||||
sse, &sum, vp9_get16x16var_avx2, 16);
|
|
||||||
return *sse - (((unsigned int)sum * sum) >> 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
|
|
||||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
|
||||||
return *sse - (((int64_t)sum * sum) >> 9);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
|
|
||||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
|
||||||
return *sse - (((int64_t)sum * sum) >> 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
|
|
||||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
|
||||||
return *sse - (((int64_t)sum * sum) >> 12);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
|
|
||||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
|
||||||
return *sse - (((int64_t)sum * sum) >> 11);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
|
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
|
||||||
int src_stride,
|
int src_stride,
|
||||||
int x_offset,
|
int x_offset,
|
||||||
|
@ -16,299 +16,6 @@
|
|||||||
#include "vp9/encoder/vp9_variance.h"
|
#include "vp9/encoder/vp9_variance.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
typedef void (*variance_fn_t)(const unsigned char *src, int src_stride,
|
|
||||||
const unsigned char *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
|
|
||||||
__m128i vsum = _mm_setzero_si128();
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < 32; ++i) {
|
|
||||||
const __m128i v = _mm_loadu_si128((const __m128i *)src);
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
|
|
||||||
src += 8;
|
|
||||||
}
|
|
||||||
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
|
||||||
return _mm_cvtsi128_si32(vsum);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define READ64(p, stride, i) \
|
|
||||||
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
|
|
||||||
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
|
|
||||||
|
|
||||||
static void get4x4var_sse2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum) {
|
|
||||||
const __m128i zero = _mm_setzero_si128();
|
|
||||||
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
|
|
||||||
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
|
|
||||||
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
|
|
||||||
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
|
|
||||||
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
|
|
||||||
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
|
|
||||||
|
|
||||||
// sum
|
|
||||||
__m128i vsum = _mm_add_epi16(diff0, diff1);
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
|
|
||||||
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
|
|
||||||
|
|
||||||
// sse
|
|
||||||
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
|
|
||||||
_mm_madd_epi16(diff1, diff1));
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
|
||||||
*sse = _mm_cvtsi128_si32(vsum);
|
|
||||||
}
|
|
||||||
|
|
||||||
void vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum) {
|
|
||||||
const __m128i zero = _mm_setzero_si128();
|
|
||||||
__m128i vsum = _mm_setzero_si128();
|
|
||||||
__m128i vsse = _mm_setzero_si128();
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < 8; i += 2) {
|
|
||||||
const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
|
|
||||||
(const __m128i *)(src + i * src_stride)), zero);
|
|
||||||
const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
|
|
||||||
(const __m128i *)(ref + i * ref_stride)), zero);
|
|
||||||
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
|
|
||||||
|
|
||||||
const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
|
|
||||||
(const __m128i *)(src + (i + 1) * src_stride)), zero);
|
|
||||||
const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
|
|
||||||
(const __m128i *)(ref + (i + 1) * ref_stride)), zero);
|
|
||||||
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
|
|
||||||
|
|
||||||
vsum = _mm_add_epi16(vsum, diff0);
|
|
||||||
vsum = _mm_add_epi16(vsum, diff1);
|
|
||||||
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
|
|
||||||
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
|
|
||||||
}
|
|
||||||
|
|
||||||
// sum
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
|
|
||||||
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
|
|
||||||
|
|
||||||
// sse
|
|
||||||
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
|
|
||||||
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
|
|
||||||
*sse = _mm_cvtsi128_si32(vsse);
|
|
||||||
}

void vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
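
/* For 16x16 a fully packed reduction would overflow: 256 differences of up
 * to 255 give a worst-case sum of 65280, which does not fit in int16_t.
 * After the two shifts above each 16-bit lane holds at most 128 differences
 * (<= 32640), so the last step extracts two lanes and adds them in 32-bit
 * C arithmetic instead of doing one more packed 16-bit add.
 */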

static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
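
/* variance_sse2() simply tiles a w x h block with block_size x block_size
 * kernel calls and accumulates their partial results; e.g. a 32x32 block is
 * four 16x16 calls, as in the wrappers below. A hedged usage sketch:
 *
 *   unsigned int sse;
 *   int sum;
 *   variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
 *                 &sse, &sum, vp9_get16x16var_sse2, 16);
 */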

unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}
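
/* All of these wrappers compute variance = sse - sum^2 / (w * h); the shift
 * count is log2(w * h), e.g. >> 8 for 16x16. Up to 16x16 the square fits in
 * 32 bits (|sum| <= 65280, so sum^2 < 2^32), but for 32x32 and larger |sum|
 * can reach 32 * 32 * 255 = 261120 and its square no longer fits, hence the
 * (int64_t) cast in the wrappers below.
 */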

unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h

VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
363
vpx_dsp/arm/variance_media.asm
Normal file
@ -0,0 +1,363 @@
;
|
||||||
|
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||||
|
;
|
||||||
|
; Use of this source code is governed by a BSD-style license
|
||||||
|
; that can be found in the LICENSE file in the root of the source
|
||||||
|
; tree. An additional intellectual property rights grant can be found
|
||||||
|
; in the file PATENTS. All contributing project authors may
|
||||||
|
; be found in the AUTHORS file in the root of the source tree.
|
||||||
|
;
|
||||||
|
|
||||||
|
|
||||||
|
EXPORT |vpx_variance16x16_media|
|
||||||
|
EXPORT |vpx_variance8x8_media|
|
||||||
|
EXPORT |vpx_mse16x16_media|
|
||||||
|
|
||||||
|
ARM
|
||||||
|
REQUIRE8
|
||||||
|
PRESERVE8
|
||||||
|
|
||||||
|
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||||
|
|
||||||
|
; r0 unsigned char *src_ptr
|
||||||
|
; r1 int source_stride
|
||||||
|
; r2 unsigned char *ref_ptr
|
||||||
|
; r3 int recon_stride
|
||||||
|
; stack unsigned int *sse
|
||||||
|
|vpx_variance16x16_media| PROC
|
||||||
|
|
||||||
|
stmfd sp!, {r4-r12, lr}
|
||||||
|
|
||||||
|
pld [r0, r1, lsl #0]
|
||||||
|
pld [r2, r3, lsl #0]
|
||||||
|
|
||||||
|
mov r8, #0 ; initialize sum = 0
|
||||||
|
mov r11, #0 ; initialize sse = 0
|
||||||
|
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||||
|
|
||||||
|
loop16x16
|
||||||
|
; 1st 4 pixels
|
||||||
|
ldr r4, [r0, #0] ; load 4 src pixels
|
||||||
|
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||||
|
|
||||||
|
mov lr, #0 ; constant zero
|
||||||
|
|
||||||
|
usub8 r6, r4, r5 ; calculate difference
|
||||||
|
pld [r0, r1, lsl #1]
|
||||||
|
sel r7, r6, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||||
|
pld [r2, r3, lsl #1]
|
||||||
|
sel r6, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||||
|
orr r6, r6, r7 ; differences of all 4 pixels
|
||||||
|
; calculate total sum
|
||||||
|
adds r8, r8, r4 ; add positive differences to sum
|
||||||
|
subs r8, r8, r5 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 2nd 4 pixels
|
||||||
|
ldr r4, [r0, #4] ; load 4 src pixels
|
||||||
|
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||||
|
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r6, r4, r5 ; calculate difference
|
||||||
|
sel r7, r6, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||||
|
sel r6, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||||
|
orr r6, r6, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
; calculate total sum
|
||||||
|
add r8, r8, r4 ; add positive differences to sum
|
||||||
|
sub r8, r8, r5 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 3rd 4 pixels
|
||||||
|
ldr r4, [r0, #8] ; load 4 src pixels
|
||||||
|
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||||
|
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r6, r4, r5 ; calculate difference
|
||||||
|
sel r7, r6, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||||
|
sel r6, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||||
|
orr r6, r6, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
; calculate total sum
|
||||||
|
add r8, r8, r4 ; add positive differences to sum
|
||||||
|
sub r8, r8, r5 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 4th 4 pixels
|
||||||
|
ldr r4, [r0, #12] ; load 4 src pixels
|
||||||
|
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||||
|
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r6, r4, r5 ; calculate difference
|
||||||
|
add r0, r0, r1 ; set src_ptr to next row
|
||||||
|
sel r7, r6, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||||
|
add r2, r2, r3 ; set dst_ptr to next row
|
||||||
|
sel r6, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||||
|
orr r6, r6, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
; calculate total sum
|
||||||
|
add r8, r8, r4 ; add positive differences to sum
|
||||||
|
sub r8, r8, r5 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||||
|
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
|
||||||
|
subs r12, r12, #1
|
||||||
|
|
||||||
|
bne loop16x16
|
||||||
|
|
||||||
|
; return stuff
|
||||||
|
ldr r6, [sp, #40] ; get address of sse
|
||||||
|
mul r0, r8, r8 ; sum * sum
|
||||||
|
str r11, [r6] ; store sse
|
||||||
|
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||||
|
|
||||||
|
ldmfd sp!, {r4-r12, pc}
|
||||||
|
|
||||||
|
ENDP
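
; A note on the media (ARMv6 SIMD) idiom used above: usub8 subtracts the four
; bytes in parallel and sets the APSR GE flags per byte, and sel then keeps
; only the bytes whose subtraction did not borrow, so one usub8/sel pair
; isolates the positive differences and the reversed pair isolates the
; negative ones. usad8 against zero turns each selected register into a sum
; of byte values, and uxtb16 + smlad squares and accumulates two 16-bit
; differences per instruction for the sse term.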
|
||||||
|
|
||||||
|
END
|
||||||
|
|
||||||
|
|
||||||
|
; r0 unsigned char *src_ptr
|
||||||
|
; r1 int source_stride
|
||||||
|
; r2 unsigned char *ref_ptr
|
||||||
|
; r3 int recon_stride
|
||||||
|
; stack unsigned int *sse
|
||||||
|
|vpx_variance8x8_media| PROC
|
||||||
|
|
||||||
|
push {r4-r10, lr}
|
||||||
|
|
||||||
|
pld [r0, r1, lsl #0]
|
||||||
|
pld [r2, r3, lsl #0]
|
||||||
|
|
||||||
|
mov r12, #8 ; set loop counter to 8 (=block height)
|
||||||
|
mov r4, #0 ; initialize sum = 0
|
||||||
|
mov r5, #0 ; initialize sse = 0
|
||||||
|
|
||||||
|
loop8x8
|
||||||
|
; 1st 4 pixels
|
||||||
|
ldr r6, [r0, #0x0] ; load 4 src pixels
|
||||||
|
ldr r7, [r2, #0x0] ; load 4 ref pixels
|
||||||
|
|
||||||
|
mov lr, #0 ; constant zero
|
||||||
|
|
||||||
|
usub8 r8, r6, r7 ; calculate difference
|
||||||
|
pld [r0, r1, lsl #1]
|
||||||
|
sel r10, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r7, r6 ; calculate difference with reversed operands
|
||||||
|
pld [r2, r3, lsl #1]
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r6, r10, lr ; calculate sum of positive differences
|
||||||
|
usad8 r7, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r10 ; differences of all 4 pixels
|
||||||
|
; calculate total sum
|
||||||
|
add r4, r4, r6 ; add positive differences to sum
|
||||||
|
sub r4, r4, r7 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 2nd 4 pixels
|
||||||
|
ldr r6, [r0, #0x4] ; load 4 src pixels
|
||||||
|
ldr r7, [r2, #0x4] ; load 4 ref pixels
|
||||||
|
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r8, r6, r7 ; calculate difference
|
||||||
|
add r0, r0, r1 ; set src_ptr to next row
|
||||||
|
sel r10, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r7, r6 ; calculate difference with reversed operands
|
||||||
|
add r2, r2, r3 ; set dst_ptr to next row
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r6, r10, lr ; calculate sum of positive differences
|
||||||
|
usad8 r7, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r10 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
; calculate total sum
|
||||||
|
add r4, r4, r6 ; add positive differences to sum
|
||||||
|
sub r4, r4, r7 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
|
||||||
|
subs r12, r12, #1 ; next row
|
||||||
|
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
bne loop8x8
|
||||||
|
|
||||||
|
; return stuff
|
||||||
|
ldr r8, [sp, #32] ; get address of sse
|
||||||
|
mul r1, r4, r4 ; sum * sum
|
||||||
|
str r5, [r8] ; store sse
|
||||||
|
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
|
||||||
|
|
||||||
|
pop {r4-r10, pc}
|
||||||
|
|
||||||
|
ENDP
|
||||||
|
|
||||||
|
END
|
||||||
|
|
||||||
|
; r0 unsigned char *src_ptr
|
||||||
|
; r1 int source_stride
|
||||||
|
; r2 unsigned char *ref_ptr
|
||||||
|
; r3 int recon_stride
|
||||||
|
; stack unsigned int *sse
|
||||||
|
;
|
||||||
|
;note: Based on vpx_variance16x16_media. In this function, sum is never used.
|
||||||
|
; So, we can remove this part of the calculation.
|
||||||
|
|
||||||
|
|vpx_mse16x16_media| PROC
|
||||||
|
|
||||||
|
push {r4-r9, lr}
|
||||||
|
|
||||||
|
pld [r0, r1, lsl #0]
|
||||||
|
pld [r2, r3, lsl #0]
|
||||||
|
|
||||||
|
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||||
|
mov r4, #0 ; initialize sse = 0
|
||||||
|
|
||||||
|
loopmse
|
||||||
|
; 1st 4 pixels
|
||||||
|
ldr r5, [r0, #0x0] ; load 4 src pixels
|
||||||
|
ldr r6, [r2, #0x0] ; load 4 ref pixels
|
||||||
|
|
||||||
|
mov lr, #0 ; constant zero
|
||||||
|
|
||||||
|
usub8 r8, r5, r6 ; calculate difference
|
||||||
|
pld [r0, r1, lsl #1]
|
||||||
|
sel r7, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||||
|
pld [r2, r3, lsl #1]
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
ldr r5, [r0, #0x4] ; load 4 src pixels
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 2nd 4 pixels
|
||||||
|
ldr r6, [r2, #0x4] ; load 4 ref pixels
|
||||||
|
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r8, r5, r6 ; calculate difference
|
||||||
|
sel r7, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r7 ; differences of all 4 pixels
|
||||||
|
ldr r5, [r0, #0x8] ; load 4 src pixels
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 3rd 4 pixels
|
||||||
|
ldr r6, [r2, #0x8] ; load 4 ref pixels
|
||||||
|
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r8, r5, r6 ; calculate difference
|
||||||
|
sel r7, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
ldr r5, [r0, #0xc] ; load 4 src pixels
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 4th 4 pixels
|
||||||
|
ldr r6, [r2, #0xc] ; load 4 ref pixels
|
||||||
|
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r8, r5, r6 ; calculate difference
|
||||||
|
add r0, r0, r1 ; set src_ptr to next row
|
||||||
|
sel r7, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||||
|
add r2, r2, r3 ; set dst_ptr to next row
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
subs r12, r12, #1 ; next row
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||||
|
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
bne loopmse
|
||||||
|
|
||||||
|
; return stuff
|
||||||
|
ldr r1, [sp, #28] ; get address of sse
|
||||||
|
mov r0, r4 ; return sse
|
||||||
|
str r4, [r1] ; store sse
|
||||||
|
|
||||||
|
pop {r4-r9, pc}
|
||||||
|
|
||||||
|
ENDP
|
||||||
|
|
||||||
|
END
|
417
vpx_dsp/arm/variance_neon.c
Normal file
@ -0,0 +1,417 @@
/*
|
||||||
|
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
#include "./vpx_config.h"
|
||||||
|
|
||||||
|
#include "vpx/vpx_integer.h"
|
||||||
|
|
||||||
|
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
|
||||||
|
const int32x4_t a = vpaddlq_s16(v_16x8);
|
||||||
|
const int64x2_t b = vpaddlq_s32(a);
|
||||||
|
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
|
||||||
|
vreinterpret_s32_s64(vget_high_s64(b)));
|
||||||
|
return vget_lane_s32(c, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
|
||||||
|
const int64x2_t b = vpaddlq_s32(v_32x4);
|
||||||
|
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
|
||||||
|
vreinterpret_s32_s64(vget_high_s64(b)));
|
||||||
|
return vget_lane_s32(c, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// w * h must be less than 2048 or local variable v_sum may overflow.
|
||||||
|
static void variance_neon_w8(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
int w, int h, uint32_t *sse, int *sum) {
|
||||||
|
int i, j;
|
||||||
|
int16x8_t v_sum = vdupq_n_s16(0);
|
||||||
|
int32x4_t v_sse_lo = vdupq_n_s32(0);
|
||||||
|
int32x4_t v_sse_hi = vdupq_n_s32(0);
|
||||||
|
|
||||||
|
for (i = 0; i < h; ++i) {
|
||||||
|
for (j = 0; j < w; j += 8) {
|
||||||
|
const uint8x8_t v_a = vld1_u8(&a[j]);
|
||||||
|
const uint8x8_t v_b = vld1_u8(&b[j]);
|
||||||
|
const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
|
||||||
|
const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
|
||||||
|
v_sum = vaddq_s16(v_sum, sv_diff);
|
||||||
|
v_sse_lo = vmlal_s16(v_sse_lo,
|
||||||
|
vget_low_s16(sv_diff),
|
||||||
|
vget_low_s16(sv_diff));
|
||||||
|
v_sse_hi = vmlal_s16(v_sse_hi,
|
||||||
|
vget_high_s16(sv_diff),
|
||||||
|
vget_high_s16(sv_diff));
|
||||||
|
}
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
|
||||||
|
*sum = horizontal_add_s16x8(v_sum);
|
||||||
|
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
|
||||||
|
}
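
/* Where the "w * h must be less than 2048" bound comes from: v_sum is eight
 * int16_t lanes and each lane accumulates (w * h) / 8 signed differences of
 * magnitude at most 255. For w * h = 1024 that is 128 * 255 = 32640, still
 * inside int16_t; doubling it could wrap. This is why the 32x64, 64x32 and
 * 64x64 wrappers below split the block into passes of at most 1024 pixels.
 */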
|
||||||
|
|
||||||
|
void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse, int *sum) {
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse, int *sum) {
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
|
||||||
|
return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
|
||||||
|
return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
|
||||||
|
return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum1, sum2;
|
||||||
|
uint32_t sse1, sse2;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
|
||||||
|
variance_neon_w8(a + (32 * a_stride), a_stride,
|
||||||
|
b + (32 * b_stride), b_stride, 32, 32,
|
||||||
|
&sse2, &sum2);
|
||||||
|
*sse = sse1 + sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum1, sum2;
|
||||||
|
uint32_t sse1, sse2;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
|
||||||
|
variance_neon_w8(a + (16 * a_stride), a_stride,
|
||||||
|
b + (16 * b_stride), b_stride, 64, 16,
|
||||||
|
&sse2, &sum2);
|
||||||
|
*sse = sse1 + sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum1, sum2;
|
||||||
|
uint32_t sse1, sse2;
|
||||||
|
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
|
||||||
|
variance_neon_w8(a + (16 * a_stride), a_stride,
|
||||||
|
b + (16 * b_stride), b_stride, 64, 16,
|
||||||
|
&sse2, &sum2);
|
||||||
|
sse1 += sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
|
||||||
|
variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
|
||||||
|
b + (16 * 2 * b_stride), b_stride,
|
||||||
|
64, 16, &sse2, &sum2);
|
||||||
|
sse1 += sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
|
||||||
|
variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
|
||||||
|
b + (16 * 3 * b_stride), b_stride,
|
||||||
|
64, 16, &sse2, &sum2);
|
||||||
|
*sse = sse1 + sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance16x8_neon(
|
||||||
|
const unsigned char *src_ptr,
|
||||||
|
int source_stride,
|
||||||
|
const unsigned char *ref_ptr,
|
||||||
|
int recon_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int i;
|
||||||
|
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
||||||
|
uint32x2_t d0u32, d10u32;
|
||||||
|
int64x1_t d0s64, d1s64;
|
||||||
|
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||||
|
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||||
|
int32x4_t q8s32, q9s32, q10s32;
|
||||||
|
int64x2_t q0s64, q1s64, q5s64;
|
||||||
|
|
||||||
|
q8s32 = vdupq_n_s32(0);
|
||||||
|
q9s32 = vdupq_n_s32(0);
|
||||||
|
q10s32 = vdupq_n_s32(0);
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
q0u8 = vld1q_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
q1u8 = vld1q_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
__builtin_prefetch(src_ptr);
|
||||||
|
|
||||||
|
q2u8 = vld1q_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
q3u8 = vld1q_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
__builtin_prefetch(ref_ptr);
|
||||||
|
|
||||||
|
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
||||||
|
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
||||||
|
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
||||||
|
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
||||||
|
|
||||||
|
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||||
|
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
||||||
|
|
||||||
|
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||||
|
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||||
|
|
||||||
|
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
||||||
|
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
||||||
|
|
||||||
|
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
||||||
|
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
||||||
|
}
|
||||||
|
|
||||||
|
q10s32 = vaddq_s32(q10s32, q9s32);
|
||||||
|
q0s64 = vpaddlq_s32(q8s32);
|
||||||
|
q1s64 = vpaddlq_s32(q10s32);
|
||||||
|
|
||||||
|
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
||||||
|
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||||
|
|
||||||
|
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
||||||
|
vreinterpret_s32_s64(d0s64));
|
||||||
|
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
||||||
|
|
||||||
|
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
||||||
|
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
||||||
|
|
||||||
|
return vget_lane_u32(d0u32, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance8x16_neon(
|
||||||
|
const unsigned char *src_ptr,
|
||||||
|
int source_stride,
|
||||||
|
const unsigned char *ref_ptr,
|
||||||
|
int recon_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int i;
|
||||||
|
uint8x8_t d0u8, d2u8, d4u8, d6u8;
|
||||||
|
int16x4_t d22s16, d23s16, d24s16, d25s16;
|
||||||
|
uint32x2_t d0u32, d10u32;
|
||||||
|
int64x1_t d0s64, d1s64;
|
||||||
|
uint16x8_t q11u16, q12u16;
|
||||||
|
int32x4_t q8s32, q9s32, q10s32;
|
||||||
|
int64x2_t q0s64, q1s64, q5s64;
|
||||||
|
|
||||||
|
q8s32 = vdupq_n_s32(0);
|
||||||
|
q9s32 = vdupq_n_s32(0);
|
||||||
|
q10s32 = vdupq_n_s32(0);
|
||||||
|
|
||||||
|
for (i = 0; i < 8; i++) {
|
||||||
|
d0u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d2u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
__builtin_prefetch(src_ptr);
|
||||||
|
|
||||||
|
d4u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
d6u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
__builtin_prefetch(ref_ptr);
|
||||||
|
|
||||||
|
q11u16 = vsubl_u8(d0u8, d4u8);
|
||||||
|
q12u16 = vsubl_u8(d2u8, d6u8);
|
||||||
|
|
||||||
|
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||||
|
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
||||||
|
|
||||||
|
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||||
|
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||||
|
}
|
||||||
|
|
||||||
|
q10s32 = vaddq_s32(q10s32, q9s32);
|
||||||
|
q0s64 = vpaddlq_s32(q8s32);
|
||||||
|
q1s64 = vpaddlq_s32(q10s32);
|
||||||
|
|
||||||
|
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
||||||
|
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||||
|
|
||||||
|
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
||||||
|
vreinterpret_s32_s64(d0s64));
|
||||||
|
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
||||||
|
|
||||||
|
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
||||||
|
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
||||||
|
|
||||||
|
return vget_lane_u32(d0u32, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_mse16x16_neon(
|
||||||
|
const unsigned char *src_ptr,
|
||||||
|
int source_stride,
|
||||||
|
const unsigned char *ref_ptr,
|
||||||
|
int recon_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int i;
|
||||||
|
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
||||||
|
int64x1_t d0s64;
|
||||||
|
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||||
|
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
||||||
|
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||||
|
int64x2_t q1s64;
|
||||||
|
|
||||||
|
q7s32 = vdupq_n_s32(0);
|
||||||
|
q8s32 = vdupq_n_s32(0);
|
||||||
|
q9s32 = vdupq_n_s32(0);
|
||||||
|
q10s32 = vdupq_n_s32(0);
|
||||||
|
|
||||||
|
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
|
||||||
|
q0u8 = vld1q_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
q1u8 = vld1q_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
q2u8 = vld1q_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
q3u8 = vld1q_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
|
||||||
|
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
||||||
|
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
||||||
|
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
||||||
|
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
||||||
|
|
||||||
|
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||||
|
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||||
|
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
|
||||||
|
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
|
||||||
|
|
||||||
|
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||||
|
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||||
|
|
||||||
|
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
||||||
|
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
||||||
|
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
|
||||||
|
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
|
||||||
|
|
||||||
|
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
||||||
|
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
||||||
|
}
|
||||||
|
|
||||||
|
q7s32 = vaddq_s32(q7s32, q8s32);
|
||||||
|
q9s32 = vaddq_s32(q9s32, q10s32);
|
||||||
|
q10s32 = vaddq_s32(q7s32, q9s32);
|
||||||
|
|
||||||
|
q1s64 = vpaddlq_s32(q10s32);
|
||||||
|
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||||
|
|
||||||
|
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
|
||||||
|
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_get4x4sse_cs_neon(
|
||||||
|
const unsigned char *src_ptr,
|
||||||
|
int source_stride,
|
||||||
|
const unsigned char *ref_ptr,
|
||||||
|
int recon_stride) {
|
||||||
|
int16x4_t d22s16, d24s16, d26s16, d28s16;
|
||||||
|
int64x1_t d0s64;
|
||||||
|
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
||||||
|
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
||||||
|
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||||
|
int64x2_t q1s64;
|
||||||
|
|
||||||
|
d0u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d4u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
d1u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d5u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
d2u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d6u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
d3u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d7u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
|
||||||
|
q11u16 = vsubl_u8(d0u8, d4u8);
|
||||||
|
q12u16 = vsubl_u8(d1u8, d5u8);
|
||||||
|
q13u16 = vsubl_u8(d2u8, d6u8);
|
||||||
|
q14u16 = vsubl_u8(d3u8, d7u8);
|
||||||
|
|
||||||
|
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
|
||||||
|
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
|
||||||
|
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
|
||||||
|
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
|
||||||
|
|
||||||
|
q7s32 = vmull_s16(d22s16, d22s16);
|
||||||
|
q8s32 = vmull_s16(d24s16, d24s16);
|
||||||
|
q9s32 = vmull_s16(d26s16, d26s16);
|
||||||
|
q10s32 = vmull_s16(d28s16, d28s16);
|
||||||
|
|
||||||
|
q7s32 = vaddq_s32(q7s32, q8s32);
|
||||||
|
q9s32 = vaddq_s32(q9s32, q10s32);
|
||||||
|
q9s32 = vaddq_s32(q7s32, q9s32);
|
||||||
|
|
||||||
|
q1s64 = vpaddlq_s32(q9s32);
|
||||||
|
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||||
|
|
||||||
|
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
||||||
|
}
|
@ -33,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
  return sad;
}

// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
 * The function averages every corresponding element of the buffers and stores
 * the value in a third buffer, comp_pred.
306
vpx_dsp/variance.c
Normal file
@ -0,0 +1,306 @@
/*
|
||||||
|
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "./vpx_config.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
|
#include "vpx_ports/mem.h"
|
||||||
|
#include "vpx/vpx_integer.h"
|
||||||
|
|
||||||
|
unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
|
||||||
|
const unsigned char *b, int b_stride) {
|
||||||
|
int distortion = 0;
|
||||||
|
int r, c;
|
||||||
|
|
||||||
|
for (r = 0; r < 4; r++) {
|
||||||
|
for (c = 0; c < 4; c++) {
|
||||||
|
int diff = a[c] - b[c];
|
||||||
|
distortion += diff * diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
|
||||||
|
return distortion;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_get_mb_ss_c(const int16_t *a) {
|
||||||
|
unsigned int i, sum = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < 256; ++i) {
|
||||||
|
sum += a[i] * a[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void variance(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
*sum = 0;
|
||||||
|
*sse = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < h; i++) {
|
||||||
|
for (j = 0; j < w; j++) {
|
||||||
|
const int diff = a[j] - b[j];
|
||||||
|
*sum += diff;
|
||||||
|
*sse += diff * diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define VAR(W, H) \
|
||||||
|
unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||||
|
const uint8_t *b, int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||||
|
}
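
/* As an illustration, VAR(4, 4) expands to (modulo line breaks):
 *
 *   unsigned int vpx_variance4x4_c(const uint8_t *a, int a_stride,
 *                                  const uint8_t *b, int b_stride,
 *                                  unsigned int *sse) {
 *     int sum;
 *     variance(a, a_stride, b, b_stride, 4, 4, sse, &sum);
 *     return *sse - (((int64_t)sum * sum) / (4 * 4));
 *   }
 */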
|
||||||
|
|
||||||
|
/* Identical to the variance call except it takes an additional parameter, sum,
|
||||||
|
* and returns that value using pass-by-reference instead of returning
|
||||||
|
* sse - sum^2 / w*h
|
||||||
|
*/
|
||||||
|
#define GET_VAR(W, H) \
|
||||||
|
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
|
||||||
|
const uint8_t *b, int b_stride, \
|
||||||
|
unsigned int *sse, int *sum) { \
|
||||||
|
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Identical to the variance call except it does not calculate the
|
||||||
|
* sse - sum^2 / w*h and returns sse in addition to modifying the passed in
|
||||||
|
* variable.
|
||||||
|
*/
|
||||||
|
#define MSE(W, H) \
|
||||||
|
unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||||
|
const uint8_t *b, int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse; \
|
||||||
|
}
|
||||||
|
|
||||||
|
VAR(64, 64)
|
||||||
|
VAR(64, 32)
|
||||||
|
VAR(32, 64)
|
||||||
|
VAR(32, 32)
|
||||||
|
VAR(32, 16)
|
||||||
|
VAR(16, 32)
|
||||||
|
VAR(16, 16)
|
||||||
|
VAR(16, 8)
|
||||||
|
VAR(8, 16)
|
||||||
|
VAR(8, 8)
|
||||||
|
VAR(8, 4)
|
||||||
|
VAR(4, 8)
|
||||||
|
VAR(4, 4)
|
||||||
|
|
||||||
|
GET_VAR(16, 16)
|
||||||
|
GET_VAR(8, 8)
|
||||||
|
|
||||||
|
MSE(16, 16)
|
||||||
|
MSE(16, 8)
|
||||||
|
MSE(8, 16)
|
||||||
|
MSE(8, 8)
|
||||||
|
|
||||||
|
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
|
||||||
|
int height, const uint8_t *ref, int ref_stride) {
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
for (i = 0; i < height; i++) {
|
||||||
|
for (j = 0; j < width; j++) {
|
||||||
|
const int tmp = pred[j] + ref[j];
|
||||||
|
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
||||||
|
}
|
||||||
|
comp_pred += width;
|
||||||
|
pred += width;
|
||||||
|
ref += ref_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
static void highbd_variance64(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, uint64_t *sse, uint64_t *sum) {
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
|
||||||
|
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
|
||||||
|
*sum = 0;
|
||||||
|
*sse = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < h; i++) {
|
||||||
|
for (j = 0; j < w; j++) {
|
||||||
|
const int diff = a[j] - b[j];
|
||||||
|
*sum += diff;
|
||||||
|
*sse += diff * diff;
|
||||||
|
}
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void highbd_8_variance(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
uint64_t sse_long = 0;
|
||||||
|
uint64_t sum_long = 0;
|
||||||
|
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||||
|
*sse = (unsigned int)sse_long;
|
||||||
|
*sum = (int)sum_long;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void highbd_10_variance(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
uint64_t sse_long = 0;
|
||||||
|
uint64_t sum_long = 0;
|
||||||
|
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||||
|
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
|
||||||
|
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void highbd_12_variance(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
uint64_t sse_long = 0;
|
||||||
|
uint64_t sum_long = 0;
|
||||||
|
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||||
|
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
|
||||||
|
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
|
||||||
|
}
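
/* The rounding shifts normalize the high-bit-depth results back to the 8-bit
 * scale: 10-bit samples make each difference up to 4x larger, so sum is
 * divided by 2^2 and sse (a squared quantity) by 2^4; for 12-bit input the
 * factors become 2^4 and 2^8 respectively.
 */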
|
||||||
|
|
||||||
|
#define HIGHBD_VAR(W, H) \
|
||||||
|
unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
|
||||||
|
int a_stride, \
|
||||||
|
const uint8_t *b, \
|
||||||
|
int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
|
||||||
|
int a_stride, \
|
||||||
|
const uint8_t *b, \
|
||||||
|
int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
|
||||||
|
int a_stride, \
|
||||||
|
const uint8_t *b, \
|
||||||
|
int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HIGHBD_GET_VAR(S) \
|
||||||
|
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||||
|
const uint8_t *ref, int ref_stride, \
|
||||||
|
unsigned int *sse, int *sum) { \
|
||||||
|
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||||
|
const uint8_t *ref, int ref_stride, \
|
||||||
|
unsigned int *sse, int *sum) { \
|
||||||
|
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||||
|
const uint8_t *ref, int ref_stride, \
|
||||||
|
unsigned int *sse, int *sum) { \
|
||||||
|
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HIGHBD_MSE(W, H) \
|
||||||
|
unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
|
||||||
|
int src_stride, \
|
||||||
|
const uint8_t *ref, \
|
||||||
|
int ref_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||||
|
return *sse; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
|
||||||
|
int src_stride, \
|
||||||
|
const uint8_t *ref, \
|
||||||
|
int ref_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||||
|
return *sse; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
|
||||||
|
int src_stride, \
|
||||||
|
const uint8_t *ref, \
|
||||||
|
int ref_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||||
|
return *sse; \
|
||||||
|
}
|
||||||
|
|
||||||
|
HIGHBD_GET_VAR(8)
|
||||||
|
HIGHBD_GET_VAR(16)
|
||||||
|
|
||||||
|
HIGHBD_MSE(16, 16)
|
||||||
|
HIGHBD_MSE(16, 8)
|
||||||
|
HIGHBD_MSE(8, 16)
|
||||||
|
HIGHBD_MSE(8, 8)
|
||||||
|
|
||||||
|
HIGHBD_VAR(64, 64)
|
||||||
|
HIGHBD_VAR(64, 32)
|
||||||
|
HIGHBD_VAR(32, 64)
|
||||||
|
HIGHBD_VAR(32, 32)
|
||||||
|
HIGHBD_VAR(32, 16)
|
||||||
|
HIGHBD_VAR(16, 32)
|
||||||
|
HIGHBD_VAR(16, 16)
|
||||||
|
HIGHBD_VAR(16, 8)
|
||||||
|
HIGHBD_VAR(8, 16)
|
||||||
|
HIGHBD_VAR(8, 8)
|
||||||
|
HIGHBD_VAR(8, 4)
|
||||||
|
HIGHBD_VAR(4, 8)
|
||||||
|
HIGHBD_VAR(4, 4)
|
||||||
|
|
||||||
|
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
|
||||||
|
int width, int height, const uint8_t *ref8,
|
||||||
|
int ref_stride) {
|
||||||
|
int i, j;
|
||||||
|
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
|
||||||
|
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
||||||
|
for (i = 0; i < height; i++) {
|
||||||
|
for (j = 0; j < width; j++) {
|
||||||
|
const int tmp = pred[j] + ref[j];
|
||||||
|
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
||||||
|
}
|
||||||
|
comp_pred += width;
|
||||||
|
pred += width;
|
||||||
|
ref += ref_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c


DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm

endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS

ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c

DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c

DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c

ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)

DSP_SRCS-yes += vpx_dsp_rtcd.c
@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS

if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {

add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x32 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x64 sse2 neon/;

add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x32 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x16 sse2 avx2/;

add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x32 sse2/;

add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance16x8 mmx sse2 neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance8x16 mmx sse2 neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance8x8 mmx sse2 media neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance8x4 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance4x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance4x4 mmx sse2/;
|
||||||
|
|
||||||
|
|
||||||
|
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
specialize qw/vpx_get16x16var sse2 avx2 neon/;
|
||||||
|
|
||||||
|
add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
specialize qw/vpx_get8x8var mmx sse2 neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_mse16x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_mse8x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_mse8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
|
||||||
|
specialize qw/vpx_get_mb_ss mmx sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
|
||||||
|
specialize qw/vpx_get4x4sse_cs neon/;
|
||||||
|
|
||||||
|
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
|
||||||
|
|
||||||
|
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance64x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance64x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance32x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance32x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance32x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance16x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance16x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance8x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance64x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance64x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance32x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance32x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance32x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance16x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance16x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance8x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance64x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance64x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance32x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance32x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance32x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance16x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance16x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance8x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
|
||||||
|
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
|
||||||
|
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
|
||||||
|
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_mse16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_mse8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_mse16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_mse8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_mse16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_mse8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
|
||||||
|
} # CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
|
||||||
|
|
||||||
1;
|
1;
|
||||||
|
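Each add_proto line above declares one DSP function signature for the runtime CPU detection (RTCD) system, and each specialize line lists the optimized versions that the build may select for that symbol. As a hedged sketch of what the generated vpx_dsp_rtcd.h roughly contains for one of these prototypes (the real generated header varies per target and configuration, so the names below are assumptions beyond the _c/_sse2 suffix convention):

/* Illustrative only: per-implementation declarations plus, on x86,
 * a function pointer that setup code fills in from CPU feature flags. */
unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride,
                                 const uint8_t *ref_ptr, int ref_stride,
                                 unsigned int *sse);
unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse);
extern unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr,
                                         int ref_stride, unsigned int *sse);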
@@ -11,7 +11,7 @@

%include "vpx_ports/x86_abi_support.asm"

-;unsigned int vp9_highbd_calc16x16var_sse2
+;unsigned int vpx_highbd_calc16x16var_sse2
;(
;    unsigned char * src_ptr,
;    int source_stride,
@@ -20,8 +20,8 @@
;    unsigned int * SSE,
;    int * Sum
;)
-global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
+global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
-sym(vp9_highbd_calc16x16var_sse2):
+sym(vpx_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
@@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2):
    ret


-;unsigned int vp9_highbd_calc8x8var_sse2
+;unsigned int vpx_highbd_calc8x8var_sse2
;(
;    unsigned char * src_ptr,
;    int source_stride,
@@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2):
;    unsigned int * SSE,
;    int * Sum
;)
-global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
+global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
-sym(vp9_highbd_calc8x8var_sse2):
+sym(vpx_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
245  vpx_dsp/x86/highbd_variance_sse2.c  Normal file
@@ -0,0 +1,245 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"

#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"

typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
                                        uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);

static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   int w, int h, uint32_t *sse, int *sum,
                                   high_variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int64_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
}

static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int64_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
}


#define HIGH_GET_VAR(S) \
void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                       const uint8_t *ref8, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
}

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR

#define VAR_FN(w, h, block_size, shift) \
uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                         vpx_highbd_calc##block_size##x##block_size##var_sse2, \
                         block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_10_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_12_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
}

VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN

unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride,
                                        unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                         sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                         sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}
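Note how the 10- and 12-bit helpers above rescale the accumulated sum and SSE back to an 8-bit-equivalent range (sum by 2 or 4 bits, SSE by 4 or 8 bits) before the usual variance formula is applied, which keeps the 32-bit *sse result in range. A small self-contained check of that scaling, using the same assumed ROUND_POWER_OF_TWO definition as in the earlier sketch:

#include <stdint.h>
#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  /* A single 12-bit difference of 4000 contributes 4000 to sum and
   * 16,000,000 to SSE.  Scaling back to an 8-bit range uses sum >> 4 and
   * SSE >> 8, matching highbd_12_variance_sse2() above. */
  const int64_t sum = 4000;
  const uint64_t sse = 4000ULL * 4000ULL;
  printf("sum8 = %lld, sse8 = %llu\n",
         (long long)ROUND_POWER_OF_TWO(sum, 4),
         (unsigned long long)ROUND_POWER_OF_TWO(sse, 8));
  return 0;
}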
93  vpx_dsp/x86/variance_avx2.c  Normal file
@@ -0,0 +1,93 @@
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_dsp_rtcd.h"

typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum);

static void variance_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(&src[src_stride * i + j], src_stride,
             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}


unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
                sse, &sum, vpx_get16x16var_avx2, 16);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;
  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse;
}

unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 11);
}
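All of these wrappers use the same identity: for an N-pixel block, variance = SSE - sum^2 / N, and because N = w * h is a power of two the division becomes a right shift by log2(w * h) (8 for 16x16, 10 for 32x32, 12 for 64x64). A plain-C reference sketch of that calculation follows; the function name is illustrative and not part of the library.

#include <stdint.h>

/* Reference (non-SIMD) variance: accumulate sum and SSE over the block,
 * then apply var = SSE - sum^2 / (w * h).  The ">> shift" in the SIMD
 * wrappers above is the same division, with w * h a power of two. */
static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      sum += diff;
      sse64 += (int64_t)diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sse64;
  return (unsigned int)(sse64 - ((sum * sum) / (w * h)));
}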
@@ -10,9 +10,9 @@

#include <immintrin.h>  // AVX2

-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"

-void vp9_get16x16var_avx2(const unsigned char *src_ptr,
+void vpx_get16x16var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
@@ -123,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr,
  }
}

-void vp9_get32x32var_avx2(const unsigned char *src_ptr,
+void vpx_get32x32var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
424  vpx_dsp/x86/variance_impl_mmx.asm  Normal file
@@ -0,0 +1,424 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8
    ; end prolog

    mov         rax, arg(0)            ; src_ptr
    mov         rcx, 16
    pxor        mm4, mm4

.NEXTROW:
    movq        mm0, [rax]
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]
    pmaddwd     mm0, mm0
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ja          .NEXTROW
    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vpx_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog

    pxor        mm5, mm5               ; Blank mm5 (sum of differences)
    pxor        mm6, mm6               ; Blank mm6 (constant zero)
    pxor        mm7, mm7               ; Blank mm7 (sum of squares)

    mov         rax, arg(0)            ; [src_ptr]  Load base addresses
    mov         rbx, arg(2)            ; [ref_ptr]
    movsxd      rcx, dword ptr arg(1)  ; [source_stride]
    movsxd      rdx, dword ptr arg(3)  ; [recon_stride]

    ; Row 1
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7


    ; Row 2
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 3
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 4
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 5
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    ; movq      mm4, [rbx + rdx]
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 6
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 7
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 8
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
    movq        QWORD PTR [rsp], mm7   ; copy back accumulated results into normal memory
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx               ; XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx               ; XXSum
    mov         rsi, arg(4)            ; SSE
    mov         rdi, arg(5)            ; Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax               ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void
;vpx_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog

    pxor        mm5, mm5               ; Blank mm5 (sum of differences)
    pxor        mm6, mm6               ; Blank mm6 (constant zero)
    pxor        mm7, mm7               ; Blank mm7 (sum of squares)

    mov         rax, arg(0)            ; [src_ptr]  Load base addresses
    mov         rbx, arg(2)            ; [ref_ptr]
    movsxd      rcx, dword ptr arg(1)  ; [source_stride]
    movsxd      rdx, dword ptr arg(3)  ; [recon_stride]

    ; Row 1
    movd        mm0, [rax]             ; Copy four bytes to mm0
    movd        mm1, [rbx]             ; Copy four bytes to mm1
    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    paddw       mm5, mm0               ; accumulate differences in mm5
    pmaddwd     mm0, mm0               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movd        mm1, [rbx]             ; Copy four bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7

    ; Row 2
    movd        mm0, [rax]             ; Copy four bytes to mm0
    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    paddw       mm5, mm0               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movd        mm1, [rbx]             ; Copy four bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7

    ; Row 3
    movd        mm0, [rax]             ; Copy four bytes to mm0
    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    paddw       mm5, mm0               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movd        mm1, [rbx]             ; Copy four bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7

    ; Row 4
    movd        mm0, [rax]             ; Copy four bytes to mm0

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0

    paddw       mm5, mm0               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    paddd       mm7, mm0               ; accumulate in mm7

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
    movq        QWORD PTR [rsp], mm7   ; copy back accumulated results into normal memory
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx               ; XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx               ; XXSum
    mov         rsi, arg(4)            ; SSE
    mov         rdi, arg(5)            ; Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax               ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
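The MMX routine above keeps a running 16-bit sum of pixel differences in mm5 and a running 32-bit sum of squared differences in mm7, one unrolled row at a time, then folds both registers down into *Sum and *SSE at the end. A hedged C equivalent of what each unrolled "Row N" block contributes (purely illustrative, not library code):

#include <stdint.h>

/* One row of vpx_get8x8var_mmx in scalar form: eight byte differences are
 * widened to 16 bits and added to the running sum, and their squares
 * (the pmaddwd pairing) are added to the running sum of squares. */
static void accumulate_row(const uint8_t *src, const uint8_t *ref,
                           int32_t *sum, uint32_t *sse) {
  int k;
  for (k = 0; k < 8; ++k) {
    const int diff = src[k] - ref[k];
    *sum += diff;
    *sse += (uint32_t)(diff * diff);
  }
}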
107  vpx_dsp/x86/variance_mmx.c  Normal file
@@ -0,0 +1,107 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"

extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
                              unsigned int *sse, int *sum);

unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
                                 const unsigned char *b, int b_stride,
                                 unsigned int *sse) {
  unsigned int var;
  int avg;

  vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
                                 const unsigned char *b, int b_stride,
                                 unsigned int *sse) {
  unsigned int var;
  int avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
  *sse = var;

  return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
                              const unsigned char *b, int b_stride,
                              unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3, var;
  int sum0, sum1, sum2, sum3;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse2, &sum2);
  vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
                    b + 8 * b_stride + 8, b_stride, &sse3, &sum3);

  var = sse0 + sse1 + sse2 + sse3;
  *sse = var;
  return var;
}

unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
                                   const unsigned char *b, int b_stride,
                                   unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3, var;
  int sum0, sum1, sum2, sum3, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse2, &sum2);
  vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
                    b + 8 * b_stride + 8, b_stride, &sse3, &sum3);

  var = sse0 + sse1 + sse2 + sse3;
  avg = sum0 + sum1 + sum2 + sum3;
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
  unsigned int sse0, sse1, var;
  int sum0, sum1, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);

  var = sse0 + sse1;
  avg = sum0 + sum1;
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
  unsigned int sse0, sse1, var;
  int sum0, sum1, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse1, &sum1);

  var = sse0 + sse1;
  avg = sum0 + sum1;
  *sse = var;

  return (var - (((unsigned int)avg * avg) >> 7));
}
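The larger MMX variances are composed from 8x8 sub-block calls because the kernel only handles 8-pixel rows; SSE and sum are simply additive across the quadrants, so only the final sum-squared correction depends on the full block size. A short sketch of that composition, mirroring vpx_variance16x16_mmx() above (the helper name is illustrative):

/* Combine four 8x8 sse/sum pairs into a 16x16 variance.  The quadrant
 * totals add; the normalization uses the whole block (256 pixels -> >> 8). */
static unsigned int combine_16x16(const unsigned int sse[4], const int sum[4],
                                  unsigned int *sse_out) {
  const unsigned int total_sse = sse[0] + sse[1] + sse[2] + sse[3];
  const int total_sum = sum[0] + sum[1] + sum[2] + sum[3];
  *sse_out = total_sse;
  return total_sse - (((unsigned int)total_sum * total_sum) >> 8);
}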
309
vpx_dsp/x86/variance_sse2.c
Normal file
309
vpx_dsp/x86/variance_sse2.c
Normal file
@ -0,0 +1,309 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <emmintrin.h> // SSE2
|
||||||
|
|
||||||
|
#include "./vpx_config.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
|
typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
|
||||||
|
const unsigned char *ref, int ref_stride,
|
||||||
|
unsigned int *sse, int *sum);
|
||||||
|
|
||||||
|
unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
|
||||||
|
__m128i vsum = _mm_setzero_si128();
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < 32; ++i) {
|
||||||
|
const __m128i v = _mm_loadu_si128((const __m128i *)src);
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
|
||||||
|
src += 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
||||||
|
return _mm_cvtsi128_si32(vsum);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define READ64(p, stride, i) \
|
||||||
|
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
|
||||||
|
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
|
||||||
|
|
||||||
|
static void get4x4var_sse2(const uint8_t *src, int src_stride,
|
||||||
|
const uint8_t *ref, int ref_stride,
|
||||||
|
unsigned int *sse, int *sum) {
|
||||||
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
|
||||||
|
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
|
||||||
|
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
|
||||||
|
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
|
||||||
|
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
|
||||||
|
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
|
||||||
|
|
||||||
|
// sum
|
||||||
|
__m128i vsum = _mm_add_epi16(diff0, diff1);
|
||||||
|
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
|
||||||
|
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
|
||||||
|
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
|
||||||
|
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
|
||||||
|
|
||||||
|
// sse
|
||||||
|
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
|
||||||
|
_mm_madd_epi16(diff1, diff1));
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
||||||
|
*sse = _mm_cvtsi128_si32(vsum);
|
||||||
|
}

void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride,
                        unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
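
/* Note on the final sum above: the row loop accumulates 32 differences per
 * 16-bit lane (two per row over 16 rows), so a lane holds at most
 * 32 * 255 = 8160.  After the two folding steps each remaining lane holds at
 * most 4 * 8160 = 32640, which still fits in int16, but folding once more
 * could reach 16 * 16 * 255 = 65280 and overflow; the last two lanes are
 * therefore extracted and added in 32-bit C arithmetic.  The 8x8 helper's
 * total (at most 8 * 8 * 255 = 16320) fits in int16, so it can fold all the
 * way down to one lane. */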

static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          getNxMvar_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
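
/* The wrapper above tiles a w x h block with block_size x block_size calls
 * and accumulates the per-tile sse/sum.  The callers below then apply the
 * usual identity variance = sse - sum^2 / N with N = w * h, implementing the
 * division as a right shift by log2(N); e.g. the 16x8 variant uses >> 7.
 * A sketch of that final step, with an illustrative name that is not part of
 * this commit: */
static unsigned int variance_final_sketch(unsigned int sse, int sum,
                                          int log2_n) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_n);
}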

unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vpx_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vpx_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}
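
/* Note the (int64_t) cast above: for a 32x32 block |sum| can reach
 * 32 * 32 * 255 = 261120, so sum * sum no longer fits in 32 bits and the
 * product must be formed in 64-bit arithmetic before the shift.  For 16x16
 * and smaller blocks (|sum| <= 65280) the unsigned 32-bit product still fits,
 * which is why those variants cast to unsigned int instead. */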

unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}
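
/* A usage sketch (the function name, buffers and strides here are
 * hypothetical, not part of this commit): computing the variance and SSE of
 * one 16x16 block. */
static unsigned int example_block_variance(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  // Returns sse - (sum of differences)^2 / 256; sse is also written out.
  return vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, &sse);
}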