Merge remote branch 'internal/upstream' into HEAD

2011-06-07 00:05:04 -04:00
parent 84f5b14b0e 03973017a7
commit d13cfba344
23 changed files with 340 additions and 30 deletions
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -121,8 +121,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_neon;
        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_neon;

-        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
+        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+        cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;*/
        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
+        cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_neon;
    }
 #endif

--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -27,8 +27,11 @@
 |vp8_mse16x16_armv6| PROC

    push    {r4-r9, lr}
-    mov     r12, #16            ; set loop counter to 16 (=block height)

+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     r4, #0              ; initialize sse = 0

 loop
@@ -39,8 +42,10 @@ loop
    mov     lr, #0              ; constant zero

    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
--- a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
@@ -24,6 +24,12 @@
 ; stack max_sad (not used)
 |vp8_sad16x16_armv6| PROC
    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
    mov     r4, #0              ; sad = 0;
    mov     r5, #8              ; loop count

@@ -45,6 +51,9 @@ loop
    add     r0, r0, r1          ; set src pointer to next row
    add     r2, r2, r3          ; set dst pointer to next row

+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels

@@ -70,6 +79,9 @@ loop
    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels

+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
    subs    r5, r5, #1          ; decrement loop counter
    add     r4, r4, r8          ; add partial sad values

--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance16x16_armv6| PROC

    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
    mov     r8, #0              ; initialize sum = 0
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)
@@ -37,8 +41,10 @@ loop
    mov     lr, #0              ; constant zero

    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
--- a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
@@ -23,6 +23,10 @@
 |vp8_variance8x8_armv6| PROC

    push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
    mov     r12, #8             ; set loop counter to 8 (=block height)
    mov     r4, #0              ; initialize sum = 0
    mov     r5, #0              ; initialize sse = 0
@@ -35,8 +39,10 @@ loop
    mov     lr, #0              ; constant zero

    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_h_armv6| PROC

    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
@@ -42,8 +46,10 @@ loop
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_hv_armv6| PROC

    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
@@ -53,8 +57,10 @@ loop
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -25,6 +25,10 @@
 |vp8_variance_halfpixvar16x16_v_armv6| PROC

    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
@@ -43,8 +47,10 @@ loop
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -10,6 +10,7 @@


    EXPORT  |vp8_fast_quantize_b_neon|
+    EXPORT  |vp8_fast_quantize_b_pair_neon|

    INCLUDE asm_enc_offsets.asm

@@ -19,6 +20,138 @@

    AREA ||.text||, CODE, READONLY, ALIGN=4

+;void vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+|vp8_fast_quantize_b_pair_neon| PROC
+    ; r0=b1 r1=b2 r2=d1 r3=d2; round/quant_fast/dequant are read from b1/d1 only, used for both
+    stmfd           sp!, {r4-r9}        ; save r4-r9, restored before return
+    vstmdb          sp!, {q4-q7}        ; save q4-q7, restored before return
+
+    ldr             r4, [r0, #vp8_block_coeff]       ; r4 = b1 coeff pointer
+    ldr             r5, [r0, #vp8_block_quant_fast]  ; r5 = b1 quant_fast pointer
+    ldr             r6, [r0, #vp8_block_round]       ; r6 = b1 round pointer
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z (16 x 16-bit coefficients of b1)
+
+    ldr             r7, [r2, #vp8_blockd_qcoeff]     ; r7 = d1 qcoeff pointer
+
+    vabs.s16        q4, q0              ; calculate x = abs(z)
+    vabs.s16        q5, q1
+
+    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+    vshr.s16        q2, q0, #15         ; sz
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
+
+    ldr             r4, [r1, #vp8_block_coeff]       ; r4 = b2 coeff pointer
+
+    vadd.s16        q4, q6              ; x + Round
+    vadd.s16        q5, q7
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z2 (16 x 16-bit coefficients of b2)
+
+    vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16     q5, q9
+
+    vabs.s16        q10, q0             ; calculate x2 = abs(z2)
+    vabs.s16        q11, q1
+    vshr.s16        q12, q0, #15        ; sz2
+    vshr.s16        q13, q1, #15
+
+    ;modify data to have its original sign
+    veor.s16        q4, q2              ; y^sz
+    veor.s16        q5, q3
+
+    vadd.s16        q10, q6             ; x2 + Round (same round vector as block 1)
+    vadd.s16        q11, q7
+
+    ldr             r8, [r2, #vp8_blockd_dequant]    ; r8 = d1 dequant pointer
+
+    vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z2)) * Quant) >> 16
+    vqdmulh.s16     q11, q9
+
+    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
+    vshr.s16        q5, #1
+
+    vld1.s16        {q6, q7}, [r8@128]  ; load dequant_ptr [0-15]; q6/q7 no longer hold round
+
+    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16        q5, q3
+
+    vshr.s16        q10, #1             ; right shift 1 after vqdmulh
+    vshr.s16        q11, #1
+
+    ldr             r9, [r2, #vp8_blockd_dqcoeff]    ; r9 = d1 dqcoeff pointer
+
+    veor.s16        q10, q12            ; y2^sz2
+    veor.s16        q11, q13
+
+    vst1.s16        {q4, q5}, [r7]      ; store: d1 qcoeff = x1
+
+
+    vsub.s16        q10, q12            ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
+    vsub.s16        q11, q13
+
+    ldr             r6, [r3, #vp8_blockd_qcoeff]     ; r6 = d2 qcoeff pointer
+
+    vmul.s16        q2, q6, q4          ; x1 * Dequant
+    vmul.s16        q3, q7, q5
+
+    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table (overwrites b1 pointer)
+
+    vceq.s16        q8, q8              ; set q8 to all 1 (quant values no longer needed)
+
+    vst1.s16        {q10, q11}, [r6]    ; store: d2 qcoeff = x2
+
+    vmul.s16        q12, q6, q10        ; x2 * Dequant
+    vmul.s16        q13, q7, q11
+
+    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order (replaces dequant in q6/q7)
+
+    vtst.16         q14, q4, q8         ; now find eob of block 1
+    vtst.16         q15, q5, q8         ; non-zero element is set to all 1
+
+    vst1.s16        {q2, q3}, [r9]      ; store d1 dqcoeff = x1 * Dequant
+
+    ldr             r7, [r3, #vp8_blockd_dqcoeff]    ; r7 = d2 dqcoeff pointer
+
+    vand            q0, q6, q14         ; keep scan values only where x1 is non-zero
+    vand            q1, q7, q15
+
+    vst1.s16        {q12, q13}, [r7]    ; store d2 dqcoeff = x2 * Dequant
+
+    vtst.16         q2, q10, q8         ; now find eob of block 2
+    vtst.16         q3, q11, q8         ; non-zero element is set to all 1
+
+    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
+
+    vand            q10, q6, q2         ; keep scan values only where x2 is non-zero
+    vand            q11, q7, q3
+    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
+
+    vmax.u16        d0, d0, d1          ; block 1: 8 candidates -> 4
+    vmax.u16        d20, d20, d21       ; block 2: 8 candidates -> 4
+    vmovl.u16       q0, d0              ; widen to 32 bits so eob can be stored as a word
+    vmovl.u16       q10, d20
+
+
+    vmax.u32        d0, d0, d1          ; 4 candidates -> 2
+    vmax.u32        d20, d20, d21
+    vpmax.u32       d0, d0, d0          ; pairwise max: lane 0 holds the overall maximum
+    vpmax.u32       d20, d20, d20
+
+    add             r4, r2, #vp8_blockd_eob          ; r4 = address of d1 eob
+    add             r5, r3, #vp8_blockd_eob          ; r5 = address of d2 eob
+
+    vst1.32         {d0[0]}, [r4@32]    ; store d1 eob (32-bit)
+    vst1.32         {d20[0]}, [r5@32]   ; store d2 eob (32-bit)
+
+    vldmia          sp!, {q4-q7}        ; restore q4-q7
+    ldmfd           sp!, {r4-r9}        ; restore r4-r9
+    bx              lr
+
+    ENDP

 ;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 |vp8_fast_quantize_b_neon| PROC
@@ -97,10 +230,8 @@

    vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant

-    vmov.32         r0, d0[0]           ; this instruction takes 1+13 cycles
-                                        ; if we have vfp, we could use
-                                        ; vstr      s0, [r1, #vp8_blockd_eob]
-    str             r0, [r1, #vp8_blockd_eob]
+    add             r4, r1, #vp8_blockd_eob
+    vst1.32         {d0[0]}, [r4@32]

    ldmfd           sp!, {r4-r7}
    bx              lr
--- a/vp8/encoder/arm/quantize_arm.c
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/entropy.h"
+
+
+#if HAVE_ARMV7
+
+/* The vp8_quantize_mbX functions here differ from the corresponding ones in
+ * quantize.c only in that they use the quantize_b_pair function pointer
+ * instead of the regular quantize_b function pointer. */
+void vp8_quantize_mby_neon(MACROBLOCK *x)
+{
+    /* Blocks 0..15 go through the pair quantizer two at a time; block 24
+     * follows through quantize_b unless the mode is B_PRED or SPLITMV. */
+    const int mode = x->e_mbd.mode_info_context->mbmi.mode;
+    const int skip_2nd_order = (mode == B_PRED || mode == SPLITMV);
+    int blk;
+    for (blk = 0; blk != 16; blk += 2)
+        x->quantize_b_pair(x->block + blk, x->block + blk + 1,
+                           x->e_mbd.block + blk, x->e_mbd.block + blk + 1);
+    if (!skip_2nd_order)
+        x->quantize_b(x->block + 24, x->e_mbd.block + 24);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x)
+{
+    /* Blocks 0..23 go through the pair quantizer two at a time; block 24
+     * follows through quantize_b unless the mode is B_PRED or SPLITMV. */
+    const int mode = x->e_mbd.mode_info_context->mbmi.mode;
+    const int skip_2nd_order = (mode == B_PRED || mode == SPLITMV);
+    int blk;
+    for (blk = 0; blk != 24; blk += 2)
+        x->quantize_b_pair(x->block + blk, x->block + blk + 1,
+                           x->e_mbd.block + blk, x->e_mbd.block + blk + 1);
+    if (!skip_2nd_order)
+        x->quantize_b(x->block + 24, x->e_mbd.block + 24);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x)
+{
+    /* Blocks 16..23 go through the pair quantizer two at a time. */
+    int blk;
+    for (blk = 16; blk != 24; blk += 2)
+        x->quantize_b_pair(x->block + blk, x->block + blk + 1,
+                           x->e_mbd.block + blk, x->e_mbd.block + blk + 1);
+}
+
+#endif /* HAVE_ARMV7 */
--- a/vp8/encoder/arm/quantize_arm.h
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -16,8 +16,10 @@

 extern prototype_quantize_block(vp8_fast_quantize_b_armv6);

+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+#endif

 #endif /* HAVE_ARMV6 */

@@ -25,10 +27,25 @@ extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
 #if HAVE_ARMV7

 extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);

+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_neon

+#undef  vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
+
+#undef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_neon
+
+#undef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
+
+#undef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_neon
+#endif
+
 #endif /* HAVE_ARMV7 */

 #endif
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -123,6 +123,7 @@ typedef struct
    void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
    void (*short_walsh4x4)(short *input, short *output, int pitch);
    void (*quantize_b)(BLOCK *b, BLOCKD *d);
+    void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);

 } MACROBLOCK;

--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1266,8 +1266,10 @@ int vp8cx_encode_inter_macroblock
        /* Are we using the fast quantizer for the mode selection? */
        if(cpi->sf.use_fastquant_for_pick)
        {
-            cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
-                                                 fastquantb);
+            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                      fastquantb);
+            cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                      fastquantb_pair);

            /* the fast quantizer does not use zbin_extra, so
             * do not recalculate */
@@ -1279,7 +1281,10 @@ int vp8cx_encode_inter_macroblock
        /* switch back to the regular quantizer for the encode */
        if (cpi->sf.improved_quant)
        {
-            cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                      quantb);
+            cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                      quantb_pair);
        }

        /* restore cpi->zbin_mode_boost_enabled */
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -314,6 +314,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
    z->vp8_short_fdct8x4     = x->vp8_short_fdct8x4;
    z->short_walsh4x4    = x->short_walsh4x4;
    z->quantize_b        = x->quantize_b;
+    z->quantize_b_pair   = x->quantize_b_pair;
    z->optimize          = x->optimize;

    /*
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -17,8 +17,6 @@
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
 void vp8_arch_arm_encoder_init(VP8_COMP *cpi);

-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
 void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
 extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);

@@ -88,7 +86,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;

    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;
    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
+    cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_c;
    cpi->rtcd.search.full_search             = vp8_full_search_sad;
    cpi->rtcd.search.refining_search         = vp8_refining_search_sad;
    cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -842,7 +842,6 @@ int vp8_hex_search
    int_mv *best_mv,
    int search_param,
    int sad_per_bit,
-    int *num00,
    const vp8_variance_fn_ptr_t *vfp,
    int *mvsadcost[2],
    int *mvcost[2],
@@ -996,12 +995,8 @@ cal_neighbors:

    best_mv->as_mv.row = br;
    best_mv->as_mv.col = bc;
-    this_mv.as_mv.row = br<<3;
-    this_mv.as_mv.col = bc<<3;

-    this_offset = (unsigned char *)(*(d->base_pre) + d->pre + (br * (in_what_stride)) + bc);
-    return vfp->vf(what, what_stride, this_offset, in_what_stride, &bestsad)
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit) ;
+    return bestsad;
 }
 #undef CHECK_BOUNDS
 #undef CHECK_POINT
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -40,7 +40,6 @@ extern int vp8_hex_search
    int_mv *best_mv,
    int search_param,
    int error_per_bit,
-    int *num00,
    const vp8_variance_fn_ptr_t *vf,
    int *mvsadcost[2],
    int *mvcost[2],
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1266,11 +1266,17 @@ void vp8_set_speed_features(VP8_COMP *cpi)

    if (cpi->sf.improved_quant)
    {
-        cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+        cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  quantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  quantb_pair);
    }
    else
    {
-        cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+        cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  fastquantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  fastquantb_pair);
    }
    if (cpi->sf.improved_quant != last_improved_quant)
        vp8cx_init_quantizer(cpi);
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -762,7 +762,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
            if (cpi->sf.search_method == HEX)
            {
                bestsme = vp8_hex_search(x, b, d, &mvp, &d->bmi.mv, step_param,
-                                      sadpb, &num00, &cpi->fn_ptr[BLOCK_16X16],
+                                      sadpb, &cpi->fn_ptr[BLOCK_16X16],
                                      x->mvsadcost, x->mvcost, &best_ref_mv);
                mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
            }
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -269,7 +269,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)

 #endif

-void vp8_quantize_mby(MACROBLOCK *x)
+void vp8_quantize_mby_c(MACROBLOCK *x)
 {
    int i;
    int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -282,7 +282,7 @@ void vp8_quantize_mby(MACROBLOCK *x)
        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
 }

-void vp8_quantize_mb(MACROBLOCK *x)
+void vp8_quantize_mb_c(MACROBLOCK *x)
 {
    int i;
    int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -293,7 +293,7 @@ void vp8_quantize_mb(MACROBLOCK *x)
 }


-void vp8_quantize_mbuv(MACROBLOCK *x)
+void vp8_quantize_mbuv_c(MACROBLOCK *x)
 {
    int i;

@@ -301,6 +301,22 @@ void vp8_quantize_mbuv(MACROBLOCK *x)
        x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
 }

+/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
+ * one of these two C functions if no corresponding optimized routine is
+ * available. The NEON-optimized version currently implements the fast
+ * quantization for a pair of blocks. */
+void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+    vp8_regular_quantize_b(b1, d1);  /* first block of the pair: b1 into d1 */
+    vp8_regular_quantize_b(b2, d2);  /* second block, a separate call: b2 into d2 */
+}
+
+void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+    vp8_fast_quantize_b_c(b1, d1);   /* first block of the pair: b1 into d1 */
+    vp8_fast_quantize_b_c(b2, d2);   /* second block, a separate call: b2 into d2 */
+}
+

 static const int qrounding_factors[129] =
 {
@@ -715,3 +731,4 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
        vp8cx_init_quantizer(cpi);

 }
+
--- a/vp8/encoder/quantize.h
+++ b/vp8/encoder/quantize.h
@@ -17,6 +17,11 @@
 #define prototype_quantize_block(sym) \
    void (sym)(BLOCK *b,BLOCKD *d)

+#define prototype_quantize_block_pair(sym) \
+    void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+
+#define prototype_quantize_mb(sym) \
+    void (sym)(MACROBLOCK *x)

 #if ARCH_X86 || ARCH_X86_64
 #include "x86/quantize_x86.h"
@@ -31,17 +36,43 @@
 #endif
 extern prototype_quantize_block(vp8_quantize_quantb);

+#ifndef vp8_quantize_quantb_pair
+#define vp8_quantize_quantb_pair vp8_regular_quantize_b_pair
+#endif
+extern prototype_quantize_block_pair(vp8_quantize_quantb_pair);
+
 #ifndef vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_c
 #endif
 extern prototype_quantize_block(vp8_quantize_fastquantb);

+#ifndef vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_c
+#endif
+extern prototype_quantize_block_pair(vp8_quantize_fastquantb_pair);
+
 typedef struct
 {
    prototype_quantize_block(*quantb);
+    prototype_quantize_block_pair(*quantb_pair);
    prototype_quantize_block(*fastquantb);
+    prototype_quantize_block_pair(*fastquantb_pair);
 } vp8_quantize_rtcd_vtable_t;

+#ifndef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mb);
+
+#ifndef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mbuv);
+
+#ifndef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mby);

 #if CONFIG_RUNTIME_CPU_DETECT
 #define QUANTIZE_INVOKE(ctx,fn) (ctx)->fn
@@ -51,10 +82,6 @@ typedef struct

 extern void vp8_strict_quantize_b(BLOCK *b,BLOCKD *d);

-extern void vp8_quantize_mb(MACROBLOCK *x);
-extern void vp8_quantize_mbuv(MACROBLOCK *x);
-extern void vp8_quantize_mby(MACROBLOCK *x);
-
 struct VP8_COMP;
 extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
 extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -153,7 +153,6 @@ static int vp8_temporal_filter_find_matching_mb_c
    int further_steps;
    int sadpb = x->sadperbit16;
    int bestsme = INT_MAX;
-    int num00 = 0;

    BLOCK *b = &x->block[0];
    BLOCKD *d = &x->e_mbd.block[0];
@@ -201,7 +200,7 @@ static int vp8_temporal_filter_find_matching_mb_c
        &best_ref_mv1, &d->bmi.mv,
        step_param,
        sadpb,
-        &num00, &cpi->fn_ptr[BLOCK_16X16],
+        &cpi->fn_ptr[BLOCK_16X16],
        mvsadcost, mvcost, &best_ref_mv1);

 #if ALT_REF_SUBPEL_ENABLED
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -15,6 +15,7 @@
 # encoder
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c

+VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/dct_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.c