From c32e0ecc592d12573199c992f0fb710b7785c5eb Mon Sep 17 00:00:00 2001
From: Johann <johannkoenig@google.com>
Date: Thu, 24 Mar 2011 13:31:10 -0400
Subject: [PATCH 1/2] use asm_offsets with vp8_fast_quantize_b_sse2

Performance gain is on the same order as the regular quantize change: ~2%

Change-Id: I5c9eec18e89ae7345dd96945cb740e6f349cee86
---
 vp8/encoder/x86/quantize_sse2.asm      | 139 +++++++++++++++----------
 vp8/encoder/x86/quantize_x86.h         |   4 +
 vp8/encoder/x86/x86_csystemdependent.c |  27 +----
 3 files changed, 90 insertions(+), 80 deletions(-)

diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index e00faebd1..5e40dc7de 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -233,72 +233,97 @@ ZIGZAG_LOOP 15
     pop         rbp
     ret
 
-; int vp8_fast_quantize_b_impl_sse2 | arg
-;  (short *coeff_ptr,               |  0
-;   short *qcoeff_ptr,              |  1
-;   short *dequant_ptr,             |  2
-;   short *inv_scan_order,          |  3
-;   short *round_ptr,               |  4
-;   short *quant_ptr,               |  5
-;   short *dqcoeff_ptr)             |  6
+; void vp8_fast_quantize_b_sse2 | arg
+;  (BLOCK  *b,                  |  0
+;   BLOCKD *d)                  |  1
 
-global sym(vp8_fast_quantize_b_impl_sse2)
-sym(vp8_fast_quantize_b_impl_sse2):
+global sym(vp8_fast_quantize_b_sse2)
+sym(vp8_fast_quantize_b_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
+    GET_GOT     rbx
+
+%if ABI_IS_32BIT
     push        rdi
+    push        rsi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    push        rdi
+    push        rsi
+  %else
+    ; these registers are used for passing arguments
+  %endif
+%endif
+
     ; end prolog
 
-    mov         rdx, arg(0)                 ;coeff_ptr
-    mov         rcx, arg(2)                 ;dequant_ptr
-    mov         rdi, arg(4)                 ;round_ptr
-    mov         rsi, arg(5)                 ;quant_ptr
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
 
-    movdqa      xmm0, XMMWORD PTR[rdx]
-    movdqa      xmm4, XMMWORD PTR[rdx + 16]
+    mov         rax, [rdi + vp8_block_coeff]
+    mov         rcx, [rdi + vp8_block_round]
+    mov         rdx, [rdi + vp8_block_quant_fast]
 
-    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
-    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi
+    ; z = coeff
+    movdqa      xmm0, [rax]
+    movdqa      xmm4, [rax + 16]
 
+    ; dup z so we can save sz
     movdqa      xmm1, xmm0
     movdqa      xmm5, xmm4
 
-    psraw       xmm0, 15                    ;sign of z (aka sz)
-    psraw       xmm4, 15                    ;sign of z (aka sz)
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0                  ;x = abs(z)
-    psubw       xmm5, xmm4                  ;x = abs(z)
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    pmulhw      xmm1, XMMWORD PTR[rsi]
-    pmulhw      xmm5, XMMWORD PTR[rsi + 16]
-
-    mov         rdi, arg(1)                 ;qcoeff_ptr
-    mov         rsi, arg(6)                 ;dqcoeff_ptr
-
-    movdqa      xmm2, XMMWORD PTR[rcx]
-    movdqa      xmm3, XMMWORD PTR[rcx + 16]
+    ; sz = z >> 15
+    psraw       xmm0, 15
+    psraw       xmm4, 15
 
+    ; x = abs(z) = (z ^ sz) - sz
     pxor        xmm1, xmm0
     pxor        xmm5, xmm4
     psubw       xmm1, xmm0
     psubw       xmm5, xmm4
 
-    movdqa      XMMWORD PTR[rdi], xmm1
-    movdqa      XMMWORD PTR[rdi + 16], xmm5
+    ; x += round
+    paddw       xmm1, [rcx]
+    paddw       xmm5, [rcx + 16]
 
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
+    mov         rax, [rsi + vp8_blockd_qcoeff]
+    mov         rcx, [rsi + vp8_blockd_dequant]
+    mov         rdi, [rsi + vp8_blockd_dqcoeff]
 
-    mov         rdi, arg(3)                 ;inv_scan_order
+    ; y = x * quant >> 16
+    pmulhw      xmm1, [rdx]
+    pmulhw      xmm5, [rdx + 16]
+
+    ; x = (y ^ sz) - sz
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    ; qcoeff = x
+    movdqa      [rax], xmm1
+    movdqa      [rax + 16], xmm5
+
+    ; x * dequant
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm5
+    pmullw      xmm2, [rcx]
+    pmullw      xmm3, [rcx + 16]
+
+    ; dqcoeff = x * dequant
+    movdqa      [rdi], xmm2
+    movdqa      [rdi + 16], xmm3
 
-    ; Start with 16
     pxor        xmm4, xmm4                  ;clear all bits
     pcmpeqw     xmm1, xmm4
     pcmpeqw     xmm5, xmm4
@@ -307,8 +332,8 @@ sym(vp8_fast_quantize_b_impl_sse2):
     pxor        xmm1, xmm4
     pxor        xmm5, xmm4
 
-    pand        xmm1, XMMWORD PTR[rdi]
-    pand        xmm5, XMMWORD PTR[rdi+16]
+    pand        xmm1, [GLOBAL(inv_zig_zag)]
+    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
 
     pmaxsw      xmm1, xmm5
 
@@ -327,16 +352,22 @@ sym(vp8_fast_quantize_b_impl_sse2):
 
     pmaxsw      xmm1, xmm5
 
-    movd        rax, xmm1
-    and         rax, 0xff
-
-    movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff
-    movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff
+    movd        eax, xmm1
+    and         eax, 0xff
+    mov         [rsi + vp8_blockd_eob], eax
 
     ; begin epilog
-    pop         rdi
+%if ABI_IS_32BIT
     pop         rsi
-    UNSHADOW_ARGS
+    pop         rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+  %endif
+%endif
+
+    RESTORE_GOT
     pop         rbp
     ret
 
diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
index 6f54bec31..df2e0bc39 100644
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h
@@ -24,12 +24,16 @@
 
 #if HAVE_SSE2
 extern prototype_quantize_block(vp8_regular_quantize_b_sse2);
+extern prototype_quantize_block(vp8_fast_quantize_b_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 
 #undef vp8_quantize_quantb
 #define vp8_quantize_quantb vp8_regular_quantize_b_sse2
 
+#undef vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_sse2
+
 #endif
 
 #endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 2b6bd98eb..8bceecec4 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -81,31 +81,6 @@ static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 #endif
 
 #if HAVE_SSE2
-int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
-                                 short *qcoeff_ptr, short *dequant_ptr,
-                                 const short *inv_scan_order, short *round_ptr,
-                                 short *quant_ptr, short *dqcoeff_ptr);
-static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
-{
-    short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
-    short *coeff_ptr   = b->coeff;
-    short *round_ptr   = b->round;
-    short *quant_ptr   = b->quant_fast;
-    short *qcoeff_ptr  = d->qcoeff;
-    short *dqcoeff_ptr = d->dqcoeff;
-    short *dequant_ptr = d->dequant;
-
-    d->eob = vp8_fast_quantize_b_impl_sse2(
-                 coeff_ptr,
-                 qcoeff_ptr,
-                 dequant_ptr,
-                 vp8_default_inv_zig_zag,
-                 round_ptr,
-                 quant_ptr,
-                 dqcoeff_ptr
-             );
-}
-
 int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
 static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
 {
@@ -294,7 +269,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_sse2;
 
         cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b_sse2;
-        cpi->rtcd.quantize.fastquantb            = fast_quantize_b_sse2;
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
 
 #if !(CONFIG_REALTIME_ONLY)
         cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;

From 02423b2e9219eab817235715ea8a89709f97a26e Mon Sep 17 00:00:00 2001
From: Yunqing Wang <yunqingwang@google.com>
Date: Wed, 6 Apr 2011 09:08:47 -0400
Subject: [PATCH 2/2] Minor modification

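Pass the diamond search result directly to the full search as its
starting point instead of building a temporary copy shifted left by 3,
and drop the matching ">> 3" on ref_mv in vp8_full_search_sad,
vp8_full_search_sadx3 and vp8_full_search_sadx8.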

Change-Id: I2e7726e58370a95d0319361f4f6ad231138d1328
---
 vp8/encoder/mcomp.c | 12 ++++++------
 vp8/encoder/rdopt.c |  7 ++-----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 37c30da14..716f514af 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1208,8 +1208,8 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
     unsigned char *check_here;
     int thissad;
 
-    int ref_row = ref_mv->row >> 3;
-    int ref_col = ref_mv->col >> 3;
+    int ref_row = ref_mv->row;
+    int ref_col = ref_mv->col;
 
     int row_min = ref_row - distance;
     int row_max = ref_row + distance;
@@ -1303,8 +1303,8 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
     unsigned char *check_here;
     unsigned int thissad;
 
-    int ref_row = ref_mv->row >> 3;
-    int ref_col = ref_mv->col >> 3;
+    int ref_row = ref_mv->row;
+    int ref_col = ref_mv->col;
 
     int row_min = ref_row - distance;
     int row_max = ref_row + distance;
@@ -1431,8 +1431,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
     unsigned char *check_here;
     unsigned int thissad;
 
-    int ref_row = ref_mv->row >> 3;
-    int ref_col = ref_mv->col >> 3;
+    int ref_row = ref_mv->row;
+    int ref_col = ref_mv->col;
 
     int row_min = ref_row - distance;
     int row_max = ref_row + distance;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 908e97153..59d19e6fe 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2145,10 +2145,6 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                 {
                     int thissme;
                     int full_flag_thresh = 0;
-                    MV full_mvp;
-
-                    full_mvp.row = d->bmi.mv.as_mv.row <<3;    // use diamond search result as full search staring point
-                    full_mvp.col = d->bmi.mv.as_mv.col <<3;
 
                     // Update x->vector_range based on best vector found in step search
                     search_range = MAXF(abs((mvp.row>>3) - d->bmi.mv.as_mv.row), abs((mvp.col>>3) - d->bmi.mv.as_mv.col));
@@ -2167,7 +2163,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 
                     {
                         int sadpb = x->sadperbit16 >> 2;
-                        thissme = cpi->full_search_sad(x, b, d, &full_mvp, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv);
+                        /* use diamond search result as full search starting point */
+                        thissme = cpi->full_search_sad(x, b, d, &d->bmi.mv.as_mv, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv);
                     }
 
                     // Barrier threshold to initiating full search