Merge "Re-design quantization process for 32x32 transform block"

2014-07-09 11:55:26 -07:00 · 2014-07-09 11:55:26 -07:00 · f6bf614b2f
commit f6bf614b2f
parent b84ee5a3d0 9ad1b9fc67
5 changed files with 68 additions and 19 deletions
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@ -717,6 +717,9 @@ specialize qw/vp9_subtract_block/, "$sse2_x86inc";
 add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";

+add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+
 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";

--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@ -320,10 +320,10 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
  switch (tx_size) {
    case TX_32X32:
      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
-                           scan_order->iscan);
+      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                            scan_order->iscan);
      break;
    case TX_16X16:
      vp9_fdct16x16(src_diff, coeff, diff_stride);
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@ -104,6 +104,49 @@ void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
  *eob_ptr = eob + 1;
 }

+// TODO(jingning) Refactor this file and combine functions with similar
+// operations.
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             int zbin_oq_value, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan) {
+  int i, eob = -1;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)zbin_oq_value;
+  (void)iscan;
+
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+
+  if (!skip_block) {
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp = 0;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+        tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
+        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      }
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                      int skip_block,
                      const int16_t *zbin_ptr, const int16_t *round_ptr,
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@ -253,6 +253,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
  }

  if (speed >= 5) {
+    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
    sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
        RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
    sf->max_partition_size = BLOCK_32X32;
@ -287,7 +288,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
    sf->mv.reduce_first_step_size = 1;
  }
  if (speed >= 7) {
-    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
    sf->mv.fullpel_search_step_param = 10;
    sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
    sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@ -234,21 +234,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  movifnidn                   quantq, quantmp
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-; TODO(jingning) to be continued with 32x32 quantization process
+%ifidn %1, fp_32x32
  pcmpeqw                         m5, m5
  psrlw                           m5, 15
-  paddw                           m0, m5
  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
 %endif
  mova                            m3, [r2q]                ; m3 = dequant
  mov                             r3, qcoeffmp
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
+%ifidn %1, fp_32x32
+  psllw                           m2, 1
 %endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
@ -275,18 +272,19 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  psignw                         m13, m10                  ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
 %endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9
  psignw                         m13, m10
+  psrlw                           m0, m3, 2
 %endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
@ -307,13 +305,17 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpeqw                         m7, m7
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
+  pcmpgtw                         m7, m6,  m0
+  pcmpgtw                        m12, m11, m0
  pmovmskb                        r6, m7
-  pmovmskb                        r2, m7
+  pmovmskb                        r2, m12
+
  or                              r6, r2
  jz .skip_iter
 %endif
+  pcmpeqw                         m7, m7
+
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
@ -322,13 +324,13 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  psignw                         m13, m10                  ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
 %endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
@ -349,7 +351,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  add                        ncoeffq, mmsize
  jl .ac_only_loop

-%ifidn %1, b_32x32
+%ifidn %1, fp_32x32
  jmp .accumulate_eob
 .skip_iter:
  mova        [qcoeffq+ncoeffq*2+ 0], m5
@ -397,3 +399,4 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

 INIT_XMM ssse3
 QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7