From 9ad1b9fc67921cfaf58ed85732ed139530edab4a Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Mon, 7 Jul 2014 12:08:40 -0700 Subject: [PATCH] Re-design quantization process for 32x32 transform block This commit enables a new quantization process for 32x32 2D-DCT transform coefficient blocks. It improves the compression performance of speed 5 by 1.4%. The overall compression gain of speed 5 due to the new quantization scheme is 4.7%. It also includes the SSSE3 implementation of the 32x32 quantization process. Change-Id: I0855b124fd6462418683f783f5bcb44255c9993b --- vp9/common/vp9_rtcd_defs.pl | 3 ++ vp9/encoder/vp9_encodemb.c | 8 ++-- vp9/encoder/vp9_quantize.c | 43 +++++++++++++++++++ vp9/encoder/vp9_speed_features.c | 2 +- vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 31 +++++++------ 5 files changed, 68 insertions(+), 19 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index f52dccbf4..b182f3fe3 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -717,6 +717,9 @@ specialize qw/vp9_subtract_block/, "$sse2_x86inc"; add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp/, "$ssse3_x86_64"; +add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; + add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, 
intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_b/, "$ssse3_x86_64"; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index eb9624dde..cd0191e0a 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -320,10 +320,10 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan_order->scan, + scan_order->iscan); break; case TX_16X16: vp9_fdct16x16(src_diff, coeff, diff_stride); diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 4964e0fd0..370e1ce77 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -104,6 +104,49 @@ void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count, *eob_ptr = eob + 1; } +// TODO(jingning) Refactor this file and combine functions with similar +// operations. 
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, eob = -1; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + + if (!skip_block) { + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + } + + if (tmp) + eob = i; + } + } + *eob_ptr = eob + 1; +} + void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 1eac02f99..9fe7b34af 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -250,6 +250,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, } if (speed >= 5) { + sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1; sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ? 
RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; sf->max_partition_size = BLOCK_32X32; @@ -282,7 +283,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->elevate_newmv_thresh = 2000; } if (speed >= 7) { - sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1; sf->mv.fullpel_search_step_param = 10; sf->lpf_pick = LPF_PICK_MINIMAL_LPF; sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ? diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 2d9f2b056..508e1d4f5 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -234,21 +234,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ movifnidn quantq, quantmp mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant -%ifidn %1, b_32x32 -; TODO(jingning) to be continued with 32x32 quantization process +%ifidn %1, fp_32x32 pcmpeqw m5, m5 psrlw m5, 15 - paddw m0, m5 paddw m1, m5 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif mova m3, [r2q] ; m3 = dequant mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 +%ifidn %1, fp_32x32 + psllw m2, 1 %endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob @@ -275,18 +272,19 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 ; m13 = reinsert sign mova [qcoeffq+ncoeffq*2+ 0], m8 mova [qcoeffq+ncoeffq*2+16], m13 -%ifidn %1, b_32x32 +%ifidn %1, fp_32x32 pabsw m8, m8 pabsw m13, m13 %endif pmullw m8, m3 ; dqc[i] = qc[i] * q punpckhqdq m3, m3 pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 +%ifidn %1, fp_32x32 psrlw m8, 1 psrlw m13, 1 psignw m8, m9 psignw m13, m10 + psrlw m0, m3, 2 %endif mova [dqcoeffq+ncoeffq*2+ 0], m8 mova [dqcoeffq+ncoeffq*2+16], m13 @@ -307,13 +305,17 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, 
skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) - pcmpeqw m7, m7 -%ifidn %1, b_32x32 +%ifidn %1, fp_32x32 + pcmpgtw m7, m6, m0 + pcmpgtw m12, m11, m0 pmovmskb r6, m7 - pmovmskb r2, m7 + pmovmskb r2, m12 + or r6, r2 jz .skip_iter %endif + pcmpeqw m7, m7 + paddsw m6, m1 ; m6 += round paddsw m11, m1 ; m11 += round pmulhw m14, m6, m2 ; m14 = m6*q>>16 @@ -322,13 +324,13 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 ; m13 = reinsert sign mova [qcoeffq+ncoeffq*2+ 0], m14 mova [qcoeffq+ncoeffq*2+16], m13 -%ifidn %1, b_32x32 +%ifidn %1, fp_32x32 pabsw m14, m14 pabsw m13, m13 %endif pmullw m14, m3 ; dqc[i] = qc[i] * q pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 +%ifidn %1, fp_32x32 psrlw m14, 1 psrlw m13, 1 psignw m14, m9 @@ -349,7 +351,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ add ncoeffq, mmsize jl .ac_only_loop -%ifidn %1, b_32x32 +%ifidn %1, fp_32x32 jmp .accumulate_eob .skip_iter: mova [qcoeffq+ncoeffq*2+ 0], m5 @@ -397,3 +399,4 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ INIT_XMM ssse3 QUANTIZE_FP fp, 7 +QUANTIZE_FP fp_32x32, 7