From c32e0ecc592d12573199c992f0fb710b7785c5eb Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 24 Mar 2011 13:31:10 -0400 Subject: [PATCH 1/2] use asm_offsets with vp8_fast_quantize_b_sse2 on the same order as the regular quantize change: ~2% Change-Id: I5c9eec18e89ae7345dd96945cb740e6f349cee86 --- vp8/encoder/x86/quantize_sse2.asm | 139 +++++++++++++++---------- vp8/encoder/x86/quantize_x86.h | 4 + vp8/encoder/x86/x86_csystemdependent.c | 27 +---- 3 files changed, 90 insertions(+), 80 deletions(-) diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index e00faebd1..5e40dc7de 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -233,72 +233,97 @@ ZIGZAG_LOOP 15 pop rbp ret -; int vp8_fast_quantize_b_impl_sse2 | arg -; (short *coeff_ptr, | 0 -; short *qcoeff_ptr, | 1 -; short *dequant_ptr, | 2 -; short *inv_scan_order, | 3 -; short *round_ptr, | 4 -; short *quant_ptr, | 5 -; short *dqcoeff_ptr) | 6 +; void vp8_fast_quantize_b_sse2 | arg +; (BLOCK *b, | 0 +; BLOCKD *d) | 1 -global sym(vp8_fast_quantize_b_impl_sse2) -sym(vp8_fast_quantize_b_impl_sse2): +global sym(vp8_fast_quantize_b_sse2) +sym(vp8_fast_quantize_b_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - push rsi + GET_GOT rbx + +%if ABI_IS_32BIT push rdi + push rsi +%else + %ifidn __OUTPUT_FORMAT__,x64 + push rdi + push rsi + %else + ; these registers are used for passing arguments + %endif +%endif + ; end prolog - mov rdx, arg(0) ;coeff_ptr - mov rcx, arg(2) ;dequant_ptr - mov rdi, arg(4) ;round_ptr - mov rsi, arg(5) ;quant_ptr +%if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d +%else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif +%endif - movdqa xmm0, XMMWORD PTR[rdx] - movdqa xmm4, XMMWORD PTR[rdx + 16] + mov rax, [rdi + vp8_block_coeff] + mov rcx, [rdi + vp8_block_round] + mov rdx, [rdi + 
vp8_block_quant_fast] - movdqa xmm2, XMMWORD PTR[rdi] ;round lo - movdqa xmm3, XMMWORD PTR[rdi + 16] ;round hi + ; z = coeff + movdqa xmm0, [rax] + movdqa xmm4, [rax + 16] + ; dup z so we can save sz movdqa xmm1, xmm0 movdqa xmm5, xmm4 - psraw xmm0, 15 ;sign of z (aka sz) - psraw xmm4, 15 ;sign of z (aka sz) - - pxor xmm1, xmm0 - pxor xmm5, xmm4 - psubw xmm1, xmm0 ;x = abs(z) - psubw xmm5, xmm4 ;x = abs(z) - - paddw xmm1, xmm2 - paddw xmm5, xmm3 - - pmulhw xmm1, XMMWORD PTR[rsi] - pmulhw xmm5, XMMWORD PTR[rsi + 16] - - mov rdi, arg(1) ;qcoeff_ptr - mov rsi, arg(6) ;dqcoeff_ptr - - movdqa xmm2, XMMWORD PTR[rcx] - movdqa xmm3, XMMWORD PTR[rcx + 16] + ; sz = z >> 15 + psraw xmm0, 15 + psraw xmm4, 15 + ; x = abs(z) = (z ^ sz) - sz pxor xmm1, xmm0 pxor xmm5, xmm4 psubw xmm1, xmm0 psubw xmm5, xmm4 - movdqa XMMWORD PTR[rdi], xmm1 - movdqa XMMWORD PTR[rdi + 16], xmm5 + ; x += round + paddw xmm1, [rcx] + paddw xmm5, [rcx + 16] - pmullw xmm2, xmm1 - pmullw xmm3, xmm5 + mov rax, [rsi + vp8_blockd_qcoeff] + mov rcx, [rsi + vp8_blockd_dequant] + mov rdi, [rsi + vp8_blockd_dqcoeff] - mov rdi, arg(3) ;inv_scan_order + ; y = x * quant >> 16 + pmulhw xmm1, [rdx] + pmulhw xmm5, [rdx + 16] + + ; x = (y ^ sz) - sz + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + ; qcoeff = x + movdqa [rax], xmm1 + movdqa [rax + 16], xmm5 + + ; x * dequant + movdqa xmm2, xmm1 + movdqa xmm3, xmm5 + pmullw xmm2, [rcx] + pmullw xmm3, [rcx + 16] + + ; dqcoeff = x * dequant + movdqa [rdi], xmm2 + movdqa [rdi + 16], xmm3 - ; Start with 16 pxor xmm4, xmm4 ;clear all bits pcmpeqw xmm1, xmm4 pcmpeqw xmm5, xmm4 @@ -307,8 +332,8 @@ sym(vp8_fast_quantize_b_impl_sse2): pxor xmm1, xmm4 pxor xmm5, xmm4 - pand xmm1, XMMWORD PTR[rdi] - pand xmm5, XMMWORD PTR[rdi+16] + pand xmm1, [GLOBAL(inv_zig_zag)] + pand xmm5, [GLOBAL(inv_zig_zag + 16)] pmaxsw xmm1, xmm5 @@ -327,16 +352,22 @@ sym(vp8_fast_quantize_b_impl_sse2): pmaxsw xmm1, xmm5 - movd rax, xmm1 - and rax, 0xff - - movdqa XMMWORD 
PTR[rsi], xmm2 ;store dqcoeff - movdqa XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff + movd eax, xmm1 + and eax, 0xff + mov [rsi + vp8_blockd_eob], eax ; begin epilog - pop rdi +%if ABI_IS_32BIT pop rsi - UNSHADOW_ARGS + pop rdi +%else + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + %endif +%endif + + RESTORE_GOT pop rbp ret diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h index 6f54bec31..df2e0bc39 100644 --- a/vp8/encoder/x86/quantize_x86.h +++ b/vp8/encoder/x86/quantize_x86.h @@ -24,12 +24,16 @@ #if HAVE_SSE2 extern prototype_quantize_block(vp8_regular_quantize_b_sse2); +extern prototype_quantize_block(vp8_fast_quantize_b_sse2); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_quantize_quantb #define vp8_quantize_quantb vp8_regular_quantize_b_sse2 +#undef vp8_quantize_fastquantb +#define vp8_quantize_fastquantb vp8_fast_quantize_b_sse2 + #endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 2b6bd98eb..8bceecec4 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -81,31 +81,6 @@ static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) #endif #if HAVE_SSE2 -int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr, - short *qcoeff_ptr, short *dequant_ptr, - const short *inv_scan_order, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr); -static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) -{ - short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; - short *coeff_ptr = b->coeff; - short *round_ptr = b->round; - short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - - d->eob = vp8_fast_quantize_b_impl_sse2( - coeff_ptr, - qcoeff_ptr, - dequant_ptr, - vp8_default_inv_zig_zag, - round_ptr, - quant_ptr, - dqcoeff_ptr - ); -} - int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); static int 
mbblock_error_xmm(MACROBLOCK *mb, int dc) { @@ -294,7 +269,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2; cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2; - cpi->rtcd.quantize.fastquantb = fast_quantize_b_sse2; + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2; #if !(CONFIG_REALTIME_ONLY) cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2; From 02423b2e9219eab817235715ea8a89709f97a26e Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Wed, 6 Apr 2011 09:08:47 -0400 Subject: [PATCH 2/2] Minor modification A small change. Change-Id: I2e7726e58370a95d0319361f4f6ad231138d1328 --- vp8/encoder/mcomp.c | 12 ++++++------ vp8/encoder/rdopt.c | 7 ++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 37c30da14..716f514af 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1208,8 +1208,8 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro unsigned char *check_here; int thissad; - int ref_row = ref_mv->row >> 3; - int ref_col = ref_mv->col >> 3; + int ref_row = ref_mv->row; + int ref_col = ref_mv->col; int row_min = ref_row - distance; int row_max = ref_row + distance; @@ -1303,8 +1303,8 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er unsigned char *check_here; unsigned int thissad; - int ref_row = ref_mv->row >> 3; - int ref_col = ref_mv->col >> 3; + int ref_row = ref_mv->row; + int ref_col = ref_mv->col; int row_min = ref_row - distance; int row_max = ref_row + distance; @@ -1431,8 +1431,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er unsigned char *check_here; unsigned int thissad; - int ref_row = ref_mv->row >> 3; - int ref_col = ref_mv->col >> 3; + int ref_row = ref_mv->row; + int ref_col = ref_mv->col; int row_min = ref_row - distance; int row_max = ref_row + distance; diff --git a/vp8/encoder/rdopt.c 
b/vp8/encoder/rdopt.c index 908e97153..59d19e6fe 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -2145,10 +2145,6 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int { int thissme; int full_flag_thresh = 0; - MV full_mvp; - - full_mvp.row = d->bmi.mv.as_mv.row <<3; // use diamond search result as full search staring point - full_mvp.col = d->bmi.mv.as_mv.col <<3; // Update x->vector_range based on best vector found in step search search_range = MAXF(abs((mvp.row>>3) - d->bmi.mv.as_mv.row), abs((mvp.col>>3) - d->bmi.mv.as_mv.col)); @@ -2167,7 +2163,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int { int sadpb = x->sadperbit16 >> 2; - thissme = cpi->full_search_sad(x, b, d, &full_mvp, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); + /* use diamond search result as full search starting point */ + thissme = cpi->full_search_sad(x, b, d, &d->bmi.mv.as_mv, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); } // Barrier threshold to initiating full search