From bb6bcbccda296e97bccad8a871dd60653ee68194 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 11 Feb 2011 11:09:00 -0500 Subject: [PATCH 1/5] remove assembly detokenizer hasn't been kept up to date. remove it to avoid confusion. Change-Id: I52ffde19b59fec5c7a381299ca2e85cb38330be7 --- configure | 3 - vp8/decoder/arm/detokenize.asm | 320 ------------------------------- vp8/decoder/arm/detokenize_arm.h | 22 --- vp8/decoder/detokenize.c | 61 ------ vp8/decoder/detokenize.h | 4 - vp8/decoder/onyxd_if.c | 3 - vp8/vp8dx_arm.mk | 1 - 7 files changed, 414 deletions(-) delete mode 100644 vp8/decoder/arm/detokenize.asm delete mode 100644 vp8/decoder/arm/detokenize_arm.h diff --git a/configure b/configure index ed1990607..0f55de3af 100755 --- a/configure +++ b/configure @@ -40,7 +40,6 @@ Advanced options: ${toggle_runtime_cpu_detect} runtime cpu detection ${toggle_shared} shared library support ${toggle_small} favor smaller size over speed - ${toggle_arm_asm_detok} assembly version of the detokenizer (ARM platforms only) ${toggle_postproc_visualizer} macro block / block level visualizers Codecs: @@ -255,7 +254,6 @@ CONFIG_LIST=" realtime_only shared small - arm_asm_detok postproc_visualizer os_support " @@ -296,7 +294,6 @@ CMDLINE_SELECT=" realtime_only shared small - arm_asm_detok postproc_visualizer " diff --git a/vp8/decoder/arm/detokenize.asm b/vp8/decoder/arm/detokenize.asm deleted file mode 100644 index 0c164f191..000000000 --- a/vp8/decoder/arm/detokenize.asm +++ /dev/null @@ -1,320 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_decode_mb_tokens_v6| - - AREA |.text|, CODE, READONLY ; name this block of code - - INCLUDE asm_dec_offsets.asm - -l_qcoeff EQU 0 -l_i EQU 4 -l_type EQU 8 -l_stop EQU 12 -l_c EQU 16 -l_l_ptr EQU 20 -l_a_ptr EQU 24 -l_bc EQU 28 -l_coef_ptr EQU 32 -l_stacksize EQU 64 - - -;; constant offsets -- these should be created at build time -c_block2above_offset EQU 25 -c_entropy_nodes EQU 11 -c_dct_eob_token EQU 11 - -|vp8_decode_mb_tokens_v6| PROC - stmdb sp!, {r4 - r11, lr} - sub sp, sp, #l_stacksize - mov r7, r1 ; type - mov r9, r0 ; detoken - - ldr r1, [r9, #detok_current_bc] - ldr r0, [r9, #detok_qcoeff_start_ptr] - mov r11, #0 ; i - mov r3, #16 ; stop - - cmp r7, #1 ; type ?= 1 - addeq r11, r11, #24 ; i = 24 - addeq r3, r3, #8 ; stop = 24 - addeq r0, r0, #3, 24 ; qcoefptr += 24*16 - - str r0, [sp, #l_qcoeff] - str r11, [sp, #l_i] - str r7, [sp, #l_type] - str r3, [sp, #l_stop] - str r1, [sp, #l_bc] - - add lr, r9, r7, lsl #2 ; detoken + type*4 - - ldr r8, [r1, #bool_decoder_user_buffer] - - ldr r10, [lr, #detok_coef_probs] - ldr r5, [r1, #bool_decoder_count] - ldr r6, [r1, #bool_decoder_range] - ldr r4, [r1, #bool_decoder_value] - - str r10, [sp, #l_coef_ptr] - -BLOCK_LOOP - ldr r3, [r9, #detok_ptr_block2leftabove] - ldr r1, [r9, #detok_L] - ldr r2, [r9, #detok_A] - ldrb r12, [r3, r11]! ; block2left[i] - ldrb r3, [r3, #c_block2above_offset]; block2above[i] - - cmp r7, #0 ; c = !type - moveq r7, #1 - movne r7, #0 - - ldrb r0, [r1, r12]! ; *(L += block2left[i]) - ldrb r3, [r2, r3]! 
; *(A += block2above[i]) - mov lr, #c_entropy_nodes ; ENTROPY_NODES = 11 - -; VP8_COMBINEENTROPYCONTETEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0) - cmp r0, #0 ; *l ?= 0 - movne r0, #1 - cmp r3, #0 ; *a ?= 0 - addne r0, r0, #1 ; t - - str r1, [sp, #l_l_ptr] ; save &l - str r2, [sp, #l_a_ptr] ; save &a - smlabb r0, r0, lr, r10 ; Prob = coef_probs + (t * ENTROPY_NODES) - mov r1, #0 ; t = 0 - str r7, [sp, #l_c] - - ;align 4 -COEFF_LOOP - ldr r3, [r9, #detok_ptr_coef_bands_x] - ldr lr, [r9, #detok_coef_tree_ptr] - ;STALL - ldrb r3, [r3, r7] ; coef_bands_x[c] - ;STALL - ;STALL - add r0, r0, r3 ; Prob += coef_bands_x[c] - -get_token_loop - ldrb r2, [r0, +r1, asr #1] ; Prob[t >> 1] - mov r3, r6, lsl #8 ; range << 8 - sub r3, r3, #256 ; (range << 8) - (1 << 8) - mov r10, #1 ; 1 - - smlawb r2, r3, r2, r10 ; split = 1 + (((range-1) * probability) >> 8) - - ldrb r12, [r8] ; load cx data byte in stall slot : r8 = bufptr - ;++ - - subs r3, r4, r2, lsl #24 ; value-(split<<24): used later to calculate shift for NORMALIZE - addhs r1, r1, #1 ; t += 1 - movhs r4, r3 ; value -= bigsplit (split << 24) - subhs r2, r6, r2 ; range -= split - ; movlo r6, r2 ; range = split - - ldrsb r1, [lr, r1] ; t = onyx_coef_tree_ptr[t] - -; NORMALIZE - clz r3, r2 ; vp8dx_bitreader_norm[range] + 24 - sub r3, r3, #24 ; vp8dx_bitreader_norm[range] - subs r5, r5, r3 ; count -= shift - mov r6, r2, lsl r3 ; range <<= shift - mov r4, r4, lsl r3 ; value <<= shift - -; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16 - addle r5, r5, #8 ; count += 8 - rsble r3, r5, #24 ; 24 - count - addle r8, r8, #1 ; bufptr++ - orrle r4, r4, r12, lsl r3 ; value |= *bufptr << shift + 16 - - cmp r1, #0 ; t ?= 0 - bgt get_token_loop ; while (t > 0) - - cmn r1, #c_dct_eob_token ; if(t == -DCT_EOB_TOKEN) - beq END_OF_BLOCK ; break - - rsb lr, r1, #0 ; v = -t; - - cmp lr, #4 ; if(v > FOUR_TOKEN) - ble SKIP_EXTRABITS - - ldr r3, [r9, #detok_teb_base_ptr] - mov 
r11, #1 ; 1 in split = 1 + ... nope, v+= 1 << bits_count - add r7, r3, lr, lsl #4 ; detok_teb_base_ptr + (v << 4) - - ldrsh lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val - ldrsh r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length - -extrabits_loop - add r3, r0, r7 ; &teb_ptr->Probs[bits_count] - - ldrb r2, [r3, #4] ; probability. why +4? - mov r3, r6, lsl #8 ; range << 8 - sub r3, r3, #256 ; range << 8 + 1 << 8 - - smlawb r2, r3, r2, r11 ; split = 1 + (((range-1) * probability) >> 8) - - ldrb r12, [r8] ; *bufptr - ;++ - - subs r10, r4, r2, lsl #24 ; value - (split<<24) - movhs r4, r10 ; value = value - (split << 24) - subhs r2, r6, r2 ; range = range - split - addhs lr, lr, r11, lsl r0 ; v += ((UINT16)1<> 1 - - subs r3, r4, r2, lsl #24 ; value - (split<<24) - movhs r4, r3 ; value -= (split << 24) - subhs r2, r6, r2 ; range -= split - mvnhs r3, lr ; -v - addhs lr, r3, #1 ; v = (v ^ -1) + 1 - -; NORMALIZE - clz r3, r2 ; leading 0s in split - sub r3, r3, #24 ; shift - subs r5, r5, r3 ; count -= shift - mov r6, r2, lsl r3 ; range <<= shift - mov r4, r4, lsl r3 ; value <<= shift - ldrleb r2, [r8], #1 ; *(bufptr++) - addle r5, r5, #8 ; count += 8 - rsble r3, r5, #24 ; BR_COUNT - count - orrle r4, r4, r2, lsl r3 ; value |= *bufptr << (BR_COUNT - count) - - add r0, r0, #11 ; Prob += ENTROPY_NODES (11) - - cmn r1, #1 ; t < -ONE_TOKEN - - addlt r0, r0, #11 ; Prob += ENTROPY_NODES (11) - - mvn r1, #1 ; t = -1 ???? 
C is -2 - -SKIP_EOB_CHECK - ldr r7, [sp, #l_c] ; c - ldr r3, [r9, #detok_scan] - add r1, r1, #2 ; t+= 2 - cmp r7, #15 ; c should will be one higher - - ldr r3, [r3, +r7, lsl #2] ; scan[c] this needs pre-inc c value - add r7, r7, #1 ; c++ - add r3, r11, r3, lsl #1 ; qcoeff + scan[c] - - str r7, [sp, #l_c] ; store c - strh lr, [r3] ; qcoef_ptr[scan[c]] = v - - blt COEFF_LOOP - - sub r7, r7, #1 ; if(t != -DCT_EOB_TOKEN) --c - -END_OF_BLOCK - ldr r3, [sp, #l_type] ; type - ldr r10, [sp, #l_coef_ptr] ; coef_ptr - ldr r0, [sp, #l_qcoeff] ; qcoeff - ldr r11, [sp, #l_i] ; i - ldr r12, [sp, #l_stop] ; stop - - cmp r3, #0 ; type ?= 0 - moveq r1, #1 - movne r1, #0 - add r3, r11, r9 ; detok + i - - cmp r7, r1 ; c ?= !type - strb r7, [r3, #detok_eob] ; eob[i] = c - - ldr r7, [sp, #l_l_ptr] ; l - ldr r2, [sp, #l_a_ptr] ; a - movne r3, #1 ; t - moveq r3, #0 - - add r0, r0, #32 ; qcoeff += 32 (16 * 2?) - add r11, r11, #1 ; i++ - strb r3, [r7] ; *l = t - strb r3, [r2] ; *a = t - str r0, [sp, #l_qcoeff] ; qcoeff - str r11, [sp, #l_i] ; i - - cmp r11, r12 ; i < stop - ldr r7, [sp, #l_type] ; type - - blt BLOCK_LOOP - - cmp r11, #25 ; i ?= 25 - bne ln2_decode_mb_to - - ldr r12, [r9, #detok_qcoeff_start_ptr] - ldr r10, [r9, #detok_coef_probs] - mov r7, #0 ; type/i = 0 - mov r3, #16 ; stop = 16 - str r12, [sp, #l_qcoeff] ; qcoeff_ptr = qcoeff_start_ptr - str r7, [sp, #l_i] - str r7, [sp, #l_type] - str r3, [sp, #l_stop] - - str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type=0] - - b BLOCK_LOOP - -ln2_decode_mb_to - cmp r11, #16 ; i ?= 16 - bne ln1_decode_mb_to - - mov r10, #detok_coef_probs - add r10, r10, #2*4 ; coef_probs[type] - ldr r10, [r9, r10] ; detok + detok_coef_probs[type] - - mov r7, #2 ; type = 2 - mov r3, #24 ; stop = 24 - - str r7, [sp, #l_type] - str r3, [sp, #l_stop] - - str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type] - b BLOCK_LOOP - -ln1_decode_mb_to - ldr r2, [sp, #l_bc] - mov r0, #0 - nop - - str r8, [r2, #bool_decoder_user_buffer] - str r5, [r2, 
#bool_decoder_count] - str r4, [r2, #bool_decoder_value] - str r6, [r2, #bool_decoder_range] - - add sp, sp, #l_stacksize - ldmia sp!, {r4 - r11, pc} - - ENDP ; |vp8_decode_mb_tokens_v6| - - END diff --git a/vp8/decoder/arm/detokenize_arm.h b/vp8/decoder/arm/detokenize_arm.h deleted file mode 100644 index 9bb19b6cf..000000000 --- a/vp8/decoder/arm/detokenize_arm.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef DETOKENIZE_ARM_H -#define DETOKENIZE_ARM_H - -#if HAVE_ARMV6 -#if CONFIG_ARM_ASM_DETOK -void vp8_init_detokenizer(VP8D_COMP *dx); -void vp8_decode_mb_tokens_v6(DETOK *detoken, int type); -#endif -#endif - -#endif diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index 7d013d240..e529db131 100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c @@ -74,37 +74,6 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x) } } -#if CONFIG_ARM_ASM_DETOK -/* mashup of vp8_block2left and vp8_block2above so we only need one pointer - * for the assembly version. 
- */ -DECLARE_ALIGNED(16, const UINT8, vp8_block2leftabove[25*2]) = -{ - /* vp8_block2left */ - 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - /* vp8_block2above */ - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 -}; - -void vp8_init_detokenizer(VP8D_COMP *dx) -{ - const VP8_COMMON *const oc = & dx->common; - MACROBLOCKD *x = & dx->mb; - - dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree; - dx->detoken.ptr_block2leftabove = vp8_block2leftabove; - dx->detoken.ptr_coef_bands_x = vp8_coef_bands_x; - dx->detoken.scan = vp8_default_zig_zag1d; - dx->detoken.teb_base_ptr = vp8d_token_extra_bits2; - dx->detoken.qcoeff_start_ptr = &x->qcoeff[0]; - - dx->detoken.coef_probs[0] = (oc->fc.coef_probs [0] [ 0 ] [0]); - dx->detoken.coef_probs[1] = (oc->fc.coef_probs [1] [ 0 ] [0]); - dx->detoken.coef_probs[2] = (oc->fc.coef_probs [2] [ 0 ] [0]); - dx->detoken.coef_probs[3] = (oc->fc.coef_probs [3] [ 0 ] [0]); -} -#endif - DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]); #define FILL \ if(count < 0) \ @@ -202,35 +171,6 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]); }\ NORMALIZE -#if CONFIG_ARM_ASM_DETOK -int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) -{ - int eobtotal = 0; - int i, type; - - dx->detoken.current_bc = x->current_bc; - dx->detoken.A = x->above_context; - dx->detoken.L = x->left_context; - - type = 3; - - if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV) - { - type = 1; - eobtotal -= 16; - } - - vp8_decode_mb_tokens_v6(&dx->detoken, type); - - for (i = 0; i < 25; i++) - { - x->eobs[i] = dx->detoken.eob[i]; - eobtotal += dx->detoken.eob[i]; - } - - return eobtotal; -} -#else int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) { ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context; @@ -423,4 +363,3 @@ BLOCK_FINISHED: return eobtotal; } -#endif /*!CONFIG_ASM_DETOK*/ diff --git 
a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h index 294a4a55d..8640bda4c 100644 --- a/vp8/decoder/detokenize.h +++ b/vp8/decoder/detokenize.h @@ -14,10 +14,6 @@ #include "onyxd_int.h" -#if ARCH_ARM -#include "arm/detokenize_arm.h" -#endif - void vp8_reset_mb_tokens_context(MACROBLOCKD *x); int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 41966b9da..9d49c9e62 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -133,9 +133,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) cm->last_sharpness_level = cm->sharpness_level; } -#if CONFIG_ARM_ASM_DETOK - vp8_init_detokenizer(pbi); -#endif pbi->common.error.setjmp = 0; return (VP8D_PTR) pbi; } diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk index 080c9afde..03084c573 100644 --- a/vp8/vp8dx_arm.mk +++ b/vp8/vp8dx_arm.mk @@ -15,7 +15,6 @@ VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/arm_dsystemdependent.c VP8_CX_SRCS-$(ARCH_ARM) += decoder/asm_dec_offsets.c VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dequantize_arm.c -VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK) += decoder/arm/detokenize$(ASM) #File list for armv6 VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM) From d419b93e3e47e5080161c9de3abbee79519ee130 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 14 Feb 2011 13:32:58 -0500 Subject: [PATCH 2/5] Improved rd_pick_intra4x4block Eliminated unnecessary calculations. Improved performance by 10% on keyframes and 1.6% overall for the test clip used. 
Change-Id: I87671b26af5e2cc439e81d0fee3b15c7cd2a3309 --- vp8/encoder/encodeintra.c | 15 -------- vp8/encoder/rdopt.c | 79 ++++++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 36 deletions(-) diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index 9163b427d..73a0db0f5 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -58,21 +58,6 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); } -void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode) -{ - vp8_predict_intra4x4(b, best_mode, b->predictor); - - ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); - - x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); - - x->quantize_b(be, b); - - IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32); - - RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); -} - void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) { int i; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 7080425c1..0d01d64b5 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -584,14 +584,41 @@ static void macro_block_yrd( MACROBLOCK *mb, *Rate = vp8_rdcost_mby(mb); } +static void save_predictor(unsigned char *predictor, unsigned char *dst) +{ + int r, c; + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + *dst = predictor[c]; + dst++; + } + + predictor += 16; + } +} +static void restore_predictor(unsigned char *predictor, unsigned char *dst) +{ + int r, c; + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + predictor[c] = *dst; + dst++; + } + + predictor += 16; + } +} static int rd_pick_intra4x4block( VP8_COMP *cpi, MACROBLOCK *x, BLOCK *be, BLOCKD *b, B_PREDICTION_MODE *best_mode, - B_PREDICTION_MODE above, - B_PREDICTION_MODE left, + 
unsigned int *bmode_costs, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, @@ -600,31 +627,27 @@ static int rd_pick_intra4x4block( int *bestdistortion) { B_PREDICTION_MODE mode; - int best_rd = INT_MAX; // 1<<30 + int best_rd = INT_MAX; int rate = 0; int distortion; - unsigned int *mode_costs; ENTROPY_CONTEXT ta = *a, tempa = *a; ENTROPY_CONTEXT tl = *l, templ = *l; - - if (x->e_mbd.frame_type == KEY_FRAME) - { - mode_costs = x->bmode_costs[above][left]; - } - else - { - mode_costs = x->inter_bmode_costs; - } + DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16); + DECLARE_ALIGNED_ARRAY(16, short, dqcoeff, 16); for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) { int this_rd; int ratey; - rate = mode_costs[mode]; - vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, mode); + rate = bmode_costs[mode]; + + vp8_predict_intra4x4(b, mode, b->predictor); + ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16); + x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); + x->quantize_b(be, b); tempa = ta; templ = tl; @@ -644,17 +667,23 @@ static int rd_pick_intra4x4block( *best_mode = mode; *a = tempa; *l = templ; + save_predictor(b->predictor, predictor); + vpx_memcpy(dqcoeff, b->dqcoeff, 32); } } b->bmi.mode = (B_PREDICTION_MODE)(*best_mode); - vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, b->bmi.mode); + + restore_predictor(b->predictor, predictor); + vpx_memcpy(b->dqcoeff, dqcoeff, 32); + + IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(b->dqcoeff, b->diff, 32); + RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); return best_rd; } - int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion, int best_rd) { @@ -667,6 +696,7 @@ int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, ENTROPY_CONTEXT_PLANES t_above, t_left; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; + unsigned int *bmode_costs; vpx_memcpy(&t_above, 
mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); @@ -676,17 +706,25 @@ int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, vp8_intra_prediction_down_copy(xd); + bmode_costs = mb->inter_bmode_costs; + for (i = 0; i < 16; i++) { MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; - const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode; - const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode; B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d); + if (mb->e_mbd.frame_type == KEY_FRAME) + { + const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode; + const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode; + + bmode_costs = mb->bmode_costs[A][L]; + } + total_rd += rd_pick_intra4x4block( - cpi, mb, mb->block + i, xd->block + i, &best_mode, A, L, + cpi, mb, mb->block + i, xd->block + i, &best_mode, bmode_costs, ta + vp8_block2above[i], tl + vp8_block2left[i], &r, &ry, &d); @@ -708,7 +746,6 @@ int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, return RDCOST(mb->rdmult, mb->rddiv, cost, distortion); } - int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *Rate, From d3dfcde0f7e499113ae7cdbb85c9b6306b2d7e24 Mon Sep 17 00:00:00 2001 From: James Berry Date: Mon, 14 Feb 2011 14:02:52 -0500 Subject: [PATCH 3/5] mem leak fix for cpi->tplist checks added to make sure that cpi->tplist is freed correctly in vp8_dealloc_compressor_data and vp8_alloc_compressor_data. 
Change-Id: I66149dbbd25c958800ad94f4379d723191d9680d --- vp8/encoder/onyx_if.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index e73e41ee7..3f787d6da 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -262,7 +262,8 @@ static void setup_features(VP8_COMP *cpi) void vp8_dealloc_compressor_data(VP8_COMP *cpi) { - vpx_free(cpi->tplist); + if(cpi->tplist!=0) + vpx_free(cpi->tplist); cpi->tplist = NULL; // Delete last frame MV storage buffers @@ -1406,6 +1407,9 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) cpi->mt_sync_range = 16; #endif + /* Release the token list from any earlier allocation before reallocating. */ + vpx_free(cpi->tplist); + CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows)); } From 2debd5b5f75ab11bb6835b929e468f2873a88277 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 14 Feb 2011 16:23:49 -0500 Subject: [PATCH 4/5] Improve vp8_sad16x16_sse3 function In real-time mode, vp8_sad16x16 function is called heavily in motion search part. Improvement of this function gives 1.2% encoding performance gain (real-time mode, tulip clip). 
Change-Id: I23c401fc40c061f732a9767e8d383737a179bd58 --- vp8/encoder/x86/sad_sse3.asm | 65 ++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm index 575417516..f0336ab17 100644 --- a/vp8/encoder/x86/sad_sse3.asm +++ b/vp8/encoder/x86/sad_sse3.asm @@ -586,52 +586,45 @@ sym(vp8_sad16x16_sse3): STACK_FRAME_CREATE_X3 - lea end_ptr, [src_ptr+src_stride*8] - - lea end_ptr, [end_ptr+src_stride*8] - pxor mm7, mm7 + mov end_ptr, 4 + pxor xmm7, xmm7 .vp8_sad16x16_sse3_loop: - - movq ret_var, mm7 - cmp ret_var, max_err - jg .vp8_sad16x16_early_exit - - movq mm0, QWORD PTR [src_ptr] - movq mm2, QWORD PTR [src_ptr+8] - - movq mm1, QWORD PTR [ref_ptr] - movq mm3, QWORD PTR [ref_ptr+8] - - movq mm4, QWORD PTR [src_ptr+src_stride] - movq mm5, QWORD PTR [ref_ptr+ref_stride] - - psadbw mm0, mm1 - psadbw mm2, mm3 - - movq mm1, QWORD PTR [src_ptr+src_stride+8] - movq mm3, QWORD PTR [ref_ptr+ref_stride+8] - - psadbw mm4, mm5 - psadbw mm1, mm3 + movdqa xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [ref_ptr] + movdqa xmm2, XMMWORD PTR [src_ptr+src_stride] + movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride] lea src_ptr, [src_ptr+src_stride*2] lea ref_ptr, [ref_ptr+ref_stride*2] - paddw mm0, mm2 - paddw mm4, mm1 + movdqa xmm4, XMMWORD PTR [src_ptr] + movdqu xmm5, XMMWORD PTR [ref_ptr] + movdqa xmm6, XMMWORD PTR [src_ptr+src_stride] - paddw mm7, mm0 - paddw mm7, mm4 + psadbw xmm0, xmm1 - cmp src_ptr, end_ptr + movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride] + + psadbw xmm2, xmm3 + psadbw xmm4, xmm5 + psadbw xmm6, xmm1 + + lea src_ptr, [src_ptr+src_stride*2] + lea ref_ptr, [ref_ptr+ref_stride*2] + + paddw xmm7, xmm0 + paddw xmm7, xmm2 + paddw xmm7, xmm4 + paddw xmm7, xmm6 + + sub end_ptr, 1 jne .vp8_sad16x16_sse3_loop - movq ret_var, mm7 - -.vp8_sad16x16_early_exit: - - mov rax, ret_var + movq xmm0, xmm7 + psrldq xmm7, 8 + paddw xmm0, xmm7 + movq rax, xmm0 STACK_FRAME_DESTROY_X3 
From 94d4fee08f1e2bdd6b4b493635f7281491280220 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 14 Feb 2011 16:34:33 -0500 Subject: [PATCH 5/5] Improved vp8_rd_pick_intra_mbuv_mode Eliminated unnecessary calculations. Very small change to performance. Change-Id: Ib7213d43c64e36955177c4d47950ff472266f822 --- vp8/encoder/encodeintra.c | 14 -------------- vp8/encoder/encodeintra.h | 1 - vp8/encoder/rdopt.c | 16 ++++++---------- 3 files changed, 6 insertions(+), 25 deletions(-) diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index 73a0db0f5..e016fa35b 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -153,17 +153,3 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd); } -void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) -{ - vp8_build_intra_predictors_mbuv(&x->e_mbd); - - ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); - - vp8_transform_mbuv(x); - - vp8_quantize_mbuv(x); - - vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd); - - vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd); -} diff --git a/vp8/encoder/encodeintra.h b/vp8/encoder/encodeintra.h index c0247b06a..b8b80f176 100644 --- a/vp8/encoder/encodeintra.h +++ b/vp8/encoder/encodeintra.h @@ -19,6 +19,5 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb); void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode); void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode); void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode); -void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x); #endif diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 0d01d64b5..3449e4532 100644 
--- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -813,15 +813,6 @@ static int rd_cost_mbuv(MACROBLOCK *mb) } -unsigned int vp8_get_mbuvrecon_error(const vp8_variance_rtcd_vtable_t *rtcd, const MACROBLOCK *x) // sum of squares -{ - unsigned int sse0, sse1; - int sum0, sum1; - VARIANCE_INVOKE(rtcd, get8x8var)(x->src.u_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer, x->e_mbd.dst.uv_stride, &sse0, &sum0); - VARIANCE_INVOKE(rtcd, get8x8var)(x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride, &sse1, &sum1); - return (sse0 + sse1); -} - static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel) { vp8_build_uvmvs(&x->e_mbd, fullpixel); @@ -849,7 +840,12 @@ int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *ra int this_rd; x->e_mbd.mode_info_context->mbmi.uv_mode = mode; - vp8_encode_intra16x16mbuvrd(IF_RTCD(&cpi->rtcd), x); + vp8_build_intra_predictors_mbuv(&x->e_mbd); + ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff, + x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, + x->src.uv_stride); + vp8_transform_mbuv(x); + vp8_quantize_mbuv(x); rate_to = rd_cost_mbuv(x); rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.uv_mode];