From f3e9e2a0f8d277f3b3c59ded05c0de6fd00a7326 Mon Sep 17 00:00:00 2001 From: James Berry Date: Thu, 10 Mar 2011 11:13:44 -0500 Subject: [PATCH 01/10] Fix incorrect macroblock counts in twopass rate control The previous calculation of macroblock count (w*h)/256 is not correct when the width/height are not multiples of 16. Use the precalculated macroblock count from cpi->common instead. This manifested itself as a divide by zero when the number of pixels was less than 256. num_mbs updated in estimate_max_q, estimate_q, estimate_kf_group_q, and estimate_cq Change-Id: I92ff98587864c801b1ee5485cfead964673a9973 --- vp8/encoder/firstpass.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index d2cc8482e..a85e35b72 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -944,10 +944,10 @@ void vp8_first_pass(VP8_COMP *cpi) extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; #define BASE_ERRPERMB 150 -static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width) +static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1044,10 +1044,10 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_ return Q; } -static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width) +static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1095,10 +1095,10 @@ static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_band } // Estimate a worst case Q for a KF group -static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio) +static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs; int bits_per_mb_at_this_q; @@ -1193,11 +1193,10 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta // For cq mode estimate a cq level that matches the observed // complexity and data rate. -static int estimate_cq(VP8_COMP *cpi, double section_err, - int section_target_bandwitdh, int Height, int Width) +static int estimate_cq(VP8_COMP *cpi, double section_err, int section_target_bandwitdh) { int Q; - int num_mbs = ((Height * Width) / (16 * 16)); + int num_mbs = cpi->common.MBs; int target_norm_bits_per_mb; double err_per_mb = section_err / num_mbs; @@ -1717,7 +1716,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks)); // Estimate if there are enough bits available to make worthwhile use of an arf. - tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width); + tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits); // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames. 
if (tmp_q < cpi->worst_quality) @@ -2245,8 +2244,7 @@ void vp8_second_pass(VP8_COMP *cpi) est_cq = estimate_cq( cpi, (cpi->total_coded_error_left / frames_left), - (int)(cpi->bits_left / frames_left), - cpi->common.Height, cpi->common.Width); + (int)(cpi->bits_left / frames_left)); cpi->cq_target_quality = cpi->oxcf.cq_level; if ( est_cq > cpi->cq_target_quality ) @@ -2258,9 +2256,7 @@ void vp8_second_pass(VP8_COMP *cpi) cpi->maxq_min_limit = cpi->best_quality; tmp_q = estimate_max_q( cpi, (cpi->total_coded_error_left / frames_left), - (int)(cpi->bits_left / frames_left), - cpi->common.Height, - cpi->common.Width); + (int)(cpi->bits_left / frames_left)); // Limit the maxq value returned subsequently. // This increases the risk of overspend or underspend if the initial @@ -2288,7 +2284,7 @@ void vp8_second_pass(VP8_COMP *cpi) if (frames_left < 1) frames_left = 1; - tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width); + tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left)); // Move active_worst_quality but in a damped way if (tmp_q > cpi->active_worst_quality) @@ -2897,7 +2893,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); // Work out if spatial resampling is necessary - kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio); + kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio); // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section projected_bits_perframe = bits_per_frame; @@ -2968,7 +2964,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0; // Now try again and see what Q we get with the smaller image size - kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio); + kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio); if (0) { From 7ab08e1feeaf876eea0cc8085c9c4f1534eab9d0 Mon Sep 17 00:00:00 2001 From: Tero Rintaluoma Date: Mon, 7 Mar 2011 11:12:56 +0200 Subject: [PATCH 02/10] ARMv6 optimized quantization Adds new ARMv6 optimized function vp8_fast_quantize_b_armv6 to the encoder. 
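For reference, the per-coefficient arithmetic that the new assembly implements can be sketched in C along the lines of the generic vp8_fast_quantize_b_c path (an illustrative sketch only, not the exact library source; vp8_default_zig_zag1d is the usual 4x4 scan table, and the BLOCK/BLOCKD fields used here are the same ones whose offsets are exported further down in this patch):

    /* Sketch: fast quantize/dequantize of one 4x4 block (16 coefficients). */
    static void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d)
    {
        int i, eob = 0;

        for (i = 0; i < 16; i++)
        {
            int rc = vp8_default_zig_zag1d[i];  /* raster index of scan position i */
            int z  = b->coeff[rc];
            int sz = z >> 31;                   /* sign mask: 0 or -1 */
            int x  = (z ^ sz) - sz;             /* abs(z) */
            int y  = ((x + b->round[rc]) * b->quant_fast[rc]) >> 16;

            x = (y ^ sz) - sz;                  /* restore the sign */
            d->qcoeff[rc]  = x;
            d->dqcoeff[rc] = x * d->dequant[rc];

            if (y)
                eob = i + 1;                    /* one past the last nonzero coeff */
        }

        d->eob = eob;
    }

The assembly below does the same work on two coefficient pairs per loop iteration and records a per-pair nonzero flag in r1, which it then uses to shortcut the end-of-block search instead of re-scanning all sixteen positions.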
Change-Id: I40277ec8f82e8a6cbc453cf295a0cc9b2504b21e --- vp8/encoder/arm/arm_csystemdependent.c | 4 +- .../arm/armv6/vp8_fast_quantize_b_armv6.asm | 224 ++++++++++++++++++ vp8/encoder/arm/quantize_arm.h | 10 + vp8/encoder/asm_enc_offsets.c | 11 + vp8/vp8cx_arm.mk | 1 + 5 files changed, 248 insertions(+), 2 deletions(-) create mode 100644 vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index 73007d414..b643137a6 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -71,8 +71,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.submby = vp8_subtract_mby_c; cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;*/ - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;*/ + /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/ + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6; } #endif diff --git a/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm new file mode 100644 index 000000000..ae2f6030d --- /dev/null +++ b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm @@ -0,0 +1,224 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_fast_quantize_b_armv6| + + INCLUDE asm_enc_offsets.asm + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 BLOCK *b +; r1 BLOCKD *d +|vp8_fast_quantize_b_armv6| PROC + stmfd sp!, {r1, r4-r11, lr} + + ldr r3, [r0, #vp8_block_coeff] ; coeff + ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast + ldr r5, [r0, #vp8_block_round] ; round + ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff + ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff + ldr r8, [r1, #vp8_blockd_dequant] ; dequant + + ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction + ; is used to update the counter so that + ; it can be used to mark nonzero + ; quantized coefficient pairs. 
+ + mov r1, #0 ; flags for quantized coeffs + + ; PART 1: quantization and dequantization loop +loop + ldr r9, [r3], #4 ; [z1 | z0] + ldr r10, [r5], #4 ; [r1 | r0] + ldr r11, [r4], #4 ; [q1 | q0] + + ssat16 lr, #1, r9 ; [sz1 | sz0] + eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0] + ssub16 r9, r9, lr ; x = (z ^ sz) - sz + sadd16 r9, r9, r10 ; [x1+r1 | x0+r0] + + ldr r12, [r3], #4 ; [z3 | z2] + + smulbb r0, r9, r11 ; [(x0+r0)*q0] + smultt r9, r9, r11 ; [(x1+r1)*q1] + + ldr r10, [r5], #4 ; [r3 | r2] + + ssat16 r11, #1, r12 ; [sz3 | sz2] + eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2] + pkhtb r0, r9, r0, asr #16 ; [y1 | y0] + ldr r9, [r4], #4 ; [q3 | q2] + ssub16 r12, r12, r11 ; x = (z ^ sz) - sz + + sadd16 r12, r12, r10 ; [x3+r3 | x2+r2] + + eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)] + + smulbb r10, r12, r9 ; [(x2+r2)*q2] + smultt r12, r12, r9 ; [(x3+r3)*q3] + + ssub16 r0, r0, lr ; x = (y ^ sz) - sz + + cmp r0, #0 ; check if zero + orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs + + str r0, [r6], #4 ; *qcoeff++ = x + ldr r9, [r8], #4 ; [dq1 | dq0] + + pkhtb r10, r12, r10, asr #16 ; [y3 | y2] + eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)] + ssub16 r10, r10, r11 ; x = (y ^ sz) - sz + + cmp r10, #0 ; check if zero + orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs + + str r10, [r6], #4 ; *qcoeff++ = x + ldr r11, [r8], #4 ; [dq3 | dq2] + + smulbb r12, r0, r9 ; [x0*dq0] + smultt r0, r0, r9 ; [x1*dq1] + + smulbb r9, r10, r11 ; [x2*dq2] + smultt r10, r10, r11 ; [x3*dq3] + + lsls r2, r2, #2 ; update loop counter + strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0] + strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1] + strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2] + strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3] + add r7, r7, #8 ; dqcoeff += 8 + bne loop + + ; PART 2: check position for eob... + mov lr, #0 ; init eob + cmp r1, #0 ; coeffs after quantization? 
+ ldr r11, [sp, #0] ; restore BLOCKD pointer + beq end ; skip eob calculations if all zero + + ldr r0, [r11, #vp8_blockd_qcoeff] + + ; check shortcut for nonzero qcoeffs + tst r1, #0x80 + bne quant_coeff_15_14 + tst r1, #0x20 + bne quant_coeff_13_11 + tst r1, #0x8 + bne quant_coeff_12_7 + tst r1, #0x40 + bne quant_coeff_10_9 + tst r1, #0x10 + bne quant_coeff_8_3 + tst r1, #0x2 + bne quant_coeff_6_5 + tst r1, #0x4 + bne quant_coeff_4_2 + b quant_coeff_1_0 + +quant_coeff_15_14 + ldrh r2, [r0, #30] ; rc=15, i=15 + mov lr, #16 + cmp r2, #0 + bne end + + ldrh r3, [r0, #28] ; rc=14, i=14 + mov lr, #15 + cmp r3, #0 + bne end + +quant_coeff_13_11 + ldrh r2, [r0, #22] ; rc=11, i=13 + mov lr, #14 + cmp r2, #0 + bne end + +quant_coeff_12_7 + ldrh r3, [r0, #14] ; rc=7, i=12 + mov lr, #13 + cmp r3, #0 + bne end + + ldrh r2, [r0, #20] ; rc=10, i=11 + mov lr, #12 + cmp r2, #0 + bne end + +quant_coeff_10_9 + ldrh r3, [r0, #26] ; rc=13, i=10 + mov lr, #11 + cmp r3, #0 + bne end + + ldrh r2, [r0, #24] ; rc=12, i=9 + mov lr, #10 + cmp r2, #0 + bne end + +quant_coeff_8_3 + ldrh r3, [r0, #18] ; rc=9, i=8 + mov lr, #9 + cmp r3, #0 + bne end + + ldrh r2, [r0, #12] ; rc=6, i=7 + mov lr, #8 + cmp r2, #0 + bne end + +quant_coeff_6_5 + ldrh r3, [r0, #6] ; rc=3, i=6 + mov lr, #7 + cmp r3, #0 + bne end + + ldrh r2, [r0, #4] ; rc=2, i=5 + mov lr, #6 + cmp r2, #0 + bne end + +quant_coeff_4_2 + ldrh r3, [r0, #10] ; rc=5, i=4 + mov lr, #5 + cmp r3, #0 + bne end + + ldrh r2, [r0, #16] ; rc=8, i=3 + mov lr, #4 + cmp r2, #0 + bne end + + ldrh r3, [r0, #8] ; rc=4, i=2 + mov lr, #3 + cmp r3, #0 + bne end + +quant_coeff_1_0 + ldrh r2, [r0, #2] ; rc=1, i=1 + mov lr, #2 + cmp r2, #0 + bne end + + mov lr, #1 ; rc=0, i=0 + +end + str lr, [r11, #vp8_blockd_eob] + ldmfd sp!, {r1, r4-r11, pc} + + ENDP + +loop_count + DCD 0x1000000 + + END + diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h index 5f9155eb1..0c6adf4c2 100644 --- a/vp8/encoder/arm/quantize_arm.h +++ b/vp8/encoder/arm/quantize_arm.h @@ -12,6 +12,16 @@ #ifndef QUANTIZE_ARM_H #define QUANTIZE_ARM_H +#if HAVE_ARMV6 + +extern prototype_quantize_block(vp8_fast_quantize_b_armv6); + +#undef vp8_quantize_fastquantb +#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6 + +#endif /* HAVE_ARMV6 */ + + #if HAVE_ARMV7 extern prototype_quantize_block(vp8_fast_quantize_b_neon); diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c index cd4953227..fcf77756a 100644 --- a/vp8/encoder/asm_enc_offsets.c +++ b/vp8/encoder/asm_enc_offsets.c @@ -65,6 +65,17 @@ DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST)); DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows)); +// offsets from BLOCK structure +DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff)); +DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast)); +DEFINE(vp8_block_round, offsetof(BLOCK, round)); + +// offsets from BLOCKD structure +DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff)); +DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff)); +DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant)); +DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob)); + // These two sizes are used in vp8cx_pack_tokens. They are hard coded // so if the size changes this will have to be adjusted. 
#if HAVE_ARMV5TE diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index 7980a0f75..f8979f4f7 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -34,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar #File list for armv6 # encoder +VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM) From 3ae24657887ed0a9f2e547973640890e9dfa6ea4 Mon Sep 17 00:00:00 2001 From: Attila Nagy Date: Fri, 25 Feb 2011 13:42:05 +0200 Subject: [PATCH 03/10] Encoder loopfilter running in its own thread In multithreaded mode the loopfilter is running in its own thread (filter level calculation and frame filtering). Filtering is mostly done in parallel with the bitstream packing. Before starting the packing the loopfilter level has to be calculated. Also any needed reference frame copying is done in the filter thread. Currently the encoder will create n+1 threads, where n > 1 is the number of threads specified by application and 1 is the extra filter thread. With n = 1 the encoder runs in single thread mode. There will never be more than n threads running concurrently. Change-Id: I4fb29b559a40275d6d3babb8727245c40fba931b --- vp8/encoder/ethreading.c | 43 ++++++++++ vp8/encoder/onyx_if.c | 178 +++++++++++++++++++++++---------------- vp8/encoder/onyx_int.h | 5 ++ 3 files changed, 155 insertions(+), 71 deletions(-) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 12d5f66d3..5c607a0cb 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -24,6 +24,35 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x); extern void vp8_build_block_offsets(MACROBLOCK *x); extern void vp8_setup_block_ptrs(MACROBLOCK *x); +#if CONFIG_MULTITHREAD + +extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); + +static THREAD_FUNCTION loopfilter_thread(void *p_data) +{ + VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1); + VP8_COMMON *cm = &cpi->common; + + while (1) + { + if (cpi->b_multi_threaded == 0) + break; + + if (sem_wait(&cpi->h_event_start_lpf) == 0) + { + if (cpi->b_multi_threaded == FALSE) // we're shutting down + break; + + loopfilter_frame(cpi, cm); + + sem_post(&cpi->h_event_end_lpf); + } + } + + return 0; +} +#endif + static THREAD_FUNCTION thread_encoding_proc(void *p_data) { @@ -479,6 +508,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi) pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd); } + { + LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data; + + sem_init(&cpi->h_event_start_lpf, 0, 0); + sem_init(&cpi->h_event_end_lpf, 0, 0); + + lpfthd->ptr1 = (void *)cpi; + pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd); + } } } @@ -500,9 +538,14 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) sem_destroy(&cpi->h_event_start_encoding[i]); } + + sem_post(&cpi->h_event_start_lpf); + pthread_join(cpi->h_filter_thread, 0); } sem_destroy(&cpi->h_event_end_encoding); + sem_destroy(&cpi->h_event_end_lpf); + sem_destroy(&cpi->h_event_start_lpf); //free thread related resources vpx_free(cpi->h_event_start_encoding); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 5dc579d10..fcd996d1c 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -3509,6 +3509,89 @@ static BOOL recode_loop_test( VP8_COMP *cpi, return 
force_recode; } +void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) +{ + if (cm->no_lpf) + { + cm->filter_level = 0; + } + else + { + struct vpx_usec_timer timer; + + vp8_clear_system_state(); + + vpx_usec_timer_start(&timer); + if (cpi->sf.auto_filter == 0) + vp8cx_pick_filter_level_fast(cpi->Source, cpi); + + else + vp8cx_pick_filter_level(cpi->Source, cpi); + + vpx_usec_timer_mark(&timer); + cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); + } + +#if CONFIG_MULTITHREAD + sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ +#endif + + if (cm->filter_level > 0) + { + vp8cx_set_alt_lf_level(cpi, cm->filter_level); + vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level); + cm->last_filter_type = cm->filter_type; + cm->last_sharpness_level = cm->sharpness_level; + } + + vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show); + + { + YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; + YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; + YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; + YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx]; + // At this point the new frame has been encoded. + // If any buffer copy / swapping is signaled it should be done here. + if (cm->frame_type == KEY_FRAME) + { + vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12); + vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12); + } + else // For non key frames + { + // Code to copy between reference buffers + if (cm->copy_buffer_to_arf) + { + if (cm->copy_buffer_to_arf == 1) + { + if (cm->refresh_last_frame) + // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. + vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12); + else + vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12); + } + else if (cm->copy_buffer_to_arf == 2) + vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12); + } + + if (cm->copy_buffer_to_gf) + { + if (cm->copy_buffer_to_gf == 1) + { + if (cm->refresh_last_frame) + // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. + vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12); + else + vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12); + } + else if (cm->copy_buffer_to_gf == 2) + vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12); + } + } + } +} + static void encode_frame_to_data_rate ( VP8_COMP *cpi, @@ -4058,8 +4141,8 @@ static void encode_frame_to_data_rate vp8_setup_key_frame(cpi); // transform / motion compensation build reconstruction frame - vp8_encode_frame(cpi); + cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? 
cpi->projected_frame_size : 0; @@ -4408,92 +4491,43 @@ static void encode_frame_to_data_rate else cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; - if (cm->no_lpf) + +#if CONFIG_MULTITHREAD + if (cpi->b_multi_threaded) { - cm->filter_level = 0; + sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */ } else +#endif { - struct vpx_usec_timer timer; - - vpx_usec_timer_start(&timer); - - if (cpi->sf.auto_filter == 0) - vp8cx_pick_filter_level_fast(cpi->Source, cpi); - else - vp8cx_pick_filter_level(cpi->Source, cpi); - - vpx_usec_timer_mark(&timer); - - cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); + loopfilter_frame(cpi, cm); } - if (cm->filter_level > 0) - { - vp8cx_set_alt_lf_level(cpi, cm->filter_level); - vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level); - cm->last_filter_type = cm->filter_type; - cm->last_sharpness_level = cm->sharpness_level; - } - - /* Move storing frame_type out of the above loop since it is also - * needed in motion search besides loopfilter */ - cm->last_frame_type = cm->frame_type; - - vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show); - if (cpi->oxcf.error_resilient_mode == 1) { cm->refresh_entropy_probs = 0; } +#if CONFIG_MULTITHREAD + /* wait that filter_level is picked so that we can continue with stream packing */ + if (cpi->b_multi_threaded) + sem_wait(&cpi->h_event_end_lpf); +#endif + // build the bitstream vp8_pack_bitstream(cpi, dest, size); +#if CONFIG_MULTITHREAD + /* wait for loopfilter thread done */ + if (cpi->b_multi_threaded) { - YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx]; - YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; - YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx]; - YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx]; - // At this point the new frame has been encoded coded. - // If any buffer copy / swaping is signalled it should be done here. - if (cm->frame_type == KEY_FRAME) - { - vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12); - vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12); - } - else // For non key frames - { - // Code to copy between reference buffers - if (cm->copy_buffer_to_arf) - { - if (cm->copy_buffer_to_arf == 1) - { - if (cm->refresh_last_frame) - // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. - vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12); - else - vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12); - } - else if (cm->copy_buffer_to_arf == 2) - vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12); - } - - if (cm->copy_buffer_to_gf) - { - if (cm->copy_buffer_to_gf == 1) - { - if (cm->refresh_last_frame) - // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set. 
- vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12); - else - vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12); - } - else if (cm->copy_buffer_to_gf == 2) - vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12); - } - } + sem_wait(&cpi->h_event_end_lpf); } +#endif + + /* Move storing frame_type out of the above loop since it is also + * needed in motion search besides loopfilter */ + cm->last_frame_type = cm->frame_type; // Update rate control heuristics cpi->total_byte_count += (*size); @@ -5325,7 +5359,9 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) + { generate_psnr_packet(cpi); + } #if CONFIG_PSNR diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index b66131d15..057186eb8 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -603,12 +603,17 @@ typedef struct int encoding_thread_count; pthread_t *h_encoding_thread; + pthread_t h_filter_thread; + MB_ROW_COMP *mb_row_ei; ENCODETHREAD_DATA *en_thread_data; + LPFTHREAD_DATA lpf_thread_data; //events sem_t *h_event_start_encoding; sem_t h_event_end_encoding; + sem_t h_event_start_lpf; + sem_t h_event_end_lpf; #endif TOKENLIST *tplist; From b2aa4017760c9bf9d267470ac235650e24b8adc5 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Fri, 11 Mar 2011 08:24:23 -0500 Subject: [PATCH 04/10] Align SAD output array to be 16-byte aligned Use aligned store. Change-Id: Icab4c0c53da811d0c52bb7e8134927f249ba2499 --- vp8/encoder/mcomp.c | 2 +- vp8/encoder/x86/sad_sse4.asm | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 33aaa2ca9..c210c1de2 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1415,7 +1415,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er int col_min = ref_col - distance; int col_max = ref_col + distance; - unsigned short sad_array8[8]; + DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8); unsigned int sad_array[3]; // Work out the mid point for the search diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm index 21e2e5007..03ecec4b3 100644 --- a/vp8/encoder/x86/sad_sse4.asm +++ b/vp8/encoder/x86/sad_sse4.asm @@ -186,7 +186,7 @@ sym(vp8_sad16x16x8_sse4): PROCESS_16X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -224,7 +224,7 @@ sym(vp8_sad16x8x8_sse4): PROCESS_16X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -262,7 +262,7 @@ sym(vp8_sad8x8x8_sse4): PROCESS_8X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -303,7 +303,7 @@ sym(vp8_sad8x16x8_sse4): PROCESS_8X2X8 0 PROCESS_8X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi @@ -339,7 +339,7 @@ sym(vp8_sad4x4x8_sse4): PROCESS_4X2X8 0 mov rdi, arg(4) ;Results - movdqu XMMWORD PTR [rdi], xmm1 + movdqa XMMWORD PTR [rdi], xmm1 ; begin epilog pop rdi From 3f6f7289aaa5b91a16fab20702e536149c8c3f0c Mon Sep 17 00:00:00 2001 From: Jim Bankoski Date: Tue, 8 Mar 2011 09:05:18 -0500 Subject: [PATCH 05/10] vp8cx- alternate ssim function with optimizations Change-Id: I91921b0a90dbaddc7010380b038955be347964b3 --- vp8/encoder/generic/csystemdependent.c | 4 + vp8/encoder/onyx_if.c | 12 +- vp8/encoder/ssim.c | 
468 ++++++++++--------------- vp8/encoder/variance.h | 32 ++ vp8/encoder/x86/ssim_opt.asm | 215 ++++++++++++ vp8/encoder/x86/x86_csystemdependent.c | 30 ++ vp8/vp8cx.mk | 1 + 7 files changed, 468 insertions(+), 294 deletions(-) create mode 100644 vp8/encoder/x86/ssim_opt.asm diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index fc0580d55..81108fe96 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -103,6 +103,10 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) // Pure C: vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame; +#if CONFIG_PSNR + cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c; + cpi->rtcd.variance.ssimpf = ssim_parms_c; +#endif #if ARCH_X86 || ARCH_X86_64 vp8_arch_x86_encoder_init(cpi); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 797e18b30..5caaeb933 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -86,9 +86,11 @@ extern double vp8_calc_ssim YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int lumamask, - double *weight + double *weight, + const vp8_variance_rtcd_vtable_t *rtcd ); + extern double vp8_calc_ssimg ( YV12_BUFFER_CONFIG *source, @@ -5133,8 +5135,12 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc)); vp8_clear_system_state(); - frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error); - frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight); + frame_psnr2 = vp8_calc_psnr(cpi->Source, + &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error); + + frame_ssim2 = vp8_calc_ssim(cpi->Source, + &cm->post_proc_buffer, 1, &weight, + IF_RTCD(&cpi->rtcd.variance)); cpi->summed_quality += frame_ssim2 * weight; cpi->summed_weights += weight; diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index 4ebcba1a1..d6aa9566b 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -11,298 +11,13 @@ #include "vpx_scale/yv12config.h" #include "math.h" +#include "onyx_int.h" -#define C1 (float)(64 * 64 * 0.01*255*0.01*255) -#define C2 (float)(64 * 64 * 0.03*255*0.03*255) - -static int width_y; -static int height_y; -static int height_uv; -static int width_uv; -static int stride_uv; -static int stride; -static int lumimask; -static int luminance; -static double plane_summed_weights = 0; - -static short img12_sum_block[8*4096*4096*2] ; - -static short img1_sum[8*4096*2]; -static short img2_sum[8*4096*2]; -static int img1_sq_sum[8*4096*2]; -static int img2_sq_sum[8*4096*2]; -static int img12_mul_sum[8*4096*2]; - - -double vp8_similarity -( - int mu_x, - int mu_y, - int pre_mu_x2, - int pre_mu_y2, - int pre_mu_xy2 -) -{ - int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy; - - mu_x2 = mu_x * mu_x; - mu_y2 = mu_y * mu_y; - mu_xy = mu_x * mu_y; - - theta_x2 = 64 * pre_mu_x2 - mu_x2; - theta_y2 = 64 * pre_mu_y2 - mu_y2; - theta_xy = 64 * pre_mu_xy2 - mu_xy; - - return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2)); -} - -double vp8_ssim -( - const unsigned char *img1, - const unsigned char *img2, - int stride_img1, - int stride_img2, - int width, - int height -) -{ - int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp; - - double plane_quality, weight, mean; - - short *img1_sum_ptr1, *img1_sum_ptr2; - short *img2_sum_ptr1, *img2_sum_ptr2; - int 
*img1_sq_sum_ptr1, *img1_sq_sum_ptr2; - int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2; - int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2; - - plane_quality = 0; - - if (lumimask) - plane_summed_weights = 0.0f; - else - plane_summed_weights = (height - 7) * (width - 7); - - //some prologue for the main loop - temp = 8 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum[x] = img1[x]; - img2_sum[x] = img2[x]; - img1_sq_sum[x] = img1[x] * img1[x]; - img2_sq_sum[x] = img2[x] * img2[x]; - img12_mul_sum[x] = img1[x] * img2[x]; - - img1_sum_ptr1[x] = 0; - img2_sum_ptr1[x] = 0; - img1_sq_sum_ptr1[x] = 0; - img2_sq_sum_ptr1[x] = 0; - img12_mul_sum_ptr1[x] = 0; - } - - //the main loop - for (y = 1; y < height; y++) - { - img1 += stride_img1; - img2 += stride_img2; - - temp = (y - 1) % 9 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - temp = y % 9 * width; - - img1_sum_ptr2 = img1_sum + temp; - img2_sum_ptr2 = img2_sum + temp; - img1_sq_sum_ptr2 = img1_sq_sum + temp; - img2_sq_sum_ptr2 = img2_sq_sum + temp; - img12_mul_sum_ptr2 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum_ptr2[x] = img1_sum_ptr1[x] + img1[x]; - img2_sum_ptr2[x] = img2_sum_ptr1[x] + img2[x]; - img1_sq_sum_ptr2[x] = img1_sq_sum_ptr1[x] + img1[x] * img1[x]; - img2_sq_sum_ptr2[x] = img2_sq_sum_ptr1[x] + img2[x] * img2[x]; - img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x]; - } - - if (y > 6) - { - //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum - temp = (y + 1) % 9 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum_ptr1[x] = img1_sum_ptr2[x] - img1_sum_ptr1[x]; - img2_sum_ptr1[x] = img2_sum_ptr2[x] - img2_sum_ptr1[x]; - img1_sq_sum_ptr1[x] = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x]; - img2_sq_sum_ptr1[x] = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x]; - img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x]; - } - - //here we calculate the sum over the 8x8 block of pixels - //this is done by sliding a window across the column sums for the last 8 lines - //each time adding the new column sum, and subtracting the one which fell out of the window - img1_block = 0; - img2_block = 0; - img1_sq_block = 0; - img2_sq_block = 0; - img12_mul_block = 0; - - //prologue, and calculation of simularity measure from the first 8 column sums - for (x = 0; x < 8; x++) - { - img1_block += img1_sum_ptr1[x]; - img2_block += img2_sum_ptr1[x]; - img1_sq_block += img1_sq_sum_ptr1[x]; - img2_sq_block += img2_sq_sum_ptr1[x]; - img12_mul_block += img12_mul_sum_ptr1[x]; - } - - if (lumimask) - { - y2 = y - 7; - x2 = 0; - - if (luminance) - { - mean = (img2_block + img1_block) / 128.0f; - - if (!(y2 % 2 || x2 % 2)) - *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; - } - else - { - mean = *(img12_sum_block + y2 * width_uv + x2); - mean += *(img12_sum_block + y2 * width_uv + x2 + 4); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); - mean += *(img12_sum_block + (y2 + 4) * 
width_uv + x2 + 4); - - mean /= 512.0f; - } - - weight = mean < 40 ? 0.0f : - (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f); - plane_summed_weights += weight; - - plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - else - plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - - //and for the rest - for (x = 8; x < width; x++) - { - img1_block = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8]; - img2_block = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8]; - img1_sq_block = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8]; - img2_sq_block = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8]; - img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8]; - - if (lumimask) - { - y2 = y - 7; - x2 = x - 7; - - if (luminance) - { - mean = (img2_block + img1_block) / 128.0f; - - if (!(y2 % 2 || x2 % 2)) - *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; - } - else - { - mean = *(img12_sum_block + y2 * width_uv + x2); - mean += *(img12_sum_block + y2 * width_uv + x2 + 4); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); - - mean /= 512.0f; - } - - weight = mean < 40 ? 0.0f : - (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f); - plane_summed_weights += weight; - - plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - else - plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - } - } - - if (plane_summed_weights == 0) - return 1.0f; - else - return plane_quality / plane_summed_weights; -} - -double vp8_calc_ssim -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - int lumamask, - double *weight -) -{ - double a, b, c; - double frame_weight; - double ssimv; - - width_y = source->y_width; - height_y = source->y_height; - height_uv = source->uv_height; - width_uv = source->uv_width; - stride_uv = dest->uv_stride; - stride = dest->y_stride; - - lumimask = lumamask; - - luminance = 1; - a = vp8_ssim(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, source->y_width, source->y_height); - luminance = 0; - - frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7)); - - if (frame_weight == 0) - a = b = c = 1.0f; - else - { - b = vp8_ssim(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); - - c = vp8_ssim(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); - } - - ssimv = a * .8 + .1 * (b + c); - - *weight = frame_weight; - - return ssimv; -} - +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif // Google version of SSIM // SSIM #define KERNEL 3 @@ -520,3 +235,174 @@ double vp8_calc_ssimg *ssim_v /= uvsize; return ssim_all; } + + +void ssim_parms_c +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +) +{ + int i,j; + for(i=0;i<16;i++,s+=sp,r+=rp) + { + for(j=0;j<16;j++) + { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void ssim_parms_8x8_c +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + 
unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +) +{ + int i,j; + for(i=0;i<8;i++,s+=sp,r+=rp) + { + for(j=0;j<8;j++) + { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +const static long long c1 = 426148; // (256^2*(.01*255)^2 +const static long long c2 = 3835331; //(256^2*(.03*255)^2 + +static double similarity +( + unsigned long sum_s, + unsigned long sum_r, + unsigned long sum_sq_s, + unsigned long sum_sq_r, + unsigned long sum_sxr, + int count +) +{ + long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2); + + long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ; + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256); +} +static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); +} + +// TODO: (jbb) tried to scale this function such that we may be able to use it +// for distortion metric in mode selection code ( provided we do a reconstruction) +long dssim(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + double ssim3; + long long ssim_n; + long long ssim_d; + + rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2); + + ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ; + + ssim3 = 256 * (ssim_d-ssim_n) / ssim_d; + return (long)( 256*ssim3 * ssim3 ); +} +// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels +// such that the window regions overlap block boundaries to penalize blocking +// artifacts. 
+ +double vp8_ssim2 +( + unsigned char *img1, + unsigned char *img2, + int stride_img1, + int stride_img2, + int width, + int height, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + int i,j; + + double ssim_total=0; + + // we can sample points as frequently as we like start with 1 per 8x8 + for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8) + { + for(j=0; j < width; j+=8 ) + { + ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd); + } + } + ssim_total /= (width/8 * height /8); + return ssim_total; + +} +double vp8_calc_ssim +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + int lumamask, + double *weight, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + double a, b, c; + double ssimv; +//IF_RTCD(&cpi->rtcd.variance) + a = vp8_ssim2(source->y_buffer, dest->y_buffer, + source->y_stride, dest->y_stride, source->y_width, + source->y_height, rtcd); + + b = vp8_ssim2(source->u_buffer, dest->u_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + c = vp8_ssim2(source->v_buffer, dest->v_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index 5befd3b86..bf17ea8b6 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -85,6 +85,19 @@ unsigned int *sse \ ); +#define prototype_ssimpf(sym) \ + void (sym) \ + ( \ + unsigned char *s, \ + int sp, \ + unsigned char *r, \ + int rp, \ + unsigned long *sum_s, \ + unsigned long *sum_r, \ + unsigned long *sum_sq_s, \ + unsigned long *sum_sq_r, \ + unsigned long *sum_sxr \ + ); #define prototype_getmbss(sym) unsigned int (sym)(const short *) @@ -306,6 +319,15 @@ extern prototype_variance2(vp8_variance_get16x16var); #endif extern prototype_sad(vp8_variance_get4x4sse_cs); +#ifndef vp8_ssimpf +#define vp8_ssimpf ssim_parms_c +#endif +extern prototype_ssimpf(vp8_ssimpf) + +#ifndef vp8_ssimpf_8x8 +#define vp8_ssimpf_8x8 ssim_parms_8x8_c +#endif +extern prototype_ssimpf(vp8_ssimpf_8x8) typedef prototype_sad(*vp8_sad_fn_t); typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t); @@ -315,6 +337,10 @@ typedef prototype_variance(*vp8_variance_fn_t); typedef prototype_variance2(*vp8_variance2_fn_t); typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t); typedef prototype_getmbss(*vp8_getmbss_fn_t); + +typedef prototype_ssimpf(*vp8_ssimpf_fn_t) + + typedef struct { vp8_sad_fn_t sad4x4; @@ -365,6 +391,11 @@ typedef struct vp8_sad_multi_d_fn_t sad8x8x4d; vp8_sad_multi_d_fn_t sad4x4x4d; +#if CONFIG_PSNR + vp8_ssimpf_fn_t ssimpf_8x8; + vp8_ssimpf_fn_t ssimpf; +#endif + } vp8_variance_rtcd_vtable_t; typedef struct @@ -378,6 +409,7 @@ typedef struct vp8_sad_multi_fn_t sdx3f; vp8_sad_multi1_fn_t sdx8f; vp8_sad_multi_d_fn_t sdx4df; + } vp8_variance_fn_ptr_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm new file mode 100644 index 000000000..c267cdb54 --- /dev/null +++ b/vp8/encoder/x86/ssim_opt.asm @@ -0,0 +1,215 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddq xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddq xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddq xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse3( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(vp8_ssim_parms_16x16_sse3) +sym(vp8_ssim_parms_16x16_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movq [rdi], xmm15; + mov rdi,arg(5) + movq [rdi], xmm14; + mov rdi,arg(6) + movq [rdi], xmm13; + mov rdi,arg(7) + movq [rdi], xmm12; + mov rdi,arg(8) + movq [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse3( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
+global sym(vp8_ssim_parms_8x8_sse3) +sym(vp8_ssim_parms_8x8_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +NextRow2: + + ;grab source and reference pixels + movq xmm5, [rsi] + movq xmm6, [rdi] + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz NextRow2 + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movq [rdi], xmm15; + mov rdi,arg(5) + movq [rdi], xmm14; + mov rdi,arg(6) + movq [rdi], xmm13; + mov rdi,arg(7) + movq [rdi], xmm12; + mov rdi,arg(8) + movq [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 3158ac12b..5ab364147 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -176,6 +176,25 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) d->dqcoeff ); } +#if CONFIG_PSNR +#if ARCH_X86_64 +typedef void ssimpf +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +); + +extern ssimpf vp8_ssim_parms_16x16_sse3; +extern ssimpf vp8_ssim_parms_8x8_sse3; +#endif +#endif #endif @@ -280,6 +299,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2; cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2; cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2; + + /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */; cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2; @@ -339,9 +360,18 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; +#if CONFIG_PSNR +#if ARCH_X86_64 + cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3; + cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3; +#endif +#endif + } #endif + + #if HAVE_SSE4_1 if (SSE4_1Enabled) { diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index c0ae250f5..670c02280 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -116,6 +116,7 @@ VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm +VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/ssim_opt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm From 3dc382294b32364d365528538abb010a296e0993 Mon Sep 17 00:00:00 2001 From: Jim Bankoski Date: Tue, 8 Mar 2011 15:23:40 -0500 Subject: [PATCH 06/10] vp8cx - psnr converted to call assemblerized sse Change-Id: Ie388d4618c44b131f96b9fe526618b457f020dfa --- vp8/encoder/onyx_if.c | 63 ++++++++++++++++++++++++++++++++++--------- vp8/encoder/ssim.c | 2 +- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 5caaeb933..9ec9f5b7a 100644 --- 
a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -5121,12 +5121,35 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if (cpi->b_calculate_psnr) { double y, u, v; - double sq_error; - double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error); + double ye,ue,ve; + double frame_psnr; + YV12_BUFFER_CONFIG *orig = cpi->Source; + YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer; + int y_samples = orig->y_height * orig->y_width ; + int uv_samples = orig->uv_height * orig->uv_width ; + int t_samples = y_samples + 2 * uv_samples; + long long sq_error; - cpi->total_y += y; - cpi->total_u += u; - cpi->total_v += v; + ye = calc_plane_error(orig->y_buffer, orig->y_stride, + recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height, + IF_RTCD(&cpi->rtcd.variance)); + + ue = calc_plane_error(orig->u_buffer, orig->uv_stride, + recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + ve = calc_plane_error(orig->v_buffer, orig->uv_stride, + recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + sq_error = ye + ue + ve; + + frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error); + + cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye); + cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue); + cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve); cpi->total_sq_error += sq_error; cpi->total += frame_psnr; { @@ -5135,8 +5158,28 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc)); vp8_clear_system_state(); - frame_psnr2 = vp8_calc_psnr(cpi->Source, - &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error); + + ye = calc_plane_error(orig->y_buffer, orig->y_stride, + pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height, + IF_RTCD(&cpi->rtcd.variance)); + + ue = calc_plane_error(orig->u_buffer, orig->uv_stride, + pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + ve = calc_plane_error(orig->v_buffer, orig->uv_stride, + pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, + IF_RTCD(&cpi->rtcd.variance)); + + sq_error = ye + ue + ve; + + frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error); + + cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye); + cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue); + cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve); + cpi->total_sq_error2 += sq_error; + cpi->totalp += frame_psnr2; frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight, @@ -5145,12 +5188,6 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cpi->summed_quality += frame_ssim2 * weight; cpi->summed_weights += weight; - cpi->totalp_y += y2; - cpi->totalp_u += u2; - cpi->totalp_v += v2; - cpi->totalp += frame_psnr2; - cpi->total_sq_error2 += sq_error; - } } diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index d6aa9566b..64d67c6dd 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -387,7 +387,7 @@ double vp8_calc_ssim { double a, b, c; double ssimv; -//IF_RTCD(&cpi->rtcd.variance) + a = vp8_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, dest->y_stride, source->y_width, source->y_height, rtcd); From 2ae91fbef0da3d4f677b15342e7d0e18598f5ada Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Thu, 10 Mar 2011 16:11:39 
+0000 Subject: [PATCH 07/10] 1 Pass CQ and VBR bug fixes Issue 291 highlighted the fact that CQ mode was not working as expected in 1 pass mode, This commit fixes that specific problem but in so doing I also uncovered an overflow issue in the VBR code for 1 pass and some data values not being correctly initialized. For some clips (particularly short clips), the resulting improvement is dramatic. Change-Id: Ieefd6c6e4776eb8f1b0550dbfdfb72f86b33c960 --- vp8/encoder/onyx_if.c | 62 +++++++++++++++++++++++----------- vp8/encoder/ratectrl.c | 75 +++++++++++++++++++++++++++++++----------- 2 files changed, 97 insertions(+), 40 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 797e18b30..9b49f2dcf 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1516,9 +1516,15 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) else cpi->oxcf = *oxcf; + // change includes all joint functionality + vp8_change_config(ptr, oxcf); - // Convert target bandwidth from Kbit/s to Bit/s - cpi->oxcf.target_bandwidth *= 1000; + // Initialize active best and worst q and average q values. + cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; + + // Initialise the starting buffer levels cpi->oxcf.starting_buffer_level = rescale(cpi->oxcf.starting_buffer_level, cpi->oxcf.target_bandwidth, 1000); @@ -1526,10 +1532,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->buffer_level = cpi->oxcf.starting_buffer_level; cpi->bits_off_target = cpi->oxcf.starting_buffer_level; - cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; - cpi->active_best_quality = cpi->oxcf.best_allowed_q; - cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; - cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; @@ -1538,9 +1540,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->total_actual_bits = 0; cpi->total_target_vs_actual = 0; - // change includes all joint functionality - vp8_change_config(ptr, oxcf); - #if VP8_TEMPORAL_ALT_REF cpi->use_weighted_temporal_filter = 0; @@ -1668,7 +1667,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) } - cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + cpi->baseline_gf_interval = + cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG; @@ -1679,7 +1679,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->refresh_entropy_probs = 1; if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3) - cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions; + cm->multi_token_partition = + (TOKEN_PARTITION) cpi->oxcf.token_partitions; setup_features(cpi); @@ -1700,12 +1701,12 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->oxcf.starting_buffer_level = 60000; cpi->oxcf.optimal_buffer_level = 60000; cpi->oxcf.maximum_buffer_size = 240000; - } // Convert target bandwidth from Kbit/s to Bit/s cpi->oxcf.target_bandwidth *= 1000; + // Set or reset optimal and maximum buffer levels. 
if (cpi->oxcf.optimal_buffer_level == 0) cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; else @@ -1720,7 +1721,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) rescale(cpi->oxcf.maximum_buffer_size, cpi->oxcf.target_bandwidth, 1000); + // Set up frame rate and related parameters rate control values. vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate); + + // Set absolute upper and lower quality limits cpi->worst_quality = cpi->oxcf.worst_allowed_q; cpi->best_quality = cpi->oxcf.best_allowed_q; @@ -1749,9 +1753,9 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->cq_target_quality = cpi->oxcf.cq_level; // Only allow dropped frames in buffered mode - cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; + cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; - cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; + cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; if (!cm->use_bilinear_mc_filter) cm->mcomp_filter_type = SIXTAP; @@ -1766,7 +1770,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->horiz_scale = cpi->horiz_scale; cm->vert_scale = cpi->vert_scale ; - cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8 + // As per VP8 + cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) if (cpi->oxcf.Sharpness > 7) @@ -1787,8 +1792,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs; } - if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width || - ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height || + if (((cm->Width + 15) & 0xfffffff0) != + cm->yv12_fb[cm->lst_fb_idx].y_width || + ((cm->Height + 15) & 0xfffffff0) != + cm->yv12_fb[cm->lst_fb_idx].y_height || cm->yv12_fb[cm->lst_fb_idx].y_width == 0) { alloc_raw_frame_buffers(cpi); @@ -3637,11 +3644,12 @@ static void encode_frame_to_data_rate } } - // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames - // to prevent bits just going to waste. + // If CBR and the buffer is as full then it is reasonable to allow + // higher quality on the frames to prevent bits just going to waste. 
if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause + // Note that the use of >= here elliminates the risk of a devide + // by 0 error in the else if clause if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) cpi->active_best_quality = cpi->best_quality; @@ -3654,6 +3662,20 @@ static void encode_frame_to_data_rate } } } + // Make sure constrained quality mode limits are adhered to for the first + // few frames of one pass encodes + else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) + { + if ( (cm->frame_type == KEY_FRAME) || + cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame ) + { + cpi->active_best_quality = cpi->best_quality; + } + else if (cpi->active_best_quality < cpi->cq_target_quality) + { + cpi->active_best_quality = cpi->cq_target_quality; + } + } // Clip the active best and worst quality values to limits if (cpi->active_worst_quality > cpi->worst_quality) diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index bfffe43d9..9797f5f25 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -842,7 +842,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) { int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100; - if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level)) + if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || + (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level)) { int percent_low = 0; @@ -851,9 +852,12 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) // If we are are below the optimal buffer fullness level and adherence // to buffering contraints is important to the end useage then adjust // the per frame target. - if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && + (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) { - percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits; + percent_low = + (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / + one_percent_bits; if (percent_low > 100) percent_low = 100; @@ -864,7 +868,8 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) else if (cpi->bits_off_target < 0) { // Adjust per frame data target downwards to compensate. - percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8)); + percent_low = (int)(100 * -cpi->bits_off_target / + (cpi->total_byte_count * 8)); if (percent_low > 100) percent_low = 100; @@ -873,39 +878,60 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) } // lower the target bandwidth for this frame. - cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100; + cpi->this_frame_target = + (cpi->this_frame_target * (100 - (percent_low / 2))) / 100; - // Are we using allowing control of active_worst_allowed_q according to buffer level. + // Are we using allowing control of active_worst_allowed_q + // according to buffer level. if (cpi->auto_worst_q) { int critical_buffer_level; - // For streaming applications the most important factor is cpi->buffer_level as this takes - // into account the specified short term buffering constraints. However, hitting the long - // term clip data rate target is also important. + // For streaming applications the most important factor is + // cpi->buffer_level as this takes into account the + // specified short term buffering constraints. 
However, + // hitting the long term clip data rate target is also + // important. if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - // Take the smaller of cpi->buffer_level and cpi->bits_off_target - critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target; + // Take the smaller of cpi->buffer_level and + // cpi->bits_off_target + critical_buffer_level = + (cpi->buffer_level < cpi->bits_off_target) + ? cpi->buffer_level : cpi->bits_off_target; } - // For local file playback short term buffering contraints are less of an issue + // For local file playback short term buffering contraints + // are less of an issue else { - // Consider only how we are doing for the clip as a whole + // Consider only how we are doing for the clip as a + // whole critical_buffer_level = cpi->bits_off_target; } - // Set the active worst quality based upon the selected buffer fullness number. + // Set the active worst quality based upon the selected + // buffer fullness number. if (critical_buffer_level < cpi->oxcf.optimal_buffer_level) { - if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4)) + if ( critical_buffer_level > + (cpi->oxcf.optimal_buffer_level >> 2) ) { - int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi; - int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4)); + INT64 qadjustment_range = + cpi->worst_quality - cpi->ni_av_qi; + INT64 above_base = + (critical_buffer_level - + (cpi->oxcf.optimal_buffer_level >> 2)); - // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level) - // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4) - cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4)); + // Step active worst quality down from + // cpi->ni_av_qi when (critical_buffer_level == + // cpi->optimal_buffer_level) to + // cpi->worst_quality when + // (critical_buffer_level == + // cpi->optimal_buffer_level >> 2) + cpi->active_worst_quality = + cpi->worst_quality - + ((qadjustment_range * above_base) / + (cpi->oxcf.optimal_buffer_level*3>>2)); } else { @@ -965,6 +991,15 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) // Set the active worst quality cpi->active_worst_quality = cpi->worst_quality; } + + // Special trap for constrained quality mode + // "active_worst_quality" may never drop below cq level + // for any frame type. + if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && + cpi->active_worst_quality < cpi->cq_target_quality) + { + cpi->active_worst_quality = cpi->cq_target_quality; + } } // Test to see if we have to drop a frame From 6e737484926c123d057f075a3cd385fb926a0372 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Fri, 11 Mar 2011 14:51:40 +0000 Subject: [PATCH 08/10] Clean up of vp8_init_config() Clean up vp8_init_config() a bit and remove null pointer case, as this code can't be called any more and is not an adequate trap anyway, as a null pointer would cause exceptions before hitting the test. 
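The point about the trap being inadequate deserves a line of illustration: a null-pointer test placed after the pointer has already been used offers no protection, because the earlier access faults (or lets the compiler treat the pointer as non-null and discard the check) before the early return is ever reached. A minimal, generic C sketch of the anti-pattern and its fix; the names here are illustrative, not the libvpx code:

#include <stdio.h>

typedef struct { int version; } config_t;

/* Anti-pattern: the structure is written before the guard, so a NULL
 * argument faults on the first line, and a compiler may drop the later
 * check as dead because cfg has already been dereferenced. */
static void init_config_broken(config_t *cfg, int version)
{
    cfg->version = version;    /* crashes first if cfg == NULL */

    if (!cfg)                  /* dead test, provides no protection */
        return;
}

/* If a guard is wanted at all, it has to come before any use. */
static int init_config_checked(config_t *cfg, int version)
{
    if (!cfg)
        return -1;

    cfg->version = version;
    return 0;
}

int main(void)
{
    config_t cfg;
    init_config_broken(&cfg, 3);
    printf("%d\n", init_config_checked(&cfg, 3));
    return 0;
}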
Change-Id: I937c00167cc039b3aa3f645f29c319d58ae8d3ee --- vp8/encoder/onyx_if.c | 45 +------------------------------------------ 1 file changed, 1 insertion(+), 44 deletions(-) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index ce262b1f4..8965634fe 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1458,8 +1458,7 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) VP8_COMP *cpi = (VP8_COMP *)(ptr); VP8_COMMON *cm = &cpi->common; - if (!cpi) - return; + cpi->oxcf = *oxcf; cpi->auto_gold = 1; cpi->auto_adjust_gold_quantizer = 1; @@ -1471,47 +1470,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->version = oxcf->Version; vp8_setup_version(cm); - if (oxcf == 0) - { - cpi->pass = 0; - - cpi->auto_worst_q = 0; - cpi->oxcf.best_allowed_q = MINQ; - cpi->oxcf.worst_allowed_q = MAXQ; - cpi->oxcf.cq_level = MINQ; - - cpi->oxcf.end_usage = USAGE_STREAM_FROM_SERVER; - cpi->oxcf.starting_buffer_level = 4000; - cpi->oxcf.optimal_buffer_level = 5000; - cpi->oxcf.maximum_buffer_size = 6000; - cpi->oxcf.under_shoot_pct = 90; - cpi->oxcf.allow_df = 0; - cpi->oxcf.drop_frames_water_mark = 20; - - cpi->oxcf.allow_spatial_resampling = 0; - cpi->oxcf.resample_down_water_mark = 40; - cpi->oxcf.resample_up_water_mark = 60; - - cpi->oxcf.fixed_q = cpi->interquantizer; - - cpi->filter_type = NORMAL_LOOPFILTER; - - if (cm->simpler_lpf) - cpi->filter_type = SIMPLE_LOOPFILTER; - - cpi->compressor_speed = 1; - cpi->horiz_scale = 0; - cpi->vert_scale = 0; - cpi->oxcf.two_pass_vbrbias = 50; - cpi->oxcf.two_pass_vbrmax_section = 400; - cpi->oxcf.two_pass_vbrmin_section = 0; - - cpi->oxcf.Sharpness = 0; - cpi->oxcf.noise_sensitivity = 0; - } - else - cpi->oxcf = *oxcf; - // change includes all joint functionality vp8_change_config(ptr, oxcf); @@ -1537,7 +1495,6 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->total_target_vs_actual = 0; #if VP8_TEMPORAL_ALT_REF - { int i; From 5db0eeea21a33820cb4e0adf171ed60868666bb6 Mon Sep 17 00:00:00 2001 From: John Koleszar Date: Fri, 11 Mar 2011 11:27:08 -0500 Subject: [PATCH 09/10] Only enable ssim_opt.asm on X86_64 Fix compiling on 32 bit x86. Change-Id: I6210573e1d9287ac49acbe3d7e5181e309316107 --- vp8/vp8cx.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 670c02280..8f0681fb9 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -116,7 +116,7 @@ VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/ssim_opt.asm +VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm From 27972d2c1d53a0ff831c1f05d40a3720cf45aec7 Mon Sep 17 00:00:00 2001 From: John Koleszar Date: Fri, 11 Mar 2011 11:35:38 -0500 Subject: [PATCH 10/10] Move build_intra_predictors_mby to RTCD framework The vp8_build_intra_predictors_mby and vp8_build_intra_predictors_mby_s functions had global function pointers rather than using the RTCD framework. This can show up as a potential data race with tools such as helgrind. See https://bugzilla.mozilla.org/show_bug.cgi?id=640935 for an example. 
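The race that helgrind reports is easy to reproduce in miniature: a writable, process-wide function pointer is assigned during codec initialization, so two instances being set up on different threads both write the same global. Keeping the pointer in a per-context table that is filled in once, while the context is still private to its creating thread, removes the shared mutable state. A simplified sketch of the two shapes under assumed names (this is not the actual RTCD vtable or macro layer):

#include <stddef.h>

typedef struct MACROBLOCKD MACROBLOCKD;

/* Two interchangeable implementations, assumed names for illustration. */
static void build_intra_predictors_c(MACROBLOCKD *x)    { (void)x; }
static void build_intra_predictors_simd(MACROBLOCKD *x) { (void)x; }

/* Old shape: one process-wide, writable pointer.  Every context that
 * initializes the codec writes it, so concurrent initialization races. */
void (*build_intra_predictors_ptr)(MACROBLOCKD *) = build_intra_predictors_c;

/* New shape: the pointer lives in a per-context vtable and is written
 * exactly once, before the context becomes visible to other threads. */
typedef struct {
    void (*build_intra_predictors)(MACROBLOCKD *x);
} recon_vtable_t;

typedef struct {
    recon_vtable_t recon;
} codec_ctx_t;

static void codec_ctx_init(codec_ctx_t *ctx, int have_simd)
{
    ctx->recon.build_intra_predictors =
        have_simd ? build_intra_predictors_simd : build_intra_predictors_c;
}

int main(void)
{
    codec_ctx_t ctx;
    codec_ctx_init(&ctx, 0);
    ctx.recon.build_intra_predictors(NULL); /* dispatch through the vtable */
    return 0;
}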
Change-Id: I29c407f828ac2bddfc039f852f138de5de888534 --- vp8/common/arm/arm_systemdependent.c | 35 ++++------------------------ vp8/common/arm/recon_arm.h | 10 ++++++++ vp8/common/generic/systemdependent.c | 13 ++++------- vp8/common/recon.h | 19 +++++++++++++++ vp8/common/reconintra.h | 7 ------ vp8/decoder/decodframe.c | 7 +++--- vp8/encoder/encodeframe.c | 3 ++- vp8/encoder/encodeintra.c | 2 +- vp8/encoder/pickinter.c | 3 ++- vp8/encoder/rdopt.c | 6 +++-- 10 files changed, 50 insertions(+), 55 deletions(-) diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index 69e1bdff4..bd5c0759d 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -19,14 +19,6 @@ #include "vp8/common/idct.h" #include "vp8/common/onyxc_int.h" -extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x); - -extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x); - void vp8_arch_arm_common_init(VP8_COMMON *ctx) { #if CONFIG_RUNTIME_CPU_DETECT @@ -106,31 +98,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->recon.recon2 = vp8_recon2b_neon; rtcd->recon.recon4 = vp8_recon4b_neon; rtcd->recon.recon_mb = vp8_recon_mb_neon; - + rtcd->recon.build_intra_predictors_mby = + vp8_build_intra_predictors_mby_neon; + rtcd->recon.build_intra_predictors_mby_s = + vp8_build_intra_predictors_mby_s_neon; } #endif #endif - -#if HAVE_ARMV6 -#if CONFIG_RUNTIME_CPU_DETECT - if (has_media) -#endif - { - vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; - vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; - } -#endif - -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (has_neon) -#endif - { - vp8_build_intra_predictors_mby_ptr = - vp8_build_intra_predictors_mby_neon; - vp8_build_intra_predictors_mby_s_ptr = - vp8_build_intra_predictors_mby_s_neon; - } -#endif } diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h index b46b7fc7d..377cb2a07 100644 --- a/vp8/common/arm/recon_arm.h +++ b/vp8/common/arm/recon_arm.h @@ -53,6 +53,9 @@ extern prototype_copy_block(vp8_copy_mem16x16_neon); extern prototype_recon_macroblock(vp8_recon_mb_neon); +extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon); +extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon); + #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_recon_recon #define vp8_recon_recon vp8_recon_b_neon @@ -74,6 +77,13 @@ extern prototype_recon_macroblock(vp8_recon_mb_neon); #undef vp8_recon_recon_mb #define vp8_recon_recon_mb vp8_recon_mb_neon + +#undef vp8_recon_build_intra_predictors_mby +#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon + +#undef vp8_recon_build_intra_predictors_mby_s +#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon + #endif #endif diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index c843d86fe..5c6464772 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -20,12 +20,6 @@ extern void vp8_arch_x86_common_init(VP8_COMMON *ctx); extern void vp8_arch_arm_common_init(VP8_COMMON *ctx); -void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); -extern void 
vp8_build_intra_predictors_mby(MACROBLOCKD *x); - -void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); - void vp8_machine_specific_config(VP8_COMMON *ctx) { #if CONFIG_RUNTIME_CPU_DETECT @@ -45,6 +39,10 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) rtcd->recon.recon4 = vp8_recon4b_c; rtcd->recon.recon_mb = vp8_recon_mb_c; rtcd->recon.recon_mby = vp8_recon_mby_c; + rtcd->recon.build_intra_predictors_mby = + vp8_build_intra_predictors_mby; + rtcd->recon.build_intra_predictors_mby_s = + vp8_build_intra_predictors_mby_s; rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; @@ -75,9 +73,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) #endif #endif - /* Pure C: */ - vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby; - vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s; #if ARCH_X86 || ARCH_X86_64 vp8_arch_x86_common_init(ctx); diff --git a/vp8/common/recon.h b/vp8/common/recon.h index e7df90a71..e608f218c 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -23,6 +23,9 @@ #define prototype_recon_macroblock(sym) \ void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x) +#define prototype_build_intra_predictors(sym) \ + void sym(MACROBLOCKD *x) + struct vp8_recon_rtcd_vtable; #if ARCH_X86 || ARCH_X86_64 @@ -73,9 +76,23 @@ extern prototype_recon_macroblock(vp8_recon_recon_mb); #endif extern prototype_recon_macroblock(vp8_recon_recon_mby); +#ifndef vp8_recon_build_intra_predictors_mby +#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby +#endif +extern prototype_build_intra_predictors\ + (vp8_recon_build_intra_predictors_mby); + +#ifndef vp8_recon_build_intra_predictors_mby_s +#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s +#endif +extern prototype_build_intra_predictors\ + (vp8_recon_build_intra_predictors_mby_s); + + typedef prototype_copy_block((*vp8_copy_block_fn_t)); typedef prototype_recon_block((*vp8_recon_fn_t)); typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t)); +typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t)); typedef struct vp8_recon_rtcd_vtable { vp8_copy_block_fn_t copy16x16; @@ -86,6 +103,8 @@ typedef struct vp8_recon_rtcd_vtable vp8_recon_fn_t recon4; vp8_recon_mb_fn_t recon_mb; vp8_recon_mb_fn_t recon_mby; + vp8_build_intra_pred_fn_t build_intra_predictors_mby_s; + vp8_build_intra_pred_fn_t build_intra_predictors_mby; } vp8_recon_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h index 988b43a77..4025a5307 100644 --- a/vp8/common/reconintra.h +++ b/vp8/common/reconintra.h @@ -14,13 +14,6 @@ extern void init_intra_left_above_pixels(MACROBLOCKD *x); -extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x); -extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x); - extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x); extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x); diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index c454bbc70..3d4d9b961 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -115,8 +115,8 @@ static void 
skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) { vp8_build_intra_predictors_mbuv_s(xd); - vp8_build_intra_predictors_mby_s_ptr(xd); - + RECON_INVOKE(&pbi->common.rtcd.recon, + build_intra_predictors_mby_s)(xd); } else { @@ -214,7 +214,8 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) if (xd->mode_info_context->mbmi.mode != B_PRED) { - vp8_build_intra_predictors_mby_ptr(xd); + RECON_INVOKE(&pbi->common.rtcd.recon, + build_intra_predictors_mby)(xd); } else { vp8_intra_prediction_down_copy(xd); } diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 0ced6e7b0..0613b9070 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -1184,7 +1184,8 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) int distortion2; x->e_mbd.mode_info_context->mbmi.mode = mode; - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); rate2 = x->mbmode_cost[x->e_mbd.frame_type][mode]; this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index cd66016cc..7b81c8d95 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -80,7 +80,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { int b; - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd); ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 6ab85adbc..0790d3517 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -664,7 +664,8 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec case V_PRED: case H_PRED: case TM_PRED: - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode]; this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index b0dcfe0a4..c706c575f 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -745,7 +745,8 @@ int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, { x->e_mbd.mode_info_context->mbmi.mode = mode; - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); macro_block_yrd(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd.encodemb)); rate = ratey + x->mbmode_cost[x->e_mbd.frame_type] @@ -2038,7 +2039,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int case H_PRED: case TM_PRED: x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; - vp8_build_intra_predictors_mby_ptr(&x->e_mbd); + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ; rate2 += rate_y; distortion2 += distortion;
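Returning to the ratectrl.c change in PATCH 07 above: widening qadjustment_range and above_base to INT64 matters because buffer levels are held in bits and rescaled against the target bandwidth, so at high data rates the product qadjustment_range * above_base can exceed what a 32-bit int holds. The adjusted value is a linear interpolation of active_worst_quality between the running average Q (when the critical buffer level sits at the optimal level) and worst_quality (when it has fallen to a quarter of the optimal level), and constrained quality mode then floors the result at the CQ level. A standalone sketch of that arithmetic, with illustrative numbers only:

#include <stdint.h>
#include <stdio.h>

/* Step the active worst quality between ni_av_qi (buffer at the optimal
 * level) and worst_quality (buffer at optimal/4), using 64-bit
 * intermediates so qadjustment_range * above_base cannot overflow. */
static int active_worst_q(int64_t critical_buffer_level,
                          int64_t optimal_buffer_level,
                          int worst_quality, int ni_av_qi,
                          int cq_floor /* pass -1 when not in CQ mode */)
{
    int q = worst_quality;

    if (critical_buffer_level > (optimal_buffer_level >> 2)) {
        int64_t qadjustment_range = worst_quality - ni_av_qi;
        int64_t above_base =
            critical_buffer_level - (optimal_buffer_level >> 2);

        q = worst_quality -
            (int)((qadjustment_range * above_base) /
                  ((optimal_buffer_level * 3) >> 2));
    }

    /* Constrained quality: never let the active worst Q drop below
     * the configured CQ level, for any frame type. */
    if (cq_floor >= 0 && q < cq_floor)
        q = cq_floor;

    return q;
}

int main(void)
{
    /* Illustrative values: 5 Mbit optimal buffer level. */
    printf("%d\n", active_worst_q(4500000, 5000000, 63, 40, 20)); /* near full */
    printf("%d\n", active_worst_q(1500000, 5000000, 63, 40, 20)); /* near empty */
    return 0;
}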