From 15f9bea73b136df73ee5efd1589e19924162e8fe Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 11 Jan 2011 09:41:57 -0500 Subject: [PATCH 1/5] update sse2 regular quantizer about ~5% gain on 32bit. disabled for 64bit unset executable bit on ssse3 version (cosmetic) Change-Id: I1a5860839eb294ce4261f819caea2dcfa78e57ca --- vp8/encoder/quantize.c | 9 +- vp8/encoder/x86/quantize_sse2.asm | 334 +++++++++++-------------- vp8/encoder/x86/quantize_ssse3.asm | 0 vp8/encoder/x86/quantize_x86.h | 10 +- vp8/encoder/x86/x86_csystemdependent.c | 49 ++-- 5 files changed, 168 insertions(+), 234 deletions(-) mode change 100755 => 100644 vp8/encoder/x86/quantize_ssse3.asm diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index be9f26c7f..4a2329fc1 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -129,9 +129,6 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) rc = vp8_default_zig_zag1d[i]; z = coeff_ptr[rc]; - //if ( i == 0 ) - // zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2; - //else zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value; zbin_boost_ptr ++; @@ -144,13 +141,13 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) y = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc]; // quantize (x) x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; // dequantized value if (y) { eob = i; // last nonzero coeffs - zbin_boost_ptr = &b->zrun_zbin_boost[0]; // reset zero runlength + zbin_boost_ptr = b->zrun_zbin_boost; // reset zero runlength } } } diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 57bf3c93a..45e1a2ad3 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -11,220 +11,169 @@ %include "vpx_ports/x86_abi_support.asm" -;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; const int *default_zig_zag, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr, +;int vp8_regular_quantize_b_impl_sse2( +; short *coeff_ptr, +; short *zbin_ptr, +; short *qcoeff_ptr, +; short *dequant_ptr, +; const int *default_zig_zag, +; short *round_ptr, +; short *quant_ptr, +; short *dqcoeff_ptr, ; unsigned short zbin_oq_value, -; short *zbin_boost_ptr); +; short *zbin_boost_ptr, +; short *quant_shift); ; global sym(vp8_regular_quantize_b_impl_sse2) sym(vp8_regular_quantize_b_impl_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 10 + SHADOW_ARGS_TO_STACK 11 + SAVE_XMM push rsi push rdi push rbx + ALIGN_STACK 16, rax + %define abs_minus_zbin 0 + %define temp_qcoeff 32 + %define qcoeff 64 + %define eob_tmp 96 + %define stack_size 112 + sub rsp, stack_size ; end prolog - ALIGN_STACK 16, rax - - %define abs_minus_zbin_lo 0 - %define abs_minus_zbin_hi 16 - %define temp_qcoeff_lo 32 - %define temp_qcoeff_hi 48 - %define save_xmm6 64 - %define save_xmm7 80 - %define eob 96 - - %define vp8_regularquantizeb_stack_size eob + 16 - - sub rsp, vp8_regularquantizeb_stack_size - - movdqa OWORD PTR[rsp + save_xmm6], xmm6 - movdqa OWORD PTR[rsp + save_xmm7], xmm7 - - mov rdx, arg(0) ;coeff_ptr - mov eax, arg(8) ;zbin_oq_value - - mov rcx, arg(1) ;zbin_ptr - movd xmm7, eax + mov rdx, arg(0) ; coeff_ptr + mov rcx, arg(1) ; zbin_ptr + movd xmm7, arg(8) ; zbin_oq_value + mov rdi, arg(5) ; round_ptr + mov rsi, arg(6) ; quant_ptr + ; z movdqa xmm0, OWORD PTR[rdx] movdqa xmm4, 
OWORD PTR[rdx + 16] + pshuflw xmm7, xmm7, 0 + punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value + movdqa xmm1, xmm0 movdqa xmm5, xmm4 - psraw xmm0, 15 ;sign of z (aka sz) - psraw xmm4, 15 ;sign of z (aka sz) - - pxor xmm1, xmm0 - pxor xmm5, xmm4 - - movdqa xmm2, OWORD PTR[rcx] ;load zbin_ptr - movdqa xmm3, OWORD PTR[rcx + 16] ;load zbin_ptr - - pshuflw xmm7, xmm7, 0 - psubw xmm1, xmm0 ;x = abs(z) - - punpcklwd xmm7, xmm7 ;duplicated zbin_oq_value - psubw xmm5, xmm4 ;x = abs(z) - - paddw xmm2, xmm7 - paddw xmm3, xmm7 - - psubw xmm1, xmm2 ;sub (zbin_ptr + zbin_oq_value) - psubw xmm5, xmm3 ;sub (zbin_ptr + zbin_oq_value) - - mov rdi, arg(5) ;round_ptr - mov rsi, arg(6) ;quant_ptr - - movdqa OWORD PTR[rsp + abs_minus_zbin_lo], xmm1 - movdqa OWORD PTR[rsp + abs_minus_zbin_hi], xmm5 - - paddw xmm1, xmm2 ;add (zbin_ptr + zbin_oq_value) back - paddw xmm5, xmm3 ;add (zbin_ptr + zbin_oq_value) back - - movdqa xmm2, OWORD PTR[rdi] - movdqa xmm3, OWORD PTR[rsi] - - movdqa xmm6, OWORD PTR[rdi + 16] - movdqa xmm7, OWORD PTR[rsi + 16] - - paddw xmm1, xmm2 - paddw xmm5, xmm6 - - pmulhw xmm1, xmm3 - pmulhw xmm5, xmm7 - - mov rsi, arg(2) ;qcoeff_ptr - pxor xmm6, xmm6 + ; sz + psraw xmm0, 15 + psraw xmm4, 15 + ; (z ^ sz) pxor xmm1, xmm0 pxor xmm5, xmm4 + ; x = abs(z) psubw xmm1, xmm0 psubw xmm5, xmm4 - movdqa OWORD PTR[rsp + temp_qcoeff_lo], xmm1 - movdqa OWORD PTR[rsp + temp_qcoeff_hi], xmm5 + movdqa xmm2, OWORD PTR[rcx] + movdqa xmm3, OWORD PTR[rcx + 16] - movdqa OWORD PTR[rsi], xmm6 ;zero qcoeff - movdqa OWORD PTR[rsi + 16], xmm6 ;zero qcoeff + ; *zbin_ptr + zbin_oq_value + paddw xmm2, xmm7 + paddw xmm3, xmm7 - xor rax, rax - mov rcx, -1 + ; x - (*zbin_ptr + zbin_oq_value) + psubw xmm1, xmm2 + psubw xmm5, xmm3 + movdqa OWORD PTR[rsp + abs_minus_zbin], xmm1 + movdqa OWORD PTR[rsp + abs_minus_zbin + 16], xmm5 - mov [rsp + eob], rcx - mov rsi, arg(9) ;zbin_boost_ptr - - mov rbx, arg(4) ;default_zig_zag - -rq_zigzag_loop: - movsxd rcx, DWORD PTR[rbx + rax*4] ;now we have rc - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - - sub edx, edi ;x - zbin - jl rq_zigzag_1 - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1 - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1: - movsxd rcx, DWORD PTR[rbx + rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1a - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1a - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1a: - movsxd rcx, DWORD PTR[rbx + rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1b - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1b - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1b: - movsxd rcx, DWORD PTR[rbx + 
rax*4 + 4] - movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin - lea rsi, [rsi + 2] ;zbin_boost_ptr++ - - movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] - lea rax, [rax + 1] - - sub edx, edi ;x - zbin - jl rq_zigzag_1c - - mov rdi, arg(2) ;qcoeff_ptr - - movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] - - cmp edx, 0 - je rq_zigzag_1c - - mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc] - - mov rsi, arg(9) ;zbin_boost_ptr - mov [rsp + eob], rax ;eob = i - -rq_zigzag_1c: - lea rax, [rax + 1] - - cmp rax, 16 - jl rq_zigzag_loop - - mov rdi, arg(2) ;qcoeff_ptr - mov rcx, arg(3) ;dequant_ptr - mov rsi, arg(7) ;dqcoeff_ptr + ; add (zbin_ptr + zbin_oq_value) back + paddw xmm1, xmm2 + paddw xmm5, xmm3 movdqa xmm2, OWORD PTR[rdi] - movdqa xmm3, OWORD PTR[rdi + 16] + movdqa xmm6, OWORD PTR[rdi + 16] + + movdqa xmm3, OWORD PTR[rsi] + movdqa xmm7, OWORD PTR[rsi + 16] + + ; x + round + paddw xmm1, xmm2 + paddw xmm5, xmm6 + + ; y = x * quant_ptr >> 16 + pmulhw xmm3, xmm1 + pmulhw xmm7, xmm5 + + ; y += x + paddw xmm1, xmm3 + paddw xmm5, xmm7 + + movdqa OWORD PTR[rsp + temp_qcoeff], xmm1 + movdqa OWORD PTR[rsp + temp_qcoeff + 16], xmm5 + + pxor xmm6, xmm6 + ; zero qcoeff + movdqa OWORD PTR[rsp + qcoeff], xmm6 + movdqa OWORD PTR[rsp + qcoeff + 16], xmm6 + + mov [rsp + eob_tmp], DWORD -1 ; eob + mov rsi, arg(9) ; zbin_boost_ptr + mov rdi, arg(4) ; default_zig_zag + mov rax, arg(10) ; quant_shift_ptr + +%macro ZIGZAG_LOOP 2 +rq_zigzag_loop_%1: + movsxd rdx, DWORD PTR[rdi + (%1 * 4)] ; rc + movsx ebx, WORD PTR [rsi] ; *zbin_boost_ptr + lea rsi, [rsi + 2] ; zbin_boost_ptr++ + + ; x + movsx ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2] + + ; if (x >= zbin) + sub ecx, ebx ; x - zbin + jl rq_zigzag_loop_%2 ; x < zbin + + movsx ebx, WORD PTR[rsp + temp_qcoeff + rdx *2] + + ; downshift by quant_shift[rdx] + movsx ecx, WORD PTR[rax + rdx*2] ; quant_shift_ptr[rc] + sar ebx, cl ; also sets Z bit + je rq_zigzag_loop_%2 ; !y + mov WORD PTR[rsp + qcoeff + rdx * 2], bx ;qcoeff_ptr[rc] = temp_qcoeff[rc] + + mov rsi, arg(9) ; reset to b->zrun_zbin_boost + mov [rsp + eob_tmp], DWORD %1 ; eob = i +%endmacro +ZIGZAG_LOOP 0, 1 +ZIGZAG_LOOP 1, 2 +ZIGZAG_LOOP 2, 3 +ZIGZAG_LOOP 3, 4 +ZIGZAG_LOOP 4, 5 +ZIGZAG_LOOP 5, 6 +ZIGZAG_LOOP 6, 7 +ZIGZAG_LOOP 7, 8 +ZIGZAG_LOOP 8, 9 +ZIGZAG_LOOP 9, 10 +ZIGZAG_LOOP 10, 11 +ZIGZAG_LOOP 11, 12 +ZIGZAG_LOOP 12, 13 +ZIGZAG_LOOP 13, 14 +ZIGZAG_LOOP 14, 15 +ZIGZAG_LOOP 15, end +rq_zigzag_loop_end: + + mov rbx, arg(2) ; qcoeff_ptr + mov rcx, arg(3) ; dequant_ptr + mov rsi, arg(7) ; dqcoeff_ptr + mov rax, [rsp + eob_tmp] ; eob + + movdqa xmm2, OWORD PTR[rsp + qcoeff] + movdqa xmm3, OWORD PTR[rsp + qcoeff + 16] + + ; y ^ sz + pxor xmm2, xmm0 + pxor xmm3, xmm4 + ; x = (y ^ sz) - sz + psubw xmm2, xmm0 + psubw xmm3, xmm4 movdqa xmm0, OWORD PTR[rcx] movdqa xmm1, OWORD PTR[rcx + 16] @@ -232,23 +181,20 @@ rq_zigzag_1c: pmullw xmm0, xmm2 pmullw xmm1, xmm3 - movdqa OWORD PTR[rsi], xmm0 ;store dqcoeff - movdqa OWORD PTR[rsi + 16], xmm1 ;store dqcoeff - - mov rax, [rsp + eob] - - movdqa xmm6, OWORD PTR[rsp + save_xmm6] - movdqa xmm7, OWORD PTR[rsp + save_xmm7] + movdqa OWORD PTR[rbx], xmm2 + movdqa OWORD PTR[rbx + 16], xmm3 + movdqa OWORD PTR[rsi], xmm0 ; store dqcoeff + movdqa OWORD PTR[rsi + 16], xmm1 ; store dqcoeff add rax, 1 - add rsp, vp8_regularquantizeb_stack_size - pop rsp - ; begin epilog + add rsp, stack_size + pop rsp pop rbx pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm old mode 
100755 new mode 100644 diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h index b5b22c022..266efb446 100644 --- a/vp8/encoder/x86/quantize_x86.h +++ b/vp8/encoder/x86/quantize_x86.h @@ -27,11 +27,11 @@ extern prototype_quantize_block(vp8_regular_quantize_b_sse2); #if !CONFIG_RUNTIME_CPU_DETECT -/* The sse2 quantizer has not been updated to match the new exact - * quantizer introduced in commit e04e2935 - *#undef vp8_quantize_quantb - *#define vp8_quantize_quantb vp8_regular_quantize_b_sse2 - */ +// Currently, this function realizes a gain on x86 and a loss on x86_64 +#if ARCH_X86 +#undef vp8_quantize_quantb +#define vp8_quantize_quantb vp8_regular_quantize_b_sse2 +#endif #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index c7dffc443..c1ed080eb 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -108,37 +108,26 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr, - short *qcoeff_ptr,short *dequant_ptr, - const int *default_zig_zag, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr, - unsigned short zbin_oq_value, - short *zbin_boost_ptr); + short *qcoeff_ptr,short *dequant_ptr, + const int *default_zig_zag, short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr, + unsigned short zbin_oq_value, + short *zbin_boost_ptr, + short *quant_shift_ptr); void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d) { - short *zbin_boost_ptr = b->zrun_zbin_boost; - short *coeff_ptr = b->coeff; - short *zbin_ptr = b->zbin; - short *round_ptr = b->round; - short *quant_ptr = b->quant; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - short zbin_oq_value = b->zbin_extra; - - d->eob = vp8_regular_quantize_b_impl_sse2( - coeff_ptr, - zbin_ptr, - qcoeff_ptr, - dequant_ptr, - vp8_default_zig_zag1d, - - round_ptr, - quant_ptr, - dqcoeff_ptr, - zbin_oq_value, - zbin_boost_ptr - ); + d->eob = vp8_regular_quantize_b_impl_sse2(b->coeff, + b->zbin, + d->qcoeff, + d->dequant, + vp8_default_zig_zag1d, + b->round, + b->quant, + d->dqcoeff, + b->zbin_extra, + b->zrun_zbin_boost, + b->quant_shift); } int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); @@ -307,7 +296,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2; cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2; - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/ +#if ARCH_X86 + cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2; +#endif cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2; cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2; From ce6c954d2e643c74eb77ca36884c541b1d97fbdb Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Tue, 18 Jan 2011 14:19:52 -0500 Subject: [PATCH 2/5] Modify calling of NEON code in sub-pixel search In vp8_find_best_sub_pixel_step_iteratively(), many times xoffset and yoffset are specific values - (4,0) (0,4) and (4,4). Modified code to call simplified NEON version at these specific offsets to help with the performance. 
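(An offset of 4 here is the half-pixel position, since the subpel search works
in eighth-pel steps; at that offset the bilinear filter reduces to a plain
average of two neighbouring pixels, which is the case the dedicated
vp8_variance_halfpixvar16x16_h/v/hv_neon kernels handle.)
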
Change-Id: Iaf896a0f7aae4697bd36a49e182525dd1ef1ab4d --- vp8/encoder/arm/arm_csystemdependent.c | 31 +++++++++++++++++++ .../neon/vp8_subpixelvariance16x16_neon.asm | 4 +-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index a1f110260..4f68a9576 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -18,6 +18,37 @@ extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12 extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); +extern unsigned int vp8_sub_pixel_variance16x16_neon_func +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +); +unsigned int vp8_sub_pixel_variance16x16_neon +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + if (xoffset == 4 && yoffset == 0) + return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 0 && yoffset == 4) + return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 4 && yoffset == 4) + return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else + return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); +} + void vp8_arch_arm_encoder_init(VP8_COMP *cpi) { #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm index 1b09cfe4c..1475f76df 100644 --- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm +++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm @@ -9,7 +9,7 @@ ; - EXPORT |vp8_sub_pixel_variance16x16_neon| + EXPORT |vp8_sub_pixel_variance16x16_neon_func| ARM REQUIRE8 PRESERVE8 @@ -24,7 +24,7 @@ ; stack(r6) unsigned int *sse ;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon. -|vp8_sub_pixel_variance16x16_neon| PROC +|vp8_sub_pixel_variance16x16_neon_func| PROC push {r4-r6, lr} ldr r12, _BilinearTaps_coeff_ From 336aa0b7da8a35ba57400ce92fc016fc7fb35233 Mon Sep 17 00:00:00 2001 From: Paul Wilkins Date: Tue, 25 Jan 2011 12:29:06 +0000 Subject: [PATCH 3/5] Incorrect bit allocation in forced KF groups. The old 2 pass code estimated error distribution when coding a forced (by interval) key frame. The result of this was that in some cases, when allocating bits at the GF group level within a KF group there was either a glut of bits or starvation of bits at the end of the KF group. Added code to rescan and get the correct data once the position of a forced key frame has been determined. 
Change-Id: I0c811675ef3f9e4109d14bd049d7641682ffcf11 --- vp8/encoder/firstpass.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index a77ced78c..3e67bf53c 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -2423,12 +2423,35 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) if (cpi->oxcf.auto_key && cpi->frames_to_key > (int)cpi->key_frame_frequency ) { + int current_pos = cpi->stats_in; + FIRSTPASS_STATS tmp_frame; + cpi->frames_to_key /= 2; - // Estimate corrected kf group error - kf_group_err /= 2.0; - kf_group_intra_err /= 2.0; - kf_group_coded_err /= 2.0; + // Copy first frame details + vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame)); + + // Reset to the start of the group + reset_fpf_position(cpi, start_position); + + kf_group_err = 0; + kf_group_intra_err = 0; + kf_group_coded_err = 0; + + // Rescan to get the correct error data for the forced kf group + for( i = 0; i < cpi->frames_to_key; i++ ) + { + // Accumulate kf group errors + kf_group_err += calculate_modified_err(cpi, &tmp_frame); + kf_group_intra_err += tmp_frame.intra_error; + kf_group_coded_err += tmp_frame.coded_error; + + // Load a the next frame's stats + vp8_input_stats(cpi, &tmp_frame); + } + + // Reset to the start of the group + reset_fpf_position(cpi, current_pos); cpi->next_key_frame_forced = TRUE; } From 3bf235a4c92efdd622c59a622ab03dd081012089 Mon Sep 17 00:00:00 2001 From: Attila Nagy Date: Mon, 17 Jan 2011 13:00:08 +0200 Subject: [PATCH 4/5] Fix issue 262, vp8cx_pack_tokens_into_partitions_armv5 http://code.google.com/p/webm/issues/detail?id=262 Function was asuming that partitions have equal amount of mb_rows, which is not always true. 
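(Macroblock rows are handed out to the token partitions round-robin, so when
the number of mb rows is not a multiple of the partition count the partitions
end up with different row counts; the per-partition row counter therefore has
to start from the partition index rather than from a shared mb_rows value.)
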
Change-Id: I59ed40117fd408392a85c633beeb5340ed2f4b25 --- vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm index 57cd318ee..42dae13de 100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm +++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm @@ -65,6 +65,8 @@ numparts_loop ldr r10, [sp, #40] ; ptr ldr r5, [sp, #36] ; move mb_rows to the counting section + sub r5, r5, r11 ; move start point with each partition + ; mb_rows starts at i str r5, [sp, #12] ; Reset all of the VP8 Writer data for each partition that From 2168a94495f1cef07a3e0cd22d42afe01deb3286 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 25 Jan 2011 15:11:39 -0500 Subject: [PATCH 5/5] move new neon subpixel function previously wasn't guarded with ifdef ARMV7, causing a link error with ARMV6 Change-Id: I0526858be0b5f49b2bf11e9090180b2a6c48926d --- vp8/encoder/arm/arm_csystemdependent.c | 31 ---------------------- vp8/encoder/arm/variance_arm.c | 36 ++++++++++++++++++++++++++ vp8/encoder/arm/variance_arm.h | 1 + vp8/vp8cx_arm.mk | 1 + 4 files changed, 38 insertions(+), 31 deletions(-) create mode 100644 vp8/encoder/arm/variance_arm.c diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index 4f68a9576..a1f110260 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -18,37 +18,6 @@ extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12 extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); -extern unsigned int vp8_sub_pixel_variance16x16_neon_func -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -); -unsigned int vp8_sub_pixel_variance16x16_neon -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - if (xoffset == 4 && yoffset == 0) - return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else if (xoffset == 0 && yoffset == 4) - return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else if (xoffset == 4 && yoffset == 4) - return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else - return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); -} - void vp8_arch_arm_encoder_init(VP8_COMP *cpi) { #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c new file mode 100644 index 000000000..4c7248543 --- /dev/null +++ b/vp8/encoder/arm/variance_arm.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" + +#if HAVE_ARMV7 + +unsigned int vp8_sub_pixel_variance16x16_neon +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + if (xoffset == 4 && yoffset == 0) + return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 0 && yoffset == 4) + return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else if (xoffset == 4 && yoffset == 4) + return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + else + return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); +} + +#endif diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h index 0e5f62fcf..3cbacfac3 100644 --- a/vp8/encoder/arm/variance_arm.h +++ b/vp8/encoder/arm/variance_arm.h @@ -30,6 +30,7 @@ extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon); //extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c); //extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c); extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon_func); extern prototype_variance(vp8_variance_halfpixvar16x16_h_neon); extern prototype_variance(vp8_variance_halfpixvar16x16_v_neon); extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon); diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index da27e0897..4113f2395 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -16,6 +16,7 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c +VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/variance_arm.c VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
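
Note on patch 1/5: the rewritten quantize_sse2.asm vectorizes the front half of
the regular quantizer (sign/abs, the zero-bin inputs, round and multiply) and
then walks the zig-zag order one coefficient at a time, just as the C version
does. For reference, the scalar per-coefficient loop it mirrors is roughly the
following sketch (names as in vp8/encoder/quantize.c; this is an illustration,
not the verbatim file):

    for (i = 0; i < 16; i++)
    {
        rc   = vp8_default_zig_zag1d[i];            /* zig-zag scan order   */
        z    = coeff_ptr[rc];
        zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
        zbin_boost_ptr++;

        sz = (z >> 31);                             /* sign of z            */
        x  = (z ^ sz) - sz;                         /* x = abs(z)           */

        if (x >= zbin)
        {
            x += round_ptr[rc];
            y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
            x  = (y ^ sz) - sz;                     /* get the sign back    */

            qcoeff_ptr[rc]  = x;                    /* write to destination */
            dqcoeff_ptr[rc] = x * dequant_ptr[rc];  /* dequantized value    */

            if (y)
            {
                eob = i;                            /* last nonzero coeff   */
                zbin_boost_ptr = b->zrun_zbin_boost; /* reset zero runlength */
            }
        }
    }

The per-coefficient downshift by quant_shift is new to this SSE2 path and is
applied inside the ZIGZAG_LOOP macro, which is why the impl function now takes
quant_shift as an eleventh argument.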