diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index b5f194d3d..cd55a6377 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -46,7 +46,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6; rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6; @@ -80,7 +79,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon; rtcd->idct.idct16 = vp8_short_idct4x4llm_neon; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon; rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon; diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm index 463bff0f5..31ef09cad 100644 --- a/vp8/common/arm/armv6/iwalsh_v6.asm +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -9,7 +9,6 @@ ; EXPORT |vp8_short_inv_walsh4x4_v6| - EXPORT |vp8_short_inv_walsh4x4_1_v6| ARM REQUIRE8 @@ -17,19 +16,19 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_v6(short *input, short *output) +;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff) |vp8_short_inv_walsh4x4_v6| PROC - stmdb sp!, {r4 - r11, lr} + stmdb sp!, {r4 - r12, lr} - ldr r2, [r0], #4 ; [1 | 0] - ldr r3, [r0], #4 ; [3 | 2] - ldr r4, [r0], #4 ; [5 | 4] - ldr r5, [r0], #4 ; [7 | 6] - ldr r6, [r0], #4 ; [9 | 8] - ldr r7, [r0], #4 ; [11 | 10] - ldr r8, [r0], #4 ; [13 | 12] - ldr r9, [r0] ; [15 | 14] + ldr r2, [r0, #0] ; [1 | 0] + ldr r3, [r0, #4] ; [3 | 2] + ldr r4, [r0, #8] ; [5 | 4] + ldr r5, [r0, #12] ; [7 | 6] + ldr r6, [r0, #16] ; [9 | 8] + ldr r7, [r0, #20] ; [11 | 10] + ldr r8, [r0, #24] ; [13 | 12] + ldr r9, [r0, #28] ; [15 | 14] qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] @@ -69,24 +68,27 @@ qadd16 r4, r4, r10 ; [b2+3|c2+3] qadd16 r5, r5, r10 ; [a2+3|d2+3] - asr r12, r2, #3 ; [1 | x] - pkhtb r12, r12, r3, asr #19; [1 | 0] - lsl lr, r3, #16 ; [~3 | x] - lsl r2, r2, #16 ; [~2 | x] - asr lr, lr, #3 ; [3 | x] - pkhtb lr, lr, r2, asr #19 ; [3 | 2] + asr r12, r3, #19 ; [0] + strh r12, [r1], #32 + asr lr, r2, #19 ; [1] + strh lr, [r1], #32 + sxth r2, r2 + sxth r3, r3 + asr r2, r2, #3 ; [2] + strh r2, [r1], #32 + asr r3, r3, #3 ; [3] + strh r3, [r1], #32 - asr r2, r4, #3 ; [5 | x] - pkhtb r2, r2, r5, asr #19 ; [5 | 4] - lsl r3, r5, #16 ; [~7 | x] - lsl r4, r4, #16 ; [~6 | x] - asr r3, r3, #3 ; [7 | x] - pkhtb r3, r3, r4, asr #19 ; [7 | 6] - - str r12, [r1], #4 - str lr, [r1], #4 - str r2, [r1], #4 - str r3, [r1], #4 + asr r12, r5, #19 ; [4] + strh r12, [r1], #32 + asr lr, r4, #19 ; [5] + strh lr, [r1], #32 + sxth r4, r4 + sxth r5, r5 + asr r4, r4, #3 ; [6] + strh r4, [r1], #32 + asr r5, r5, #3 ; [7] + strh r5, [r1], #32 qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] @@ -103,50 +105,32 @@ qadd16 r8, r8, r10 ; [b2+3|c2+3] qadd16 r9, r9, r10 ; [a2+3|d2+3] - asr r2, r6, #3 ; [9 | x] - pkhtb r2, r2, r7, asr #19 ; [9 | 8] - lsl r3, r7, #16 ; [~11| x] - lsl r4, r6, #16 ; [~10| x] - asr r3, r3, #3 ; [11 | x] - pkhtb r3, r3, r4, asr #19 ; [11 | 10] + asr r12, r7, #19 ; [8] + strh r12, [r1], #32 + asr lr, r6, #19 ; [9] + strh lr, [r1], #32 + sxth r6, r6 + sxth r7, r7 + asr r6, r6, #3 ; [10] + strh r6, [r1], #32 + asr r7, r7, #3 ; [11] + strh r7, [r1], #32 - asr r4, r8, #3 ; [13 | x] - pkhtb r4, r4, r9, asr #19 ; [13 | 12] - lsl r5, r9, #16 ; [~15| x] - lsl r6, r8, #16 ; [~14| x] - asr r5, r5, #3 ; [15 | x] - pkhtb r5, r5, r6, asr #19 ; [15 | 14] + asr r12, r9, #19 ; [12] + strh r12, [r1], #32 + asr lr, r8, #19 ; [13] + strh lr, [r1], #32 + sxth r8, r8 + sxth r9, r9 + asr r8, r8, #3 ; [14] + strh r8, [r1], #32 + asr r9, r9, #3 ; [15] + strh r9, [r1], #32 - str r2, [r1], #4 - str r3, [r1], #4 - str r4, [r1], #4 - str r5, [r1] - - ldmia sp!, {r4 - r11, pc} + ldmia sp!, {r4 - r12, pc} ENDP ; |vp8_short_inv_walsh4x4_v6| -;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) -|vp8_short_inv_walsh4x4_1_v6| PROC - - ldrsh r2, [r0] ; [0] - add r2, r2, #3 ; [0] + 3 - asr r2, r2, #3 ; a1 ([0]+3) >> 3 - lsl r2, r2, #16 ; [a1 | x] - orr r2, r2, r2, lsr #16 ; [a1 | a1] - - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_v6| - ; Constant Pool c0x00030003 DCD 0x00030003 END diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h index c710c2eb0..68c0cad11 100644 --- a/vp8/common/arm/idct_arm.h +++ b/vp8/common/arm/idct_arm.h @@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6); #undef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6 -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6 - #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6 #endif @@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon); #undef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon - #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon #endif diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm index 01c79d937..e8ea2a619 100644 --- a/vp8/common/arm/neon/iwalsh_neon.asm +++ b/vp8/common/arm/neon/iwalsh_neon.asm @@ -8,7 +8,6 @@ ; be found in the AUTHORS file in the root of the source tree. ; EXPORT |vp8_short_inv_walsh4x4_neon| - EXPORT |vp8_short_inv_walsh4x4_1_neon| ARM REQUIRE8 @@ -16,7 +15,7 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_neon(short *input, short *output) +;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff) |vp8_short_inv_walsh4x4_neon| PROC ; read in all four lines of values: d0->d3 @@ -59,22 +58,30 @@ vshr.s16 q0, q0, #3 ;e/f >> 3 vshr.s16 q1, q1, #3 ;g/h >> 3 - vst4.i16 {d0,d1,d2,d3}, [r1@128] + mov r2, #64 + add r3, r1, #32 + + vst1.i16 d0[0], [r1],r2 + vst1.i16 d1[0], [r3],r2 + vst1.i16 d2[0], [r1],r2 + vst1.i16 d3[0], [r3],r2 + + vst1.i16 d0[1], [r1],r2 + vst1.i16 d1[1], [r3],r2 + vst1.i16 d2[1], [r1],r2 + vst1.i16 d3[1], [r3],r2 + + vst1.i16 d0[2], [r1],r2 + vst1.i16 d1[2], [r3],r2 + vst1.i16 d2[2], [r1],r2 + vst1.i16 d3[2], [r3],r2 + + vst1.i16 d0[3], [r1],r2 + vst1.i16 d1[3], [r3],r2 + vst1.i16 d2[3], [r1] + vst1.i16 d3[3], [r3] bx lr ENDP ; |vp8_short_inv_walsh4x4_neon| - -;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) -|vp8_short_inv_walsh4x4_1_neon| PROC - ldrsh r2, [r0] ; load input[0] - add r3, r2, #3 ; add 3 - add r2, r1, #16 ; base for last 8 output - asr r0, r3, #3 ; right shift 3 - vdup.16 q0, r0 ; load and duplicate - vst1.16 {q0}, [r1@128] ; write back 8 - vst1.16 {q0}, [r2@128] ; write back last 8 - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_neon| - END diff --git a/vp8/common/idct.h b/vp8/common/idct.h index 411a1b472..7371f85ff 100644 --- a/vp8/common/idct.h +++ b/vp8/common/idct.h @@ -37,6 +37,10 @@ #define vp8_idct_idct16 vp8_short_idct4x4llm_c #endif extern prototype_idct(vp8_idct_idct16); +/* add this prototype to prevent compiler warning about implicit + * declaration of vp8_short_idct4x4llm_c function in dequantize.c + * when building, for example, neon optimized version */ +extern prototype_idct(vp8_short_idct4x4llm_c); #ifndef vp8_idct_idct1_scalar_add #define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c index 49496abef..47af52f04 100644 --- a/vp8/common/idctllm.c +++ b/vp8/common/idctllm.c @@ -137,8 +137,9 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, } -void vp8_short_inv_walsh4x4_c(short *input, short *output) +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff) { + short output[16]; int i; int a1, b1, c1, d1; int a2, b2, c2, d2; @@ -183,22 +184,21 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output) ip += 4; op += 4; } + + for(i = 0; i < 16; i++) + { + mb_dqcoeff[i * 16] = output[i]; + } } -void vp8_short_inv_walsh4x4_1_c(short *input, short *output) +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff) { int i; int a1; - short *op = output; a1 = ((input[0] + 3) >> 3); - - for (i = 0; i < 4; i++) + for(i = 0; i < 16; i++) { - op[0] = a1; - op[1] = a1; - op[2] = a1; - op[3] = a1; - op += 4; + mb_dqcoeff[i * 16] = a1; } } diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c index 478cb329f..95e6980fe 100644 --- a/vp8/common/invtrans.c +++ b/vp8/common/invtrans.c @@ -28,18 +28,6 @@ void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, } -static void recon_dcblock(MACROBLOCKD *x) -{ - BLOCKD *b = &x->block[24]; - int i; - - for (i = 0; i < 16; i++) - { - x->block[i].dqcoeff[0] = b->diff[i]; - } - -} - void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) { int i; @@ -47,9 +35,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD * if(x->mode_info_context->mbmi.mode != SPLITMV) { /* do 2nd order transform on the dc block */ - IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff); - - recon_dcblock(x); + IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff); } for (i = 0; i < 16; i++) diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h index f9e3a794d..06e3ea4b5 100644 --- a/vp8/common/x86/idct_x86.h +++ b/vp8/common/x86/idct_x86.h @@ -24,7 +24,6 @@ extern prototype_idct(vp8_short_idct4x4llm_mmx); extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx); extern prototype_second_order(vp8_short_inv_walsh4x4_mmx); -extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_idct_idct16 @@ -36,9 +35,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx); #undef vp8_idct_iwalsh16 #define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx - #endif #endif diff --git a/vp8/common/x86/iwalsh_mmx.asm b/vp8/common/x86/iwalsh_mmx.asm index 10b5274dc..3ab066ba4 100644 --- a/vp8/common/x86/iwalsh_mmx.asm +++ b/vp8/common/x86/iwalsh_mmx.asm @@ -11,42 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output) -global sym(vp8_short_inv_walsh4x4_1_mmx) -sym(vp8_short_inv_walsh4x4_1_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rax, 3 - - mov rdi, arg(1) - add rax, [rsi] ;input[0] + 3 - - movd mm0, eax - - punpcklwd mm0, mm0 ;x x val val - - punpckldq mm0, mm0 ;val val val val - - psraw mm0, 3 ;(input[0] + 3) >> 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm0 - movq [rdi + 16], mm0 - movq [rdi + 24], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) global sym(vp8_short_inv_walsh4x4_mmx) sym(vp8_short_inv_walsh4x4_mmx): @@ -159,10 +123,50 @@ sym(vp8_short_inv_walsh4x4_mmx): psraw mm2, 3 psraw mm3, 3 - movq [rdi + 0], mm0 - movq [rdi + 8], mm1 - movq [rdi + 16], mm2 - movq [rdi + 24], mm3 +; movq [rdi + 0], mm0 +; movq [rdi + 8], mm1 +; movq [rdi + 16], mm2 +; movq [rdi + 24], mm3 + + movd eax, mm0 + psrlq mm0, 32 + mov word ptr[rdi+32*0], ax + shr eax, 16 + mov word ptr[rdi+32*1], ax + movd eax, mm0 + mov word ptr[rdi+32*2], ax + shr eax, 16 + mov word ptr[rdi+32*3], ax + + movd ecx, mm1 + psrlq mm1, 32 + mov word ptr[rdi+32*4], cx + shr ecx, 16 + mov word ptr[rdi+32*5], cx + movd ecx, mm1 + mov word ptr[rdi+32*6], cx + shr ecx, 16 + mov word ptr[rdi+32*7], cx + + movd eax, mm2 + psrlq mm2, 32 + mov word ptr[rdi+32*8], ax + shr eax, 16 + mov word ptr[rdi+32*9], ax + movd eax, mm2 + mov word ptr[rdi+32*10], ax + shr eax, 16 + mov word ptr[rdi+32*11], ax + + movd ecx, mm3 + psrlq mm3, 32 + mov word ptr[rdi+32*12], cx + shr ecx, 16 + mov word ptr[rdi+32*13], cx + movd ecx, mm3 + mov word ptr[rdi+32*14], cx + shr ecx, 16 + mov word ptr[rdi+32*15], cx ; begin epilog pop rdi diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm index 1da4fd8da..5a7133d6c 100644 --- a/vp8/common/x86/iwalsh_sse2.asm +++ b/vp8/common/x86/iwalsh_sse2.asm @@ -96,8 +96,50 @@ sym(vp8_short_inv_walsh4x4_sse2): psraw xmm5, 3 psraw xmm1, 3 - movdqa [rdi + 0], xmm5 - movdqa [rdi + 16], xmm1 +;; movdqa [rdi + 0], xmm5 +;; movdqa [rdi + 16], xmm1 + + movd eax, xmm5 + psrldq xmm5, 4 + mov word ptr[rdi+32*0], ax + shr eax, 16 + mov word ptr[rdi+32*1], ax + movd eax, xmm5 + psrldq xmm5, 4 + mov word ptr[rdi+32*2], ax + shr eax, 16 + mov word ptr[rdi+32*3], ax + + movd eax, xmm5 + psrldq xmm5, 4 + mov word ptr[rdi+32*4], ax + shr eax, 16 + mov word ptr[rdi+32*5], ax + movd eax, xmm5 + mov word ptr[rdi+32*6], ax + shr eax, 16 + mov word ptr[rdi+32*7], ax + + movd eax, xmm1 + psrldq xmm1, 4 + mov word ptr[rdi+32*8], ax + shr eax, 16 + mov word ptr[rdi+32*9], ax + movd eax, xmm1 + psrldq xmm1, 4 + mov word ptr[rdi+32*10], ax + shr eax, 16 + mov word ptr[rdi+32*11], ax + + movd eax, xmm1 + psrldq xmm1, 4 + mov word ptr[rdi+32*12], ax + shr eax, 16 + mov word ptr[rdi+32*13], ax + movd eax, xmm1 + mov word ptr[rdi+32*14], ax + shr eax, 16 + mov word ptr[rdi+32*15], ax ; begin epilog pop rdi diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index c4e616a67..b24cbe48f 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -40,9 +40,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx; rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx; - rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx; - - rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx; rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx; diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c index 1b0091cdb..f802c5181 100644 --- a/vp8/decoder/arm/arm_dsystemdependent.c +++ b/vp8/decoder/arm/arm_dsystemdependent.c @@ -32,8 +32,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi) { pbi->dequant.block = vp8_dequantize_b_v6; pbi->dequant.idct_add = vp8_dequant_idct_add_v6; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_v6; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6; pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6; pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6; } @@ -44,9 +42,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi) { pbi->dequant.block = vp8_dequantize_b_neon; pbi->dequant.idct_add = vp8_dequant_idct_add_neon; - /*This is not used: NEON always dequants two blocks at once. - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_neon;*/ - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon; pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon; pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; } diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm deleted file mode 100644 index 19f94e089..000000000 --- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm +++ /dev/null @@ -1,213 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dequant_dc_idct_add_v6| - - AREA |.text|, CODE, READONLY - -;void vp8_dequant_dc_idct_v6(short *input, short *dq, -; unsigned char *dest, int stride, int Dc) -; r0 = input -; r1 = dq -; r2 = dst -; r3 = stride -; sp + 36 = Dc - - -|vp8_dequant_dc_idct_add_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r6, [sp, #36] - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r3, [sp] - - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - mov r12, #3 - -vp8_dequant_dc_add_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne vp8_dequant_dc_add_loop - - sub r0, r0, #32 - mov r1, r0 - -; short_idct4x4llm_v6_dual - ldr r3, cospi8sqrt2minus1 - ldr r4, sinpi8sqrt2 - ldr r6, [r0, #8] - mov r5, #2 -vp8_dequant_dc_idct_loop1_v6 - ldr r12, [r0, #24] - ldr r14, [r0, #16] - smulwt r9, r3, r6 - smulwb r7, r3, r6 - smulwt r10, r4, r6 - smulwb r8, r4, r6 - pkhbt r7, r7, r9, lsl #16 - smulwt r11, r3, r12 - pkhbt r8, r8, r10, lsl #16 - uadd16 r6, r6, r7 - smulwt r7, r4, r12 - smulwb r9, r3, r12 - smulwb r10, r4, r12 - subs r5, r5, #1 - pkhbt r9, r9, r11, lsl #16 - ldr r11, [r0], #4 - pkhbt r10, r10, r7, lsl #16 - uadd16 r7, r12, r9 - usub16 r7, r8, r7 - uadd16 r6, r6, r10 - uadd16 r10, r11, r14 - usub16 r8, r11, r14 - uadd16 r9, r10, r6 - usub16 r10, r10, r6 - uadd16 r6, r8, r7 - usub16 r7, r8, r7 - str r6, [r1, #8] - ldrne r6, [r0, #8] - str r7, [r1, #16] - str r10, [r1, #24] - str r9, [r1], #4 - bne vp8_dequant_dc_idct_loop1_v6 - - mov r5, #2 - sub r0, r1, #8 -vp8_dequant_dc_idct_loop2_v6 - ldr r6, [r0], #4 - ldr r7, [r0], #4 - ldr r8, [r0], #4 - ldr r9, [r0], #4 - smulwt r1, r3, r6 - smulwt r12, r4, r6 - smulwt lr, r3, r8 - smulwt r10, r4, r8 - pkhbt r11, r8, r6, lsl #16 - pkhbt r1, lr, r1, lsl #16 - pkhbt r12, r10, r12, lsl #16 - pkhtb r6, r6, r8, asr #16 - uadd16 r6, r1, r6 - pkhbt lr, r9, r7, lsl #16 - uadd16 r10, r11, lr - usub16 lr, r11, lr - pkhtb r8, r7, r9, asr #16 - subs r5, r5, #1 - smulwt r1, r3, r8 - smulwb r7, r3, r8 - smulwt r11, r4, r8 - smulwb r9, r4, r8 - pkhbt r1, r7, r1, lsl #16 - uadd16 r8, r1, r8 - pkhbt r11, r9, r11, lsl #16 - usub16 r1, r12, r8 - uadd16 r8, r11, r6 - ldr r9, c0x00040004 - ldr r12, [sp] ; get stride from stack - uadd16 r6, r10, r8 - usub16 r7, r10, r8 - uadd16 r7, r7, r9 - uadd16 r6, r6, r9 - uadd16 r10, r14, r1 - usub16 r1, r14, r1 - uadd16 r10, r10, r9 - uadd16 r1, r1, r9 - ldr r11, [r2] ; load input from dst - mov r8, r7, asr #3 - pkhtb r9, r8, r10, asr #19 - mov r8, r1, asr #3 - pkhtb r8, r8, r6, asr #19 - uxtb16 lr, r11, ror #8 - qadd16 r9, r9, lr - uxtb16 lr, r11 - qadd16 r8, r8, lr - usat16 r9, #8, r9 - usat16 r8, #8, r8 - orr r9, r8, r9, lsl #8 - ldr r11, [r2, r12] ; load input from dst - mov r7, r7, lsl #16 - mov r1, r1, lsl #16 - mov r10, r10, lsl #16 - mov r6, r6, lsl #16 - mov r7, r7, asr #3 - pkhtb r7, r7, r10, asr #19 - mov r1, r1, asr #3 - pkhtb r1, r1, r6, asr #19 - uxtb16 r8, r11, ror #8 - qadd16 r7, r7, r8 - uxtb16 r8, r11 - qadd16 r1, r1, r8 - usat16 r7, #8, r7 - usat16 r1, #8, r1 - orr r1, r1, r7, lsl #8 - str r9, [r2], r12 ; store output to dst - str r1, [r2], r12 ; store output to dst - bne vp8_dequant_dc_idct_loop2_v6 - -; vpx_memset - sub r0, r0, #32 - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_dequant_dc_idct_add_v6| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x00004E7B -sinpi8sqrt2 DCD 0x00008A8C -c0x00040004 DCD 0x00040004 - - END diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c index 686bb737f..c1ef2852f 100644 --- a/vp8/decoder/arm/armv6/idct_blk_v6.c +++ b/vp8/decoder/arm/armv6/idct_blk_v6.c @@ -13,47 +13,6 @@ #include "vp8/decoder/dequantize.h" -void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq, - unsigned char *dst, int stride, - char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (eobs[0] > 1) - vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]); - else if (eobs[0] == 1) - vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride); - - if (eobs[1] > 1) - { - vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]); - } - else if (eobs[1] == 1) - vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride); - - if (eobs[2] > 1) - { - vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]); - } - else if (eobs[2] == 1) - vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride); - - if (eobs[3] > 1) - { - vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]); - } - else if (eobs[3] == 1) - vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride); - - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } -} - void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h index c020c8530..1123e8446 100644 --- a/vp8/decoder/arm/dequantize_arm.h +++ b/vp8/decoder/arm/dequantize_arm.h @@ -15,8 +15,6 @@ #if HAVE_ARMV6 extern prototype_dequant_block(vp8_dequantize_b_v6); extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); @@ -27,12 +25,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); #undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_v6 -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6 - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6 - #undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6 @@ -44,8 +36,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); #if HAVE_ARMV7 extern prototype_dequant_block(vp8_dequantize_b_neon); extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); @@ -57,12 +47,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); #undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_neon -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon - #undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c index 086293114..185895f05 100644 --- a/vp8/decoder/arm/neon/idct_blk_neon.c +++ b/vp8/decoder/arm/neon/idct_blk_neon.c @@ -15,46 +15,11 @@ /* place these declarations here because we don't want to maintain them * outside of this scope */ -void idct_dequant_dc_full_2x_neon(short *input, short *dq, - unsigned char *dst, - int stride, short *dc); -void idct_dequant_dc_0_2x_neon(short *input, short *dq, - unsigned char *dst, - int stride, short *dc); void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *dst, int stride); void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *dst, int stride); -void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq, - unsigned char *dst, - int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (((short *)(eobs))[0]) - { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc); - else - idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc); - } - - if (((short *)(eobs))[1]) - { - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2); - else - idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2); - } - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } -} void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm deleted file mode 100644 index bf8d7ddcd..000000000 --- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm +++ /dev/null @@ -1,75 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |idct_dequant_dc_0_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void idct_dequant_dc_0_2x_neon(short *q, short *dq, -; unsigned char *dst, int stride); -; r0 *q, -; r1 *dq, -; r2 *dst -; r3 stride -; sp *dc -|idct_dequant_dc_0_2x_neon| PROC - - ; no q- or dq-coeffs, so r0 and r1 are free to use - ldr r1, [sp] ; *dc - add r12, r2, #4 - ldr r0, [r1] - - vld1.32 {d2[0]}, [r2], r3 ; lo - vld1.32 {d8[0]}, [r12], r3 ; hi - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d8[1]}, [r12], r3 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d10[0]}, [r12], r3 - vld1.32 {d4[1]}, [r2], r3 - vld1.32 {d10[1]}, [r12] - - sxth r1, r0 ; lo *dc - add r1, r1, #4 - asr r1, r1, #3 - vdup.16 q0, r1 - sxth r0, r0, ror #16 ; hi *dc - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q3, r0 - - vaddw.u8 q1, q0, d2 ; lo - vaddw.u8 q2, q0, d4 - vaddw.u8 q4, q3, d8 ; hi - vaddw.u8 q5, q3, d10 - - vqmovun.s16 d2, q1 ; lo - vqmovun.s16 d4, q2 - vqmovun.s16 d8, q4 ; hi - vqmovun.s16 d10, q5 - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r0, r2, #4 - - vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d8[0]}, [r0], r3 ; hi - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d8[1]}, [r0], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d10[0]}, [r0], r3 - vst1.32 {d4[1]}, [r2] - vst1.32 {d10[1]}, [r0] - - bx lr - - ENDP ;|idct_dequant_dc_0_2x_neon| - END diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm deleted file mode 100644 index eea41f68c..000000000 --- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm +++ /dev/null @@ -1,208 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |idct_dequant_dc_full_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_dc_full_2x_neon(short *q, short *dq, -; unsigned char *dst, int stride, short *dc); -; r0 *q, -; r1 *dq, -; r2 *dst -; r3 stride -; sp *dc -|idct_dequant_dc_full_2x_neon| PROC - push {r4} - - vld1.16 {q0, q1}, [r1] ; dq (same l/r) - vld1.16 {q2, q3}, [r0] ; l q - add r0, r0, #32 - vld1.16 {q4, q5}, [r0] ; r q - add r12, r2, #4 - - ; interleave the predictors - vld1.32 {d28[0]}, [r2], r3 ; l pre - vld1.32 {d28[1]}, [r12], r3 ; r pre - vld1.32 {d29[0]}, [r2], r3 - vld1.32 {d29[1]}, [r12], r3 - vld1.32 {d30[0]}, [r2], r3 - vld1.32 {d30[1]}, [r12], r3 - vld1.32 {d31[0]}, [r2], r3 - ldr r1, [sp, #4] ; *dc - vld1.32 {d31[1]}, [r12] - - adr r4, cospi8sqrt2minus1 ; pointer to the first constant - - ldrh r12, [r1], #2 ; lo *dc - ldrh r1, [r1] ; hi *dc - - ; dequant: q[i] = q[i] * dq[i] - vmul.i16 q2, q2, q0 - vmul.i16 q3, q3, q1 - vmul.i16 q4, q4, q0 - vmul.i16 q5, q5, q1 - - ; move dc up to neon and overwrite first element - vmov.16 d4[0], r12 - vmov.16 d8[0], r1 - - vld1.16 {d0}, [r4] - - ; q2: l0r0 q3: l8r8 - ; q4: l4r4 q5: l12r12 - vswp d5, d8 - vswp d7, d10 - - ; _CONSTANTS_ * 4,12 >> 16 - ; q6: 4 * sinpi : c1/temp1 - ; q7: 12 * sinpi : d1/temp2 - ; q8: 4 * cospi - ; q9: 12 * cospi - vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q7, q5, d0[2] - vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q9, q5, d0[0] - - vqadd.s16 q10, q2, q3 ; a1 = 0 + 8 - vqsub.s16 q11, q2, q3 ; b1 = 0 - 8 - - ; vqdmulh only accepts signed values. this was a problem because - ; our constant had the high bit set, and was treated as a negative value. - ; vqdmulh also doubles the value before it shifts by 16. we need to - ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0, - ; so we can shift the constant without losing precision. this avoids - ; shift again afterward, but also avoids the sign issue. win win! - ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we - ; pre-shift it - vshr.s16 q8, q8, #1 - vshr.s16 q9, q9, #1 - - ; q4: 4 + 4 * cospi : d1/temp1 - ; q5: 12 + 12 * cospi : c1/temp2 - vqadd.s16 q4, q4, q8 - vqadd.s16 q5, q5, q9 - - ; c1 = temp1 - temp2 - ; d1 = temp1 + temp2 - vqsub.s16 q2, q6, q5 - vqadd.s16 q3, q4, q7 - - ; [0]: a1+d1 - ; [1]: b1+c1 - ; [2]: b1-c1 - ; [3]: a1-d1 - vqadd.s16 q4, q10, q3 - vqadd.s16 q5, q11, q2 - vqsub.s16 q6, q11, q2 - vqsub.s16 q7, q10, q3 - - ; rotate - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - ; idct loop 2 - ; q4: l 0, 4, 8,12 r 0, 4, 8,12 - ; q5: l 1, 5, 9,13 r 1, 5, 9,13 - ; q6: l 2, 6,10,14 r 2, 6,10,14 - ; q7: l 3, 7,11,15 r 3, 7,11,15 - - ; q8: 1 * sinpi : c1/temp1 - ; q9: 3 * sinpi : d1/temp2 - ; q10: 1 * cospi - ; q11: 3 * cospi - vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q9, q7, d0[2] - vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q11, q7, d0[0] - - vqadd.s16 q2, q4, q6 ; a1 = 0 + 2 - vqsub.s16 q3, q4, q6 ; b1 = 0 - 2 - - ; see note on shifting above - vshr.s16 q10, q10, #1 - vshr.s16 q11, q11, #1 - - ; q10: 1 + 1 * cospi : d1/temp1 - ; q11: 3 + 3 * cospi : c1/temp2 - vqadd.s16 q10, q5, q10 - vqadd.s16 q11, q7, q11 - - ; q8: c1 = temp1 - temp2 - ; q9: d1 = temp1 + temp2 - vqsub.s16 q8, q8, q11 - vqadd.s16 q9, q10, q9 - - ; a1+d1 - ; b1+c1 - ; b1-c1 - ; a1-d1 - vqadd.s16 q4, q2, q9 - vqadd.s16 q5, q3, q8 - vqsub.s16 q6, q3, q8 - vqsub.s16 q7, q2, q9 - - ; +4 >> 3 (rounding) - vrshr.s16 q4, q4, #3 ; lo - vrshr.s16 q5, q5, #3 - vrshr.s16 q6, q6, #3 ; hi - vrshr.s16 q7, q7, #3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; adding pre - ; input is still packed. pre was read interleaved - vaddw.u8 q4, q4, d28 - vaddw.u8 q5, q5, d29 - vaddw.u8 q6, q6, d30 - vaddw.u8 q7, q7, d31 - - vmov.i16 q14, #0 - vmov q15, q14 - vst1.16 {q14, q15}, [r0] ; write over high input - sub r0, r0, #32 - vst1.16 {q14, q15}, [r0] ; write over low input - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r1, r2, #4 ; hi - - ;saturate and narrow - vqmovun.s16 d0, q4 ; lo - vqmovun.s16 d1, q5 - vqmovun.s16 d2, q6 ; hi - vqmovun.s16 d3, q7 - - vst1.32 {d0[0]}, [r2], r3 ; lo - vst1.32 {d0[1]}, [r1], r3 ; hi - vst1.32 {d1[0]}, [r2], r3 - vst1.32 {d1[1]}, [r1], r3 - vst1.32 {d2[0]}, [r2], r3 - vst1.32 {d2[1]}, [r1], r3 - vst1.32 {d3[0]}, [r2] - vst1.32 {d3[1]}, [r1] - - pop {r4} - bx lr - - ENDP ; |idct_dequant_dc_full_2x_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b -; because the lowest bit in 0x8a8c is 0, we can pre-shift this -sinpi8sqrt2 DCD 0x4546 - - END diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index 6f2cdfabb..31eafcf54 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -232,45 +232,53 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, } } } - } - else if (mode == SPLITMV) + else { + short *DQC = xd->block[0].dequant; + + /* save the dc dequant constant in case it is overridden */ + short dc_dequant_temp = DQC[0]; + + if (mode != SPLITMV) + { + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + DEQUANT_INVOKE(&pbi->dequant, block)(b); + + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], + xd->qcoeff); + ((int *)b->qcoeff)[0] = 0; + ((int *)b->qcoeff)[1] = 0; + ((int *)b->qcoeff)[2] = 0; + ((int *)b->qcoeff)[3] = 0; + ((int *)b->qcoeff)[4] = 0; + ((int *)b->qcoeff)[5] = 0; + ((int *)b->qcoeff)[6] = 0; + ((int *)b->qcoeff)[7] = 0; + } + else + { + b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], + xd->qcoeff); + ((int *)b->qcoeff)[0] = 0; + } + + /* override the dc dequant constant */ + DQC[0] = 1; + } + DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) (xd->qcoeff, xd->block[0].dequant, xd->dst.y_buffer, xd->dst.y_stride, xd->eobs); - } - else - { - BLOCKD *b = &xd->block[24]; - /* do 2nd order transform on the dc block */ - if (xd->eobs[24] > 1) - { - DEQUANT_INVOKE(&pbi->dequant, block)(b); - - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - } - else - { - b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - } - - DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff); + /* restore the dc dequant constant */ + DQC[0] = dc_dequant_temp; } DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c index 0861965eb..4a48a3192 100644 --- a/vp8/decoder/dequantize.c +++ b/vp8/decoder/dequantize.c @@ -42,22 +42,3 @@ void vp8_dequant_idct_add_c(short *input, short *dq, vpx_memset(input, 0, 32); } - -void vp8_dequant_dc_idct_add_c(short *input, short *dq, - unsigned char *dest, int stride, - int Dc) -{ - int i; - - input[0] = (short)Dc; - - for (i = 1; i < 16; i++) - { - input[i] = dq[i] * input[i]; - } - - vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); - - vpx_memset(input, 0, 32); - -} diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h index 019b7f6d1..f66cf2bac 100644 --- a/vp8/decoder/dequantize.h +++ b/vp8/decoder/dequantize.h @@ -21,17 +21,6 @@ unsigned char *output, \ int stride) -#define prototype_dequant_dc_idct_add(sym) \ - void sym(short *input, short *dq, \ - unsigned char *dst, \ - int stride, \ - int dc) - -#define prototype_dequant_dc_idct_add_y_block(sym) \ - void sym(short *q, short *dq, \ - unsigned char *dst, \ - int stride, char *eobs, short *dc) - #define prototype_dequant_idct_add_y_block(sym) \ void sym(short *q, short *dq, \ unsigned char *dst, \ @@ -60,16 +49,6 @@ extern prototype_dequant_block(vp8_dequant_block); #endif extern prototype_dequant_idct_add(vp8_dequant_idct_add); -#ifndef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c -#endif -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add); - -#ifndef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c -#endif -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block); - #ifndef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c #endif @@ -85,10 +64,6 @@ typedef prototype_dequant_block((*vp8_dequant_block_fn_t)); typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t)); -typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t)); - -typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t)); - typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t)); typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t)); @@ -97,8 +72,6 @@ typedef struct { vp8_dequant_block_fn_t block; vp8_dequant_idct_add_fn_t idct_add; - vp8_dequant_dc_idct_add_fn_t dc_idct_add; - vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block; vp8_dequant_idct_add_y_block_fn_t idct_add_y_block; vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block; } vp8_dequant_rtcd_vtable_t; diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c index 9c42bc62d..d9f9ba3c8 100644 --- a/vp8/decoder/generic/dsystemdependent.c +++ b/vp8/decoder/generic/dsystemdependent.c @@ -23,8 +23,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi) pbi->mb.rtcd = &pbi->common.rtcd; pbi->dequant.block = vp8_dequantize_b_c; pbi->dequant.idct_add = vp8_dequant_idct_add_c; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c; pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c; pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; #endif diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c index 1c16b92a9..249fad4ea 100644 --- a/vp8/decoder/idct_blk.c +++ b/vp8/decoder/idct_blk.c @@ -12,39 +12,12 @@ #include "vp8/common/idct.h" #include "dequantize.h" -void vp8_dequant_dc_idct_add_c(short *input, short *dq, - unsigned char *dest, int stride, - int Dc); void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred, int pred_stride, unsigned char *dst_ptr, int dst_stride); -void vp8_dequant_dc_idct_add_y_block_c - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i, j; - - for (i = 0; i < 4; i++) - { - for (j = 0; j < 4; j++) - { - if (*eobs++ > 1) - vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]); - else - vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride); - - q += 16; - dst += 4; - dc ++; - } - - dst += 4*stride - 16; - } -} - void vp8_dequant_idct_add_y_block_c (short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index eba5830d5..1967781eb 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -175,36 +175,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m #endif /* dequantization and idct */ - if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV) - { - BLOCKD *b = &xd->block[24]; - DEQUANT_INVOKE(&pbi->dequant, block)(b); - - /* do 2nd order transform on the dc block */ - if (xd->eobs[24] > 1) - { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - } - else - { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - } - - DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff); - } - else if (xd->mode_info_context->mbmi.mode == B_PRED) + if (xd->mode_info_context->mbmi.mode == B_PRED) { for (i = 0; i < 16; i++) { @@ -214,26 +185,71 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst, b->dst_stride, mb_row, mb_col, i); - if (xd->eobs[i] > 1) + if (xd->eobs[i] ) { - DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, - *(b->base_dst) + b->dst, b->dst_stride); - } - else - { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], - *(b->base_dst) + b->dst, b->dst_stride, - *(b->base_dst) + b->dst, b->dst_stride); - ((int *)b->qcoeff)[0] = 0; + if (xd->eobs[i] > 1) + { + DEQUANT_INVOKE(&pbi->dequant, idct_add) + (b->qcoeff, b->dequant, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) + (b->qcoeff[0] * b->dequant[0], + *(b->base_dst) + b->dst, b->dst_stride, + *(b->base_dst) + b->dst, b->dst_stride); + ((int *)b->qcoeff)[0] = 0; + } } } } else { + short *DQC = xd->block[0].dequant; + + DECLARE_ALIGNED(16, short, local_dequant[16]); + + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + DEQUANT_INVOKE(&pbi->dequant, block)(b); + + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], + xd->qcoeff); + ((int *)b->qcoeff)[0] = 0; + ((int *)b->qcoeff)[1] = 0; + ((int *)b->qcoeff)[2] = 0; + ((int *)b->qcoeff)[3] = 0; + ((int *)b->qcoeff)[4] = 0; + ((int *)b->qcoeff)[5] = 0; + ((int *)b->qcoeff)[6] = 0; + ((int *)b->qcoeff)[7] = 0; + } + else + { + b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff); + ((int *)b->qcoeff)[0] = 0; + } + + /* make a local copy of the dequant constants */ + vpx_memcpy(local_dequant, xd->block[0].dequant, + sizeof(local_dequant)); + + /* override the dc dequant constant */ + local_dequant[0] = 1; + + /* use the new dequant constants */ + DQC = local_dequant; + } + DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, + (xd->qcoeff, DQC, xd->dst.y_buffer, xd->dst.y_stride, xd->eobs); } @@ -244,7 +260,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m xd->dst.uv_stride, xd->eobs+16); } - static THREAD_FUNCTION thread_decoding_proc(void *p_data) { int ithread = ((DECODETHREAD_DATA *)p_data)->ithread; diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm index 648bde4c5..de9eba89f 100644 --- a/vp8/decoder/x86/dequantize_mmx.asm +++ b/vp8/decoder/x86/dequantize_mmx.asm @@ -246,207 +246,6 @@ sym(vp8_dequant_idct_add_mmx): pop rbp ret - -;void dequant_dc_idct_add_mmx( -;short *input, 0 -;short *dq, 1 -;unsigned char *dest, 2 -;int stride, 3 -;int Dc) 4 -global sym(vp8_dequant_dc_idct_add_mmx) -sym(vp8_dequant_dc_idct_add_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - ; end prolog - - mov rax, arg(0) ;input - mov rdx, arg(1) ;dq - - movq mm0, [rax ] - pmullw mm0, [rdx] - - movq mm1, [rax +8] - pmullw mm1, [rdx +8] - - movq mm2, [rax+16] - pmullw mm2, [rdx+16] - - movq mm3, [rax+24] - pmullw mm3, [rdx+24] - - mov rdx, arg(2) ;pred - pxor mm7, mm7 - - - movq [rax], mm7 - movq [rax+8], mm7 - - movq [rax+16],mm7 - movq [rax+24],mm7 - - ; move lower word of Dc to lower word of mm0 - psrlq mm0, 16 - movzx rcx, word ptr arg(4) ;Dc - psllq mm0, 16 - movq mm7, rcx - por mm0, mm7 - - movsxd rax, dword ptr arg(3) ;stride - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - movq mm3, mm5 ; 33 23 13 03 - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - paddw mm0, [GLOBAL(fours)] - - paddw mm2, [GLOBAL(fours)] - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - psraw mm2, 3 - - psraw mm0, 3 - psraw mm4, 3 - - psraw mm6, 3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - pxor mm7, mm7 - - movd mm4, [rdx] - punpcklbw mm4, mm7 - paddsw mm0, mm4 - packuswb mm0, mm7 - movd [rdx], mm0 - - movd mm4, [rdx+rax] - punpcklbw mm4, mm7 - paddsw mm1, mm4 - packuswb mm1, mm7 - movd [rdx+rax], mm1 - - movd mm4, [rdx+2*rax] - punpcklbw mm4, mm7 - paddsw mm2, mm4 - packuswb mm2, mm7 - movd [rdx+rax*2], mm2 - - add rdx, rax - - movd mm4, [rdx+2*rax] - punpcklbw mm4, mm7 - paddsw mm5, mm4 - packuswb mm5, mm7 - movd [rdx+rax*2], mm5 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA align 16 x_s1sqr2: diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h index dc68daab3..49bcb7f19 100644 --- a/vp8/decoder/x86/dequantize_x86.h +++ b/vp8/decoder/x86/dequantize_x86.h @@ -22,8 +22,6 @@ #if HAVE_MMX extern prototype_dequant_block(vp8_dequantize_b_mmx); extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx); -extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx); -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); @@ -34,12 +32,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); #undef vp8_dequant_idct_add #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx -#undef vp8_dequant_dc_idct_add -#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx - -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx - #undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx @@ -50,14 +42,10 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); #endif #if HAVE_SSE2 -extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2); extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_dequant_dc_idct_add_y_block -#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2 - #undef vp8_dequant_idct_add_y_block #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/decoder/x86/idct_blk_mmx.c index 37de5b9fd..29276e5d7 100644 --- a/vp8/decoder/x86/idct_blk_mmx.c +++ b/vp8/decoder/x86/idct_blk_mmx.c @@ -12,41 +12,6 @@ #include "vp8/common/idct.h" #include "vp8/decoder/dequantize.h" -void vp8_dequant_dc_idct_add_y_block_mmx - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (eobs[0] > 1) - vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]); - else if (eobs[0] == 1) - vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride); - - if (eobs[1] > 1) - vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]); - else if (eobs[1] == 1) - vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride); - - if (eobs[2] > 1) - vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]); - else if (eobs[2] == 1) - vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride); - - if (eobs[3] > 1) - vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]); - else if (eobs[3] == 1) - vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride); - - q += 64; - dc += 4; - dst += 4*stride; - eobs += 4; - } -} - void vp8_dequant_idct_add_y_block_mmx (short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c index 0495b0610..03c2878c1 100644 --- a/vp8/decoder/x86/idct_blk_sse2.c +++ b/vp8/decoder/x86/idct_blk_sse2.c @@ -12,13 +12,6 @@ #include "vp8/common/idct.h" #include "vp8/decoder/dequantize.h" -void vp8_idct_dequant_dc_0_2x_sse2 - (short *q, short *dq, - unsigned char *dst, int dst_stride, short *dc); -void vp8_idct_dequant_dc_full_2x_sse2 - (short *q, short *dq, - unsigned char *dst, int dst_stride, short *dc); - void vp8_idct_dequant_0_2x_sse2 (short *q, short *dq , unsigned char *dst, int dst_stride); @@ -26,36 +19,6 @@ void vp8_idct_dequant_full_2x_sse2 (short *q, short *dq , unsigned char *dst, int dst_stride); -void vp8_dequant_dc_idct_add_y_block_sse2 - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs, short *dc) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (((short *)(eobs))[0]) - { - if (((short *)(eobs))[0] & 0xfefe) - vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc); - else - vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc); - } - - if (((short *)(eobs))[1]) - { - if (((short *)(eobs))[1] & 0xfefe) - vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2); - else - vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2); - } - q += 64; - dc += 4; - dst += stride*4; - eobs += 4; - } -} - void vp8_dequant_idct_add_y_block_sse2 (short *q, short *dq, unsigned char *dst, int stride, char *eobs) diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c index 443150483..424052c1b 100644 --- a/vp8/decoder/x86/x86_dsystemdependent.c +++ b/vp8/decoder/x86/x86_dsystemdependent.c @@ -43,8 +43,6 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi) { pbi->dequant.block = vp8_dequantize_b_mmx; pbi->dequant.idct_add = vp8_dequant_idct_add_mmx; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx; - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx; pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; } @@ -52,8 +50,6 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi) #if HAVE_SSE2 if (flags & HAS_SSE2) { - pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2; - pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; } #endif diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk index 6bde42f4c..b08f9464f 100644 --- a/vp8/vp8dx_arm.mk +++ b/vp8/vp8dx_arm.mk @@ -16,14 +16,11 @@ VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.h #File list for armv6 -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM) VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM) VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM) VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c #File list for neon -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)