Merge "Modified the inverse walsh to output directly"
This commit is contained in:
commit
f46e17fd6f
@ -46,7 +46,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
|
||||
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
|
||||
|
||||
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
|
||||
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
|
||||
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
|
||||
|
||||
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
|
||||
@ -80,7 +79,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
|
||||
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
|
||||
|
||||
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
|
||||
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
|
||||
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
|
||||
|
||||
rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_neon;
|
||||
|
@ -9,7 +9,6 @@
|
||||
;
|
||||
|
||||
EXPORT |vp8_short_inv_walsh4x4_v6|
|
||||
EXPORT |vp8_short_inv_walsh4x4_1_v6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
@ -17,19 +16,19 @@
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
|
||||
;void vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
|
||||
|vp8_short_inv_walsh4x4_v6| PROC
|
||||
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
stmdb sp!, {r4 - r12, lr}
|
||||
|
||||
ldr r2, [r0], #4 ; [1 | 0]
|
||||
ldr r3, [r0], #4 ; [3 | 2]
|
||||
ldr r4, [r0], #4 ; [5 | 4]
|
||||
ldr r5, [r0], #4 ; [7 | 6]
|
||||
ldr r6, [r0], #4 ; [9 | 8]
|
||||
ldr r7, [r0], #4 ; [11 | 10]
|
||||
ldr r8, [r0], #4 ; [13 | 12]
|
||||
ldr r9, [r0] ; [15 | 14]
|
||||
ldr r2, [r0, #0] ; [1 | 0]
|
||||
ldr r3, [r0, #4] ; [3 | 2]
|
||||
ldr r4, [r0, #8] ; [5 | 4]
|
||||
ldr r5, [r0, #12] ; [7 | 6]
|
||||
ldr r6, [r0, #16] ; [9 | 8]
|
||||
ldr r7, [r0, #20] ; [11 | 10]
|
||||
ldr r8, [r0, #24] ; [13 | 12]
|
||||
ldr r9, [r0, #28] ; [15 | 14]
|
||||
|
||||
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
|
||||
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
|
||||
@ -69,24 +68,27 @@
|
||||
qadd16 r4, r4, r10 ; [b2+3|c2+3]
|
||||
qadd16 r5, r5, r10 ; [a2+3|d2+3]
|
||||
|
||||
asr r12, r2, #3 ; [1 | x]
|
||||
pkhtb r12, r12, r3, asr #19; [1 | 0]
|
||||
lsl lr, r3, #16 ; [~3 | x]
|
||||
lsl r2, r2, #16 ; [~2 | x]
|
||||
asr lr, lr, #3 ; [3 | x]
|
||||
pkhtb lr, lr, r2, asr #19 ; [3 | 2]
|
||||
asr r12, r3, #19 ; [0]
|
||||
strh r12, [r1], #32
|
||||
asr lr, r2, #19 ; [1]
|
||||
strh lr, [r1], #32
|
||||
sxth r2, r2
|
||||
sxth r3, r3
|
||||
asr r2, r2, #3 ; [2]
|
||||
strh r2, [r1], #32
|
||||
asr r3, r3, #3 ; [3]
|
||||
strh r3, [r1], #32
|
||||
|
||||
asr r2, r4, #3 ; [5 | x]
|
||||
pkhtb r2, r2, r5, asr #19 ; [5 | 4]
|
||||
lsl r3, r5, #16 ; [~7 | x]
|
||||
lsl r4, r4, #16 ; [~6 | x]
|
||||
asr r3, r3, #3 ; [7 | x]
|
||||
pkhtb r3, r3, r4, asr #19 ; [7 | 6]
|
||||
|
||||
str r12, [r1], #4
|
||||
str lr, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r3, [r1], #4
|
||||
asr r12, r5, #19 ; [4]
|
||||
strh r12, [r1], #32
|
||||
asr lr, r4, #19 ; [5]
|
||||
strh lr, [r1], #32
|
||||
sxth r4, r4
|
||||
sxth r5, r5
|
||||
asr r4, r4, #3 ; [6]
|
||||
strh r4, [r1], #32
|
||||
asr r5, r5, #3 ; [7]
|
||||
strh r5, [r1], #32
|
||||
|
||||
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
|
||||
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
|
||||
@ -103,50 +105,32 @@
|
||||
qadd16 r8, r8, r10 ; [b2+3|c2+3]
|
||||
qadd16 r9, r9, r10 ; [a2+3|d2+3]
|
||||
|
||||
asr r2, r6, #3 ; [9 | x]
|
||||
pkhtb r2, r2, r7, asr #19 ; [9 | 8]
|
||||
lsl r3, r7, #16 ; [~11| x]
|
||||
lsl r4, r6, #16 ; [~10| x]
|
||||
asr r3, r3, #3 ; [11 | x]
|
||||
pkhtb r3, r3, r4, asr #19 ; [11 | 10]
|
||||
asr r12, r7, #19 ; [8]
|
||||
strh r12, [r1], #32
|
||||
asr lr, r6, #19 ; [9]
|
||||
strh lr, [r1], #32
|
||||
sxth r6, r6
|
||||
sxth r7, r7
|
||||
asr r6, r6, #3 ; [10]
|
||||
strh r6, [r1], #32
|
||||
asr r7, r7, #3 ; [11]
|
||||
strh r7, [r1], #32
|
||||
|
||||
asr r4, r8, #3 ; [13 | x]
|
||||
pkhtb r4, r4, r9, asr #19 ; [13 | 12]
|
||||
lsl r5, r9, #16 ; [~15| x]
|
||||
lsl r6, r8, #16 ; [~14| x]
|
||||
asr r5, r5, #3 ; [15 | x]
|
||||
pkhtb r5, r5, r6, asr #19 ; [15 | 14]
|
||||
asr r12, r9, #19 ; [12]
|
||||
strh r12, [r1], #32
|
||||
asr lr, r8, #19 ; [13]
|
||||
strh lr, [r1], #32
|
||||
sxth r8, r8
|
||||
sxth r9, r9
|
||||
asr r8, r8, #3 ; [14]
|
||||
strh r8, [r1], #32
|
||||
asr r9, r9, #3 ; [15]
|
||||
strh r9, [r1], #32
|
||||
|
||||
str r2, [r1], #4
|
||||
str r3, [r1], #4
|
||||
str r4, [r1], #4
|
||||
str r5, [r1]
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ldmia sp!, {r4 - r12, pc}
|
||||
ENDP ; |vp8_short_inv_walsh4x4_v6|
|
||||
|
||||
|
||||
;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
|
||||
; DC-only inverse Walsh: computes a1 = (input[0] + 3) >> 3 and writes it to
; 16 consecutive 16-bit output slots.
;   r0 = input  (only the first halfword is read)
;   r1 = output (32 bytes are written; r1 is advanced by the stores)
; Only r2 is used as scratch, so nothing is pushed on the stack.
|vp8_short_inv_walsh4x4_1_v6| PROC
|
||||
|
||||
ldrsh r2, [r0] ; [0]
|
||||
add r2, r2, #3 ; [0] + 3
|
||||
asr r2, r2, #3 ; a1 ([0]+3) >> 3
|
||||
lsl r2, r2, #16 ; [a1 | x]
|
||||
orr r2, r2, r2, lsr #16 ; [a1 | a1]
|
||||
|
||||
; eight word stores, each holding two copies of a1 = 16 halfwords in total
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1]
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_short_inv_walsh4x4_1_v6|
|
||||
|
||||
; Constant Pool
|
||||
c0x00030003 DCD 0x00030003
|
||||
END
|
||||
|
@ -25,9 +25,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
|
||||
#undef vp8_idct_idct1_scalar_add
|
||||
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
|
||||
|
||||
#undef vp8_idct_iwalsh1
|
||||
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
|
||||
|
||||
#undef vp8_idct_iwalsh16
|
||||
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
|
||||
#endif
|
||||
@ -46,9 +43,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
|
||||
#undef vp8_idct_idct1_scalar_add
|
||||
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
|
||||
|
||||
#undef vp8_idct_iwalsh1
|
||||
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
|
||||
|
||||
#undef vp8_idct_iwalsh16
|
||||
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_neon
|
||||
#endif
|
||||
|
@ -8,7 +8,6 @@
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
EXPORT |vp8_short_inv_walsh4x4_neon|
|
||||
EXPORT |vp8_short_inv_walsh4x4_1_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
@ -16,7 +15,7 @@
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
|
||||
;void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|
||||
|vp8_short_inv_walsh4x4_neon| PROC
|
||||
|
||||
; read in all four lines of values: d0->d3
|
||||
@ -59,22 +58,30 @@
|
||||
vshr.s16 q0, q0, #3 ;e/f >> 3
|
||||
vshr.s16 q1, q1, #3 ;g/h >> 3
|
||||
|
||||
vst4.i16 {d0,d1,d2,d3}, [r1@128]
|
||||
mov r2, #64
|
||||
add r3, r1, #32
|
||||
|
||||
vst1.i16 d0[0], [r1],r2
|
||||
vst1.i16 d1[0], [r3],r2
|
||||
vst1.i16 d2[0], [r1],r2
|
||||
vst1.i16 d3[0], [r3],r2
|
||||
|
||||
vst1.i16 d0[1], [r1],r2
|
||||
vst1.i16 d1[1], [r3],r2
|
||||
vst1.i16 d2[1], [r1],r2
|
||||
vst1.i16 d3[1], [r3],r2
|
||||
|
||||
vst1.i16 d0[2], [r1],r2
|
||||
vst1.i16 d1[2], [r3],r2
|
||||
vst1.i16 d2[2], [r1],r2
|
||||
vst1.i16 d3[2], [r3],r2
|
||||
|
||||
vst1.i16 d0[3], [r1],r2
|
||||
vst1.i16 d1[3], [r3],r2
|
||||
vst1.i16 d2[3], [r1]
|
||||
vst1.i16 d3[3], [r3]
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_short_inv_walsh4x4_neon|
|
||||
|
||||
|
||||
;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
|
||||
; DC-only inverse Walsh: computes (input[0] + 3) >> 3 and writes it to all
; 16 halfwords of the output.
;   r0 = input  (only the first halfword is read)
;   r1 = output (written as two 8-halfword stores; the @128 qualifier on both
;                stores requires the output to be 16-byte aligned)
|vp8_short_inv_walsh4x4_1_neon| PROC
|
||||
ldrsh r2, [r0] ; load input[0]
|
||||
add r3, r2, #3 ; add 3
|
||||
add r2, r1, #16 ; base for last 8 output
|
||||
asr r0, r3, #3 ; right shift 3
|
||||
vdup.16 q0, r0 ; duplicate the result into all 8 lanes of q0
|
||||
vst1.16 {q0}, [r1@128] ; write back 8
|
||||
vst1.16 {q0}, [r2@128] ; write back last 8
|
||||
bx lr
|
||||
ENDP ; |vp8_short_inv_walsh4x4_1_neon|
|
||||
|
||||
END
|
||||
|
@ -37,6 +37,10 @@
|
||||
#define vp8_idct_idct16 vp8_short_idct4x4llm_c
|
||||
#endif
|
||||
extern prototype_idct(vp8_idct_idct16);
|
||||
/* add this prototype to prevent compiler warning about implicit
|
||||
* declaration of vp8_short_idct4x4llm_c function in dequantize.c
|
||||
* when building, for example, neon optimized version */
|
||||
extern prototype_idct(vp8_short_idct4x4llm_c);
|
||||
|
||||
#ifndef vp8_idct_idct1_scalar_add
|
||||
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_c
|
||||
|
@ -137,8 +137,9 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
|
||||
|
||||
}
|
||||
|
||||
void vp8_short_inv_walsh4x4_c(short *input, short *output)
|
||||
void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
|
||||
{
|
||||
short output[16];
|
||||
int i;
|
||||
int a1, b1, c1, d1;
|
||||
int a2, b2, c2, d2;
|
||||
@ -183,22 +184,21 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output)
|
||||
ip += 4;
|
||||
op += 4;
|
||||
}
|
||||
|
||||
for(i = 0; i < 16; i++)
|
||||
{
|
||||
mb_dqcoeff[i * 16] = output[i];
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_short_inv_walsh4x4_1_c(short *input, short *output)
|
||||
void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
|
||||
{
|
||||
int i;
|
||||
int a1;
|
||||
short *op = output;
|
||||
|
||||
a1 = ((input[0] + 3) >> 3);
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
for(i = 0; i < 16; i++)
|
||||
{
|
||||
op[0] = a1;
|
||||
op[1] = a1;
|
||||
op[2] = a1;
|
||||
op[3] = a1;
|
||||
op += 4;
|
||||
mb_dqcoeff[i * 16] = a1;
|
||||
}
|
||||
}
|
||||
|
@ -28,18 +28,6 @@ void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
|
||||
|
||||
}
|
||||
|
||||
/* Copy the 16 values held in block 24's diff[] array into the first
 * coefficient (dqcoeff[0]) of blocks 0..15, one value per block.
 *
 * x: macroblock descriptor; x->block[24].diff is read and
 *    x->block[0..15].dqcoeff[0] is written.
 */
static void recon_dcblock(MACROBLOCKD *x)
|
||||
{
|
||||
BLOCKD *b = &x->block[24];
|
||||
int i;
|
||||
|
||||
/* diff[i] of block 24 becomes the first coefficient of block i */
for (i = 0; i < 16; i++)
|
||||
{
|
||||
x->block[i].dqcoeff[0] = b->diff[i];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
|
||||
{
|
||||
int i;
|
||||
@ -47,9 +35,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
|
||||
if(x->mode_info_context->mbmi.mode != SPLITMV)
|
||||
{
|
||||
/* do 2nd order transform on the dc block */
|
||||
IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
|
||||
|
||||
recon_dcblock(x);
|
||||
IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);
|
||||
}
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
|
@ -24,7 +24,6 @@ extern prototype_idct(vp8_short_idct4x4llm_mmx);
|
||||
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
|
||||
|
||||
extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
|
||||
extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_idct_idct16
|
||||
@ -36,9 +35,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
|
||||
#undef vp8_idct_iwalsh16
|
||||
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_mmx
|
||||
|
||||
#undef vp8_idct_iwalsh1
|
||||
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_mmx
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -11,42 +11,6 @@
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
|
||||
global sym(vp8_short_inv_walsh4x4_1_mmx)
|
||||
; DC-only inverse Walsh: computes (input[0] + 3) >> 3 and stores it to 16
; consecutive 16-bit outputs (four 8-byte stores).
;   arg(0) = input  (short *)
;   arg(1) = output (short *)
sym(vp8_short_inv_walsh4x4_1_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 2
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0)
|
||||
mov rax, 3
|
||||
|
||||
mov rdi, arg(1)
|
||||
; NOTE(review): this load is register-width, i.e. wider than the 16-bit
; input[0]; only the low word is used below because the two unpacks
; replicate word 0 -- confirm that reading past input[0] is acceptable.
add rax, [rsi] ;input[0] + 3
|
||||
|
||||
movd mm0, eax
|
||||
|
||||
punpcklwd mm0, mm0 ;x x val val
|
||||
|
||||
punpckldq mm0, mm0 ;val val val val
|
||||
|
||||
psraw mm0, 3 ;(input[0] + 3) >> 3
|
||||
|
||||
; 4 stores x 4 words = all 16 outputs
movq [rdi + 0], mm0
|
||||
movq [rdi + 8], mm0
|
||||
movq [rdi + 16], mm0
|
||||
movq [rdi + 24], mm0
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
|
||||
global sym(vp8_short_inv_walsh4x4_mmx)
|
||||
sym(vp8_short_inv_walsh4x4_mmx):
|
||||
@ -159,10 +123,50 @@ sym(vp8_short_inv_walsh4x4_mmx):
|
||||
psraw mm2, 3
|
||||
psraw mm3, 3
|
||||
|
||||
movq [rdi + 0], mm0
|
||||
movq [rdi + 8], mm1
|
||||
movq [rdi + 16], mm2
|
||||
movq [rdi + 24], mm3
|
||||
; movq [rdi + 0], mm0
|
||||
; movq [rdi + 8], mm1
|
||||
; movq [rdi + 16], mm2
|
||||
; movq [rdi + 24], mm3
|
||||
|
||||
movd eax, mm0
|
||||
psrlq mm0, 32
|
||||
mov word ptr[rdi+32*0], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*1], ax
|
||||
movd eax, mm0
|
||||
mov word ptr[rdi+32*2], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*3], ax
|
||||
|
||||
movd ecx, mm1
|
||||
psrlq mm1, 32
|
||||
mov word ptr[rdi+32*4], cx
|
||||
shr ecx, 16
|
||||
mov word ptr[rdi+32*5], cx
|
||||
movd ecx, mm1
|
||||
mov word ptr[rdi+32*6], cx
|
||||
shr ecx, 16
|
||||
mov word ptr[rdi+32*7], cx
|
||||
|
||||
movd eax, mm2
|
||||
psrlq mm2, 32
|
||||
mov word ptr[rdi+32*8], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*9], ax
|
||||
movd eax, mm2
|
||||
mov word ptr[rdi+32*10], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*11], ax
|
||||
|
||||
movd ecx, mm3
|
||||
psrlq mm3, 32
|
||||
mov word ptr[rdi+32*12], cx
|
||||
shr ecx, 16
|
||||
mov word ptr[rdi+32*13], cx
|
||||
movd ecx, mm3
|
||||
mov word ptr[rdi+32*14], cx
|
||||
shr ecx, 16
|
||||
mov word ptr[rdi+32*15], cx
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -96,8 +96,50 @@ sym(vp8_short_inv_walsh4x4_sse2):
|
||||
psraw xmm5, 3
|
||||
psraw xmm1, 3
|
||||
|
||||
movdqa [rdi + 0], xmm5
|
||||
movdqa [rdi + 16], xmm1
|
||||
;; movdqa [rdi + 0], xmm5
|
||||
;; movdqa [rdi + 16], xmm1
|
||||
|
||||
movd eax, xmm5
|
||||
psrldq xmm5, 4
|
||||
mov word ptr[rdi+32*0], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*1], ax
|
||||
movd eax, xmm5
|
||||
psrldq xmm5, 4
|
||||
mov word ptr[rdi+32*2], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*3], ax
|
||||
|
||||
movd eax, xmm5
|
||||
psrldq xmm5, 4
|
||||
mov word ptr[rdi+32*4], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*5], ax
|
||||
movd eax, xmm5
|
||||
mov word ptr[rdi+32*6], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*7], ax
|
||||
|
||||
movd eax, xmm1
|
||||
psrldq xmm1, 4
|
||||
mov word ptr[rdi+32*8], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*9], ax
|
||||
movd eax, xmm1
|
||||
psrldq xmm1, 4
|
||||
mov word ptr[rdi+32*10], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*11], ax
|
||||
|
||||
movd eax, xmm1
|
||||
psrldq xmm1, 4
|
||||
mov word ptr[rdi+32*12], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*13], ax
|
||||
movd eax, xmm1
|
||||
mov word ptr[rdi+32*14], ax
|
||||
shr eax, 16
|
||||
mov word ptr[rdi+32*15], ax
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -40,9 +40,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
|
||||
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
|
||||
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
|
||||
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
|
||||
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_mmx;
|
||||
|
||||
|
||||
|
||||
rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx;
|
||||
rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx;
|
||||
|
@ -32,8 +32,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
|
||||
{
|
||||
pbi->dequant.block = vp8_dequantize_b_v6;
|
||||
pbi->dequant.idct_add = vp8_dequant_idct_add_v6;
|
||||
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_v6;
|
||||
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
|
||||
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
|
||||
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
|
||||
}
|
||||
@ -44,9 +42,6 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
|
||||
{
|
||||
pbi->dequant.block = vp8_dequantize_b_neon;
|
||||
pbi->dequant.idct_add = vp8_dequant_idct_add_neon;
|
||||
/*This is not used: NEON always dequants two blocks at once.
|
||||
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_neon;*/
|
||||
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
|
||||
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
|
||||
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
|
||||
}
|
||||
|
@ -1,213 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dequant_dc_idct_add_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
;void vp8_dequant_dc_idct_add_v6(short *input, short *dq,
|
||||
; unsigned char *dest, int stride, int Dc)
|
||||
; r0 = input
|
||||
; r1 = dq
|
||||
; r2 = dst
|
||||
; r3 = stride
|
||||
; sp + 36 = Dc
|
||||
|
||||
|
||||
|vp8_dequant_dc_idct_add_v6| PROC
|
||||
stmdb sp!, {r4-r11, lr}
|
||||
|
||||
ldr r6, [sp, #36]
|
||||
|
||||
ldr r4, [r0] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
sub sp, sp, #4
|
||||
str r3, [sp]
|
||||
|
||||
smultt r7, r4, r5
|
||||
|
||||
ldr r4, [r0, #4] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
ldr r4, [r0, #4] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
mov r12, #3
|
||||
|
||||
vp8_dequant_dc_add_loop
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
ldr r4, [r0, #4] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
ldrne r4, [r0, #4]
|
||||
ldrne r5, [r1], #4
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
bne vp8_dequant_dc_add_loop
|
||||
|
||||
sub r0, r0, #32
|
||||
mov r1, r0
|
||||
|
||||
; short_idct4x4llm_v6_dual
|
||||
ldr r3, cospi8sqrt2minus1
|
||||
ldr r4, sinpi8sqrt2
|
||||
ldr r6, [r0, #8]
|
||||
mov r5, #2
|
||||
vp8_dequant_dc_idct_loop1_v6
|
||||
ldr r12, [r0, #24]
|
||||
ldr r14, [r0, #16]
|
||||
smulwt r9, r3, r6
|
||||
smulwb r7, r3, r6
|
||||
smulwt r10, r4, r6
|
||||
smulwb r8, r4, r6
|
||||
pkhbt r7, r7, r9, lsl #16
|
||||
smulwt r11, r3, r12
|
||||
pkhbt r8, r8, r10, lsl #16
|
||||
uadd16 r6, r6, r7
|
||||
smulwt r7, r4, r12
|
||||
smulwb r9, r3, r12
|
||||
smulwb r10, r4, r12
|
||||
subs r5, r5, #1
|
||||
pkhbt r9, r9, r11, lsl #16
|
||||
ldr r11, [r0], #4
|
||||
pkhbt r10, r10, r7, lsl #16
|
||||
uadd16 r7, r12, r9
|
||||
usub16 r7, r8, r7
|
||||
uadd16 r6, r6, r10
|
||||
uadd16 r10, r11, r14
|
||||
usub16 r8, r11, r14
|
||||
uadd16 r9, r10, r6
|
||||
usub16 r10, r10, r6
|
||||
uadd16 r6, r8, r7
|
||||
usub16 r7, r8, r7
|
||||
str r6, [r1, #8]
|
||||
ldrne r6, [r0, #8]
|
||||
str r7, [r1, #16]
|
||||
str r10, [r1, #24]
|
||||
str r9, [r1], #4
|
||||
bne vp8_dequant_dc_idct_loop1_v6
|
||||
|
||||
mov r5, #2
|
||||
sub r0, r1, #8
|
||||
vp8_dequant_dc_idct_loop2_v6
|
||||
ldr r6, [r0], #4
|
||||
ldr r7, [r0], #4
|
||||
ldr r8, [r0], #4
|
||||
ldr r9, [r0], #4
|
||||
smulwt r1, r3, r6
|
||||
smulwt r12, r4, r6
|
||||
smulwt lr, r3, r8
|
||||
smulwt r10, r4, r8
|
||||
pkhbt r11, r8, r6, lsl #16
|
||||
pkhbt r1, lr, r1, lsl #16
|
||||
pkhbt r12, r10, r12, lsl #16
|
||||
pkhtb r6, r6, r8, asr #16
|
||||
uadd16 r6, r1, r6
|
||||
pkhbt lr, r9, r7, lsl #16
|
||||
uadd16 r10, r11, lr
|
||||
usub16 lr, r11, lr
|
||||
pkhtb r8, r7, r9, asr #16
|
||||
subs r5, r5, #1
|
||||
smulwt r1, r3, r8
|
||||
smulwb r7, r3, r8
|
||||
smulwt r11, r4, r8
|
||||
smulwb r9, r4, r8
|
||||
pkhbt r1, r7, r1, lsl #16
|
||||
uadd16 r8, r1, r8
|
||||
pkhbt r11, r9, r11, lsl #16
|
||||
usub16 r1, r12, r8
|
||||
uadd16 r8, r11, r6
|
||||
ldr r9, c0x00040004
|
||||
ldr r12, [sp] ; get stride from stack
|
||||
uadd16 r6, r10, r8
|
||||
usub16 r7, r10, r8
|
||||
uadd16 r7, r7, r9
|
||||
uadd16 r6, r6, r9
|
||||
uadd16 r10, r14, r1
|
||||
usub16 r1, r14, r1
|
||||
uadd16 r10, r10, r9
|
||||
uadd16 r1, r1, r9
|
||||
ldr r11, [r2] ; load input from dst
|
||||
mov r8, r7, asr #3
|
||||
pkhtb r9, r8, r10, asr #19
|
||||
mov r8, r1, asr #3
|
||||
pkhtb r8, r8, r6, asr #19
|
||||
uxtb16 lr, r11, ror #8
|
||||
qadd16 r9, r9, lr
|
||||
uxtb16 lr, r11
|
||||
qadd16 r8, r8, lr
|
||||
usat16 r9, #8, r9
|
||||
usat16 r8, #8, r8
|
||||
orr r9, r8, r9, lsl #8
|
||||
ldr r11, [r2, r12] ; load input from dst
|
||||
mov r7, r7, lsl #16
|
||||
mov r1, r1, lsl #16
|
||||
mov r10, r10, lsl #16
|
||||
mov r6, r6, lsl #16
|
||||
mov r7, r7, asr #3
|
||||
pkhtb r7, r7, r10, asr #19
|
||||
mov r1, r1, asr #3
|
||||
pkhtb r1, r1, r6, asr #19
|
||||
uxtb16 r8, r11, ror #8
|
||||
qadd16 r7, r7, r8
|
||||
uxtb16 r8, r11
|
||||
qadd16 r1, r1, r8
|
||||
usat16 r7, #8, r7
|
||||
usat16 r1, #8, r1
|
||||
orr r1, r1, r7, lsl #8
|
||||
str r9, [r2], r12 ; store output to dst
|
||||
str r1, [r2], r12 ; store output to dst
|
||||
bne vp8_dequant_dc_idct_loop2_v6
|
||||
|
||||
; vpx_memset
|
||||
sub r0, r0, #32
|
||||
add sp, sp, #4
|
||||
|
||||
mov r12, #0
|
||||
str r12, [r0]
|
||||
str r12, [r0, #4]
|
||||
str r12, [r0, #8]
|
||||
str r12, [r0, #12]
|
||||
str r12, [r0, #16]
|
||||
str r12, [r0, #20]
|
||||
str r12, [r0, #24]
|
||||
str r12, [r0, #28]
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_dequant_dc_idct_add_v6|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x00004E7B
|
||||
sinpi8sqrt2 DCD 0x00008A8C
|
||||
c0x00040004 DCD 0x00040004
|
||||
|
||||
END
|
@ -13,47 +13,6 @@
|
||||
#include "vp8/decoder/dequantize.h"
|
||||
|
||||
|
||||
/* Walk a 4x4 grid of 4x4 blocks (four rows of four blocks) and, per block,
 * dispatch on its end-of-block count:
 *   eobs[n] > 1  -> vp8_dequant_dc_idct_add_v6 on the block's coefficients,
 *                   passing dc[n] as the DC value;
 *   eobs[n] == 1 -> vp8_dc_only_idct_add_v6 with dc[n], using dst as both
 *                   of its pointer arguments;
 *   otherwise    -> the block is left untouched.
 *
 * q:      coefficients, 16 shorts per block (advanced by 64 per row)
 * dq:     dequantisation factors, passed unchanged to every block
 * dst:    destination pixels; blocks in a row are 4 bytes apart, rows are
 *         4*stride apart
 * stride: destination stride in bytes
 * eobs:   one count per block, 4 per row
 * dc:     one DC value per block, 4 per row
 */
void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
|
||||
unsigned char *dst, int stride,
|
||||
char *eobs, short *dc)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* one iteration per row of four blocks */
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]);
|
||||
else if (eobs[0] == 1)
|
||||
vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride);
|
||||
|
||||
if (eobs[1] > 1)
|
||||
{
|
||||
vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]);
|
||||
}
|
||||
else if (eobs[1] == 1)
|
||||
vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride);
|
||||
|
||||
if (eobs[2] > 1)
|
||||
{
|
||||
vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]);
|
||||
}
|
||||
else if (eobs[2] == 1)
|
||||
vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride);
|
||||
|
||||
if (eobs[3] > 1)
|
||||
{
|
||||
vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]);
|
||||
}
|
||||
else if (eobs[3] == 1)
|
||||
vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride);
|
||||
|
||||
/* step to the next row of four blocks */
q += 64;
|
||||
dc += 4;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, char *eobs)
|
||||
|
@ -15,8 +15,6 @@
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_dequant_block(vp8_dequantize_b_v6);
|
||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
|
||||
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6);
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
|
||||
|
||||
@ -27,12 +25,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
|
||||
#undef vp8_dequant_idct_add
|
||||
#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
|
||||
|
||||
#undef vp8_dequant_dc_idct_add
|
||||
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
|
||||
|
||||
#undef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6
|
||||
|
||||
#undef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6
|
||||
|
||||
@ -44,8 +36,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_dequant_block(vp8_dequantize_b_neon);
|
||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
|
||||
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon);
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
|
||||
|
||||
@ -57,12 +47,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
|
||||
#undef vp8_dequant_idct_add
|
||||
#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
|
||||
|
||||
#undef vp8_dequant_dc_idct_add
|
||||
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
|
||||
|
||||
#undef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon
|
||||
|
||||
#undef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
|
||||
|
||||
|
@ -15,46 +15,11 @@
|
||||
/* place these declarations here because we don't want to maintain them
|
||||
* outside of this scope
|
||||
*/
|
||||
void idct_dequant_dc_full_2x_neon(short *input, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, short *dc);
|
||||
void idct_dequant_dc_0_2x_neon(short *input, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, short *dc);
|
||||
void idct_dequant_full_2x_neon(short *q, short *dq,
|
||||
unsigned char *dst, int stride);
|
||||
void idct_dequant_0_2x_neon(short *q, short dq,
|
||||
unsigned char *dst, int stride);
|
||||
|
||||
/* Walk four rows of four 4x4 blocks, handling the blocks two at a time.
 * For each pair, the two end-of-block counts are read together as one short:
 *   both zero                    -> the pair is skipped;
 *   any bit set under 0xfefe     -> idct_dequant_dc_full_2x_neon (0xfefe
 *                                   masks off bit 0 of each byte, so this
 *                                   means at least one count is not 0 or 1);
 *   otherwise (counts are 0 / 1) -> idct_dequant_dc_0_2x_neon.
 *
 * q:      coefficients; a pair spans 32 shorts, a row 64
 * dq:     dequantisation factors, passed unchanged
 * dst:    destination pixels; pairs are 8 bytes apart, rows 4*stride apart
 * stride: destination stride in bytes
 * eobs:   one count per block, 4 per row
 * dc:     one DC value per block, 4 per row; a pointer to the pair's first
 *         entry is passed down
 *
 * NOTE(review): the (short *) casts read eobs through a different type and
 * assume it is 2-byte aligned -- confirm against the caller's buffer.
 */
void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, char *eobs, short *dc)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
/* left pair of the row: blocks 0 and 1 */
if (((short *)(eobs))[0])
|
||||
{
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc);
|
||||
else
|
||||
idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc);
|
||||
}
|
||||
|
||||
/* right pair of the row: blocks 2 and 3 */
if (((short *)(eobs))[1])
|
||||
{
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2);
|
||||
else
|
||||
idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2);
|
||||
}
|
||||
/* step to the next row of four blocks */
q += 64;
|
||||
dc += 4;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
|
||||
unsigned char *dst,
|
||||
|
@ -1,75 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |idct_dequant_dc_0_2x_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;void idct_dequant_dc_0_2x_neon(short *q, short *dq,
|
||||
; unsigned char *dst, int stride, short *dc);
|
||||
; r0 *q,
|
||||
; r1 *dq,
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
; sp *dc
|
||||
; DC-only reconstruction of two horizontally adjacent 4x4 blocks.
; For each block the DC term (dc + 4) >> 3 is added to the 4x4 pixels already
; at dst, the sums are saturated to unsigned 8 bits and written back in place.
; The "lo" block is at dst and uses dc[0]; the "hi" block is at dst + 4 and
; uses dc[1].
;   r0, r1 = q, dq (not read; reused as scratch)
;   r2 = dst, r3 = stride, [sp] = pointer to the two DC values
; NOTE(review): d8/d10 (q4/q5) are overwritten without being saved; the ARM
; procedure call standard lists d8-d15 as callee-saved -- confirm that this
; is safe for the callers.
|idct_dequant_dc_0_2x_neon| PROC
|
||||
|
||||
; no q- or dq-coeffs, so r0 and r1 are free to use
|
||||
ldr r1, [sp] ; *dc
|
||||
add r12, r2, #4
|
||||
; one 32-bit load fetches both 16-bit DC values
ldr r0, [r1]
|
||||
|
||||
; load four rows of four pixels for each block
vld1.32 {d2[0]}, [r2], r3 ; lo
|
||||
vld1.32 {d8[0]}, [r12], r3 ; hi
|
||||
vld1.32 {d2[1]}, [r2], r3
|
||||
vld1.32 {d8[1]}, [r12], r3
|
||||
vld1.32 {d4[0]}, [r2], r3
|
||||
vld1.32 {d10[0]}, [r12], r3
|
||||
vld1.32 {d4[1]}, [r2], r3
|
||||
vld1.32 {d10[1]}, [r12]
|
||||
|
||||
; (dc + 4) >> 3 for each block, broadcast to all eight 16-bit lanes
sxth r1, r0 ; lo *dc
|
||||
add r1, r1, #4
|
||||
asr r1, r1, #3
|
||||
vdup.16 q0, r1
|
||||
sxth r0, r0, ror #16 ; hi *dc
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
vdup.16 q3, r0
|
||||
|
||||
; widen the pixels to 16 bits and add the DC term
vaddw.u8 q1, q0, d2 ; lo
|
||||
vaddw.u8 q2, q0, d4
|
||||
vaddw.u8 q4, q3, d8 ; hi
|
||||
vaddw.u8 q5, q3, d10
|
||||
|
||||
; saturate back to unsigned 8-bit pixels
vqmovun.s16 d2, q1 ; lo
|
||||
vqmovun.s16 d4, q2
|
||||
vqmovun.s16 d8, q4 ; hi
|
||||
vqmovun.s16 d10, q5
|
||||
|
||||
; r2 was post-incremented by stride four times during the loads
sub r2, r2, r3, lsl #2 ; dst - 4*stride
|
||||
add r0, r2, #4
|
||||
|
||||
; store the eight rows back over the pixels that were loaded
vst1.32 {d2[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d8[0]}, [r0], r3 ; hi
|
||||
vst1.32 {d2[1]}, [r2], r3
|
||||
vst1.32 {d8[1]}, [r0], r3
|
||||
vst1.32 {d4[0]}, [r2], r3
|
||||
vst1.32 {d10[0]}, [r0], r3
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d10[1]}, [r0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP ;|idct_dequant_dc_0_2x_neon|
|
||||
END
|
@ -1,208 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |idct_dequant_dc_full_2x_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_dc_full_2x_neon(short *q, short *dq,
|
||||
; unsigned char *dst, int stride, short *dc);
|
||||
; r0 *q,
|
||||
; r1 *dq,
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
; sp *dc
|
||||
; Dequantizes and inverse-transforms two side-by-side 4x4 blocks and adds the
; result to the pixels at dst, in place.
;   q holds 2 x 16 coefficients (left block first, right block at q + 32
;   bytes); both blocks are multiplied by the same 16 dq factors. Coefficient 0
;   of each block is then replaced by dc[0] (left) / dc[1] (right), read via
;   the stack argument. The 4 rows at dst and at dst + 4 serve as predictor
;   and are overwritten with the saturated sums. All 64 bytes at q are zeroed.
; NOTE(review): q4-q7 (d8-d15) are clobbered without being saved; AAPCS lists
;   d8-d15 as callee-saved - confirm this is acceptable for every caller.
|idct_dequant_dc_full_2x_neon| PROC
|
||||
push {r4}
|
||||
|
||||
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
|
||||
vld1.16 {q2, q3}, [r0] ; l q
|
||||
add r0, r0, #32
|
||||
vld1.16 {q4, q5}, [r0] ; r q
|
||||
add r12, r2, #4
|
||||
|
||||
; interleave the predictors
|
||||
vld1.32 {d28[0]}, [r2], r3 ; l pre
|
||||
vld1.32 {d28[1]}, [r12], r3 ; r pre
|
||||
vld1.32 {d29[0]}, [r2], r3
|
||||
vld1.32 {d29[1]}, [r12], r3
|
||||
vld1.32 {d30[0]}, [r2], r3
|
||||
vld1.32 {d30[1]}, [r12], r3
|
||||
vld1.32 {d31[0]}, [r2], r3
|
||||
; dc is the first stack argument; the offset of 4 skips the r4 pushed above
ldr r1, [sp, #4] ; *dc
|
||||
vld1.32 {d31[1]}, [r12]
|
||||
|
||||
adr r4, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
ldrh r12, [r1], #2 ; lo *dc
|
||||
ldrh r1, [r1] ; hi *dc
|
||||
|
||||
; dequant: q[i] = q[i] * dq[i]
|
||||
vmul.i16 q2, q2, q0
|
||||
vmul.i16 q3, q3, q1
|
||||
vmul.i16 q4, q4, q0
|
||||
vmul.i16 q5, q5, q1
|
||||
|
||||
; move dc up to neon and overwrite first element
|
||||
vmov.16 d4[0], r12
|
||||
vmov.16 d8[0], r1
|
||||
|
||||
; loads 8 bytes from the constant pool: lane 0 = cospi8sqrt2minus1 and
; lane 2 = sinpi8sqrt2, because each constant is stored as a 32-bit DCD word
; (assumes little-endian data - TODO confirm)
vld1.16 {d0}, [r4]
|
||||
|
||||
; q2: l0r0 q3: l8r8
|
||||
; q4: l4r4 q5: l12r12
|
||||
vswp d5, d8
|
||||
vswp d7, d10
|
||||
|
||||
; _CONSTANTS_ * 4,12 >> 16
|
||||
; q6: 4 * sinpi : c1/temp1
|
||||
; q7: 12 * sinpi : d1/temp2
|
||||
; q8: 4 * cospi
|
||||
; q9: 12 * cospi
|
||||
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
|
||||
vqdmulh.s16 q7, q5, d0[2]
|
||||
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
|
||||
vqdmulh.s16 q9, q5, d0[0]
|
||||
|
||||
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
|
||||
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
|
||||
|
||||
; vqdmulh only accepts signed values. this was a problem because
|
||||
; our constant had the high bit set, and was treated as a negative value.
|
||||
; vqdmulh also doubles the value before it shifts by 16. we need to
|
||||
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
|
||||
; so we can shift the constant without losing precision. this avoids
|
||||
; shift again afterward, but also avoids the sign issue. win win!
|
||||
; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
|
||||
; pre-shift it
|
||||
vshr.s16 q8, q8, #1
|
||||
vshr.s16 q9, q9, #1
|
||||
|
||||
; q4: 4 + 4 * cospi : d1/temp1
|
||||
; q5: 12 + 12 * cospi : c1/temp2
|
||||
vqadd.s16 q4, q4, q8
|
||||
vqadd.s16 q5, q5, q9
|
||||
|
||||
; c1 = temp1 - temp2
|
||||
; d1 = temp1 + temp2
|
||||
vqsub.s16 q2, q6, q5
|
||||
vqadd.s16 q3, q4, q7
|
||||
|
||||
; [0]: a1+d1
|
||||
; [1]: b1+c1
|
||||
; [2]: b1-c1
|
||||
; [3]: a1-d1
|
||||
vqadd.s16 q4, q10, q3
|
||||
vqadd.s16 q5, q11, q2
|
||||
vqsub.s16 q6, q11, q2
|
||||
vqsub.s16 q7, q10, q3
|
||||
|
||||
; rotate
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vtrn.16 q4, q5
|
||||
vtrn.16 q6, q7
|
||||
; idct loop 2
|
||||
; q4: l 0, 4, 8,12 r 0, 4, 8,12
|
||||
; q5: l 1, 5, 9,13 r 1, 5, 9,13
|
||||
; q6: l 2, 6,10,14 r 2, 6,10,14
|
||||
; q7: l 3, 7,11,15 r 3, 7,11,15
|
||||
|
||||
; q8: 1 * sinpi : c1/temp1
|
||||
; q9: 3 * sinpi : d1/temp2
|
||||
; q10: 1 * cospi
|
||||
; q11: 3 * cospi
|
||||
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
|
||||
vqdmulh.s16 q9, q7, d0[2]
|
||||
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
|
||||
vqdmulh.s16 q11, q7, d0[0]
|
||||
|
||||
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
|
||||
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
|
||||
|
||||
; see note on shifting above
|
||||
vshr.s16 q10, q10, #1
|
||||
vshr.s16 q11, q11, #1
|
||||
|
||||
; q10: 1 + 1 * cospi : d1/temp1
|
||||
; q11: 3 + 3 * cospi : c1/temp2
|
||||
vqadd.s16 q10, q5, q10
|
||||
vqadd.s16 q11, q7, q11
|
||||
|
||||
; q8: c1 = temp1 - temp2
|
||||
; q9: d1 = temp1 + temp2
|
||||
vqsub.s16 q8, q8, q11
|
||||
vqadd.s16 q9, q10, q9
|
||||
|
||||
; a1+d1
|
||||
; b1+c1
|
||||
; b1-c1
|
||||
; a1-d1
|
||||
vqadd.s16 q4, q2, q9
|
||||
vqadd.s16 q5, q3, q8
|
||||
vqsub.s16 q6, q3, q8
|
||||
vqsub.s16 q7, q2, q9
|
||||
|
||||
; +4 >> 3 (rounding)
|
||||
vrshr.s16 q4, q4, #3 ; lo
|
||||
vrshr.s16 q5, q5, #3
|
||||
vrshr.s16 q6, q6, #3 ; hi
|
||||
vrshr.s16 q7, q7, #3
|
||||
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vtrn.16 q4, q5
|
||||
vtrn.16 q6, q7
|
||||
|
||||
; adding pre
|
||||
; input is still packed. pre was read interleaved
|
||||
vaddw.u8 q4, q4, d28
|
||||
vaddw.u8 q5, q5, d29
|
||||
vaddw.u8 q6, q6, d30
|
||||
vaddw.u8 q7, q7, d31
|
||||
|
||||
vmov.i16 q14, #0
|
||||
vmov q15, q14
|
||||
; r0 still points at the right block's coefficients (advanced by 32 above)
vst1.16 {q14, q15}, [r0] ; write over high input
|
||||
sub r0, r0, #32
|
||||
vst1.16 {q14, q15}, [r0] ; write over low input
|
||||
|
||||
; r2 was post-incremented by stride on all four predictor loads; rewind to dst
sub r2, r2, r3, lsl #2 ; dst - 4*stride
|
||||
add r1, r2, #4 ; hi
|
||||
|
||||
;saturate and narrow
|
||||
vqmovun.s16 d0, q4 ; lo
|
||||
vqmovun.s16 d1, q5
|
||||
vqmovun.s16 d2, q6 ; hi
|
||||
vqmovun.s16 d3, q7
|
||||
|
||||
vst1.32 {d0[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d0[1]}, [r1], r3 ; hi
|
||||
vst1.32 {d1[0]}, [r2], r3
|
||||
vst1.32 {d1[1]}, [r1], r3
|
||||
vst1.32 {d2[0]}, [r2], r3
|
||||
vst1.32 {d2[1]}, [r1], r3
|
||||
vst1.32 {d3[0]}, [r2]
|
||||
vst1.32 {d3[1]}, [r1]
|
||||
|
||||
pop {r4}
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_dc_full_2x_neon|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x4e7b
|
||||
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
|
||||
sinpi8sqrt2 DCD 0x4546
|
||||
|
||||
END
|
@ -232,45 +232,53 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else if (mode == SPLITMV)
|
||||
else
|
||||
{
|
||||
short *DQC = xd->block[0].dequant;
|
||||
|
||||
/* save the dc dequant constant in case it is overridden */
|
||||
short dc_dequant_temp = DQC[0];
|
||||
|
||||
if (mode != SPLITMV)
|
||||
{
|
||||
BLOCKD *b = &xd->block[24];
|
||||
|
||||
/* do 2nd order transform on the dc block */
|
||||
if (xd->eobs[24] > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, block)(b);
|
||||
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
|
||||
xd->qcoeff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
((int *)b->qcoeff)[1] = 0;
|
||||
((int *)b->qcoeff)[2] = 0;
|
||||
((int *)b->qcoeff)[3] = 0;
|
||||
((int *)b->qcoeff)[4] = 0;
|
||||
((int *)b->qcoeff)[5] = 0;
|
||||
((int *)b->qcoeff)[6] = 0;
|
||||
((int *)b->qcoeff)[7] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0],
|
||||
xd->qcoeff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
|
||||
/* override the dc dequant constant */
|
||||
DQC[0] = 1;
|
||||
}
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
|
||||
(xd->qcoeff, xd->block[0].dequant,
|
||||
xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs);
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCKD *b = &xd->block[24];
|
||||
|
||||
/* do 2nd order transform on the dc block */
|
||||
if (xd->eobs[24] > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, block)(b);
|
||||
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
((int *)b->qcoeff)[1] = 0;
|
||||
((int *)b->qcoeff)[2] = 0;
|
||||
((int *)b->qcoeff)[3] = 0;
|
||||
((int *)b->qcoeff)[4] = 0;
|
||||
((int *)b->qcoeff)[5] = 0;
|
||||
((int *)b->qcoeff)[6] = 0;
|
||||
((int *)b->qcoeff)[7] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
|
||||
(xd->qcoeff, xd->block[0].dequant,
|
||||
xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
|
||||
/* restore the dc dequant constant */
|
||||
DQC[0] = dc_dequant_temp;
|
||||
}
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
|
||||
|
@ -42,22 +42,3 @@ void vp8_dequant_idct_add_c(short *input, short *dq,
|
||||
vpx_memset(input, 0, 32);
|
||||
|
||||
}
|
||||
|
||||
/* Dequantize one 4x4 block whose DC value is supplied separately, inverse
 * transform it and add the result to dest.
 *
 * input[0] is overwritten with Dc (truncated to short); input[1..15] are
 * multiplied in place by dq[1..15] (dq[0] is not read). The block is then
 * handed to vp8_short_idct4x4llm_c with dest/stride passed for both of its
 * pointer/stride pairs, and finally 32 bytes of input are cleared
 * (16 coefficients, assuming a 2-byte short).
 */
void vp8_dequant_dc_idct_add_c(short *input, short *dq,
|
||||
unsigned char *dest, int stride,
|
||||
int Dc)
|
||||
{
|
||||
int i;
|
||||
|
||||
input[0] = (short)Dc;
|
||||
|
||||
/* index 0 is skipped: it already holds the externally supplied Dc */
for (i = 1; i < 16; i++)
|
||||
{
|
||||
input[i] = dq[i] * input[i];
|
||||
}
|
||||
|
||||
vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
|
||||
|
||||
vpx_memset(input, 0, 32);
|
||||
|
||||
}
|
||||
|
@ -21,17 +21,6 @@
|
||||
unsigned char *output, \
|
||||
int stride)
|
||||
|
||||
#define prototype_dequant_dc_idct_add(sym) \
|
||||
void sym(short *input, short *dq, \
|
||||
unsigned char *dst, \
|
||||
int stride, \
|
||||
int dc)
|
||||
|
||||
#define prototype_dequant_dc_idct_add_y_block(sym) \
|
||||
void sym(short *q, short *dq, \
|
||||
unsigned char *dst, \
|
||||
int stride, char *eobs, short *dc)
|
||||
|
||||
#define prototype_dequant_idct_add_y_block(sym) \
|
||||
void sym(short *q, short *dq, \
|
||||
unsigned char *dst, \
|
||||
@ -60,16 +49,6 @@ extern prototype_dequant_block(vp8_dequant_block);
|
||||
#endif
|
||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add);
|
||||
|
||||
#ifndef vp8_dequant_dc_idct_add
|
||||
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
|
||||
#endif
|
||||
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
|
||||
|
||||
#ifndef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c
|
||||
#endif
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block);
|
||||
|
||||
#ifndef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
|
||||
#endif
|
||||
@ -85,10 +64,6 @@ typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
|
||||
|
||||
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
|
||||
|
||||
typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
|
||||
|
||||
typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t));
|
||||
|
||||
typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t));
|
||||
|
||||
typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t));
|
||||
@ -97,8 +72,6 @@ typedef struct
|
||||
{
|
||||
vp8_dequant_block_fn_t block;
|
||||
vp8_dequant_idct_add_fn_t idct_add;
|
||||
vp8_dequant_dc_idct_add_fn_t dc_idct_add;
|
||||
vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
|
||||
vp8_dequant_idct_add_y_block_fn_t idct_add_y_block;
|
||||
vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block;
|
||||
} vp8_dequant_rtcd_vtable_t;
|
||||
|
@ -23,8 +23,6 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
|
||||
pbi->mb.rtcd = &pbi->common.rtcd;
|
||||
pbi->dequant.block = vp8_dequantize_b_c;
|
||||
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
|
||||
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
|
||||
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
|
||||
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c;
|
||||
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
|
||||
#endif
|
||||
|
@ -12,39 +12,12 @@
|
||||
#include "vp8/common/idct.h"
|
||||
#include "dequantize.h"
|
||||
|
||||
void vp8_dequant_dc_idct_add_c(short *input, short *dq,
|
||||
unsigned char *dest, int stride,
|
||||
int Dc);
|
||||
void vp8_dequant_idct_add_c(short *input, short *dq,
|
||||
unsigned char *dest, int stride);
|
||||
void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
|
||||
int pred_stride, unsigned char *dst_ptr,
|
||||
int dst_stride);
|
||||
|
||||
/* Process 16 consecutive 4x4 blocks laid out as 4 rows of 4 blocks.
 *
 * For each block one entry of eobs selects the path: a value greater than 1
 * calls vp8_dequant_dc_idct_add_c, anything else (including 0) calls
 * vp8_dc_only_idct_add_c. Either way the block's DC comes from dc[0] at the
 * current dc position. Per block, q advances by 16 coefficients, dst by 4
 * pixels and dc by one entry.
 */
void vp8_dequant_dc_idct_add_y_block_c
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs, short *dc)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
for (j = 0; j < 4; j++)
|
||||
{
|
||||
if (*eobs++ > 1)
|
||||
vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]);
|
||||
else
|
||||
vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride);
|
||||
|
||||
q += 16;
|
||||
dst += 4;
|
||||
dc ++;
|
||||
}
|
||||
|
||||
/* undo the 16 pixels stepped across this row, then move down 4 lines */
dst += 4*stride - 16;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_c
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs)
|
||||
|
@ -175,36 +175,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
|
||||
#endif
|
||||
|
||||
/* dequantization and idct */
|
||||
if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
|
||||
{
|
||||
BLOCKD *b = &xd->block[24];
|
||||
DEQUANT_INVOKE(&pbi->dequant, block)(b);
|
||||
|
||||
/* do 2nd order transform on the dc block */
|
||||
if (xd->eobs[24] > 1)
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
((int *)b->qcoeff)[1] = 0;
|
||||
((int *)b->qcoeff)[2] = 0;
|
||||
((int *)b->qcoeff)[3] = 0;
|
||||
((int *)b->qcoeff)[4] = 0;
|
||||
((int *)b->qcoeff)[5] = 0;
|
||||
((int *)b->qcoeff)[6] = 0;
|
||||
((int *)b->qcoeff)[7] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
|
||||
(xd->qcoeff, xd->block[0].dequant,
|
||||
xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
|
||||
}
|
||||
else if (xd->mode_info_context->mbmi.mode == B_PRED)
|
||||
if (xd->mode_info_context->mbmi.mode == B_PRED)
|
||||
{
|
||||
for (i = 0; i < 16; i++)
|
||||
{
|
||||
@ -214,26 +185,71 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
|
||||
vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
|
||||
b->dst_stride, mb_row, mb_col, i);
|
||||
|
||||
if (xd->eobs[i] > 1)
|
||||
if (xd->eobs[i] )
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)
|
||||
(b->qcoeff, b->dequant,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
|
||||
(b->qcoeff[0] * b->dequant[0],
|
||||
*(b->base_dst) + b->dst, b->dst_stride,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
if (xd->eobs[i] > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)
|
||||
(b->qcoeff, b->dequant,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
|
||||
(b->qcoeff[0] * b->dequant[0],
|
||||
*(b->base_dst) + b->dst, b->dst_stride,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
short *DQC = xd->block[0].dequant;
|
||||
|
||||
DECLARE_ALIGNED(16, short, local_dequant[16]);
|
||||
|
||||
if (xd->mode_info_context->mbmi.mode != SPLITMV)
|
||||
{
|
||||
BLOCKD *b = &xd->block[24];
|
||||
|
||||
/* do 2nd order transform on the dc block */
|
||||
if (xd->eobs[24] > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, block)(b);
|
||||
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
|
||||
xd->qcoeff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
((int *)b->qcoeff)[1] = 0;
|
||||
((int *)b->qcoeff)[2] = 0;
|
||||
((int *)b->qcoeff)[3] = 0;
|
||||
((int *)b->qcoeff)[4] = 0;
|
||||
((int *)b->qcoeff)[5] = 0;
|
||||
((int *)b->qcoeff)[6] = 0;
|
||||
((int *)b->qcoeff)[7] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], xd->qcoeff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
|
||||
/* make a local copy of the dequant constants */
|
||||
vpx_memcpy(local_dequant, xd->block[0].dequant,
|
||||
sizeof(local_dequant));
|
||||
|
||||
/* override the dc dequant constant */
|
||||
local_dequant[0] = 1;
|
||||
|
||||
/* use the new dequant constants */
|
||||
DQC = local_dequant;
|
||||
}
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
|
||||
(xd->qcoeff, xd->block[0].dequant,
|
||||
(xd->qcoeff, DQC,
|
||||
xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs);
|
||||
}
|
||||
@ -244,7 +260,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
|
||||
xd->dst.uv_stride, xd->eobs+16);
|
||||
}
|
||||
|
||||
|
||||
static THREAD_FUNCTION thread_decoding_proc(void *p_data)
|
||||
{
|
||||
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
|
||||
|
@ -246,207 +246,6 @@ sym(vp8_dequant_idct_add_mmx):
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_dequant_dc_idct_add_mmx(
|
||||
;short *input, 0
|
||||
;short *dq, 1
|
||||
;unsigned char *dest, 2
|
||||
;int stride, 3
|
||||
;int Dc) 4
|
||||
global sym(vp8_dequant_dc_idct_add_mmx)
|
||||
; Dequantizes one 4x4 block, substitutes the supplied Dc for coefficient 0,
; runs two butterfly passes (each followed by a 4x4 transpose) and adds the
; result to the 4x4 pixels at dest.
;   - input[i] * dq[i] is computed for all 16 words, then the 32 bytes at
;     input are cleared.
;   - the second pass adds 4 before the final arithmetic shift right by 3.
;   - each output row is read from dest as 4 bytes, widened, added with signed
;     saturation, packed with unsigned saturation and written back.
sym(vp8_dequant_dc_idct_add_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
mov rax, arg(0) ;input
|
||||
mov rdx, arg(1) ;dq
|
||||
|
||||
movq mm0, [rax ]
|
||||
pmullw mm0, [rdx]
|
||||
|
||||
movq mm1, [rax +8]
|
||||
pmullw mm1, [rdx +8]
|
||||
|
||||
movq mm2, [rax+16]
|
||||
pmullw mm2, [rdx+16]
|
||||
|
||||
movq mm3, [rax+24]
|
||||
pmullw mm3, [rdx+24]
|
||||
|
||||
; arg(2) is dest: it is read below as the predictor and written as the output
mov rdx, arg(2) ;pred
|
||||
pxor mm7, mm7
|
||||
|
||||
|
||||
; clear the 16 input coefficients now that they are held in mm0-mm3
movq [rax], mm7
|
||||
movq [rax+8], mm7
|
||||
|
||||
movq [rax+16],mm7
|
||||
movq [rax+24],mm7
|
||||
|
||||
; move lower word of Dc to lower word of mm0
|
||||
; (shift right then left by 16 zeroes the low word before Dc is OR-ed in)
psrlq mm0, 16
|
||||
movzx rcx, word ptr arg(4) ;Dc
|
||||
psllq mm0, 16
|
||||
movq mm7, rcx
|
||||
por mm0, mm7
|
||||
|
||||
movsxd rax, dword ptr arg(3) ;stride
|
||||
|
||||
; first pass
psubw mm0, mm2 ; b1= 0-2
|
||||
paddw mm2, mm2 ;
|
||||
|
||||
movq mm5, mm1
|
||||
paddw mm2, mm0 ; a1 =0+2
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_s1sqr2)];
|
||||
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
|
||||
|
||||
movq mm7, mm3 ;
|
||||
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
|
||||
|
||||
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
|
||||
psubw mm7, mm5 ; c1
|
||||
|
||||
movq mm5, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
|
||||
paddw mm5, mm1
|
||||
|
||||
pmulhw mm3, [GLOBAL(x_s1sqr2)]
|
||||
paddw mm3, mm4
|
||||
|
||||
paddw mm3, mm5 ; d1
|
||||
movq mm6, mm2 ; a1
|
||||
|
||||
movq mm4, mm0 ; b1
|
||||
paddw mm2, mm3 ;0
|
||||
|
||||
paddw mm4, mm7 ;1
|
||||
psubw mm0, mm7 ;2
|
||||
|
||||
psubw mm6, mm3 ;3
|
||||
|
||||
; transpose the 4x4 intermediate
movq mm1, mm2 ; 03 02 01 00
|
||||
movq mm3, mm4 ; 23 22 21 20
|
||||
|
||||
punpcklwd mm1, mm0 ; 11 01 10 00
|
||||
punpckhwd mm2, mm0 ; 13 03 12 02
|
||||
|
||||
punpcklwd mm3, mm6 ; 31 21 30 20
|
||||
punpckhwd mm4, mm6 ; 33 23 32 22
|
||||
|
||||
movq mm0, mm1 ; 11 01 10 00
|
||||
movq mm5, mm2 ; 13 03 12 02
|
||||
|
||||
punpckldq mm0, mm3 ; 30 20 10 00
|
||||
punpckhdq mm1, mm3 ; 31 21 11 01
|
||||
|
||||
punpckldq mm2, mm4 ; 32 22 12 02
|
||||
punpckhdq mm5, mm4 ; 33 23 13 03
|
||||
|
||||
movq mm3, mm5 ; 33 23 13 03
|
||||
|
||||
; second pass
psubw mm0, mm2 ; b1= 0-2
|
||||
paddw mm2, mm2 ;
|
||||
|
||||
movq mm5, mm1
|
||||
paddw mm2, mm0 ; a1 =0+2
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_s1sqr2)];
|
||||
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
|
||||
|
||||
movq mm7, mm3 ;
|
||||
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
|
||||
|
||||
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
|
||||
psubw mm7, mm5 ; c1
|
||||
|
||||
movq mm5, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
|
||||
paddw mm5, mm1
|
||||
|
||||
pmulhw mm3, [GLOBAL(x_s1sqr2)]
|
||||
paddw mm3, mm4
|
||||
|
||||
paddw mm3, mm5 ; d1
|
||||
; adding 4 to both b1 and a1 puts the rounding term into all four outputs
paddw mm0, [GLOBAL(fours)]
|
||||
|
||||
paddw mm2, [GLOBAL(fours)]
|
||||
movq mm6, mm2 ; a1
|
||||
|
||||
movq mm4, mm0 ; b1
|
||||
paddw mm2, mm3 ;0
|
||||
|
||||
paddw mm4, mm7 ;1
|
||||
psubw mm0, mm7 ;2
|
||||
|
||||
psubw mm6, mm3 ;3
|
||||
psraw mm2, 3
|
||||
|
||||
psraw mm0, 3
|
||||
psraw mm4, 3
|
||||
|
||||
psraw mm6, 3
|
||||
|
||||
; transpose back
movq mm1, mm2 ; 03 02 01 00
|
||||
movq mm3, mm4 ; 23 22 21 20
|
||||
|
||||
punpcklwd mm1, mm0 ; 11 01 10 00
|
||||
punpckhwd mm2, mm0 ; 13 03 12 02
|
||||
|
||||
punpcklwd mm3, mm6 ; 31 21 30 20
|
||||
punpckhwd mm4, mm6 ; 33 23 32 22
|
||||
|
||||
movq mm0, mm1 ; 11 01 10 00
|
||||
movq mm5, mm2 ; 13 03 12 02
|
||||
|
||||
punpckldq mm0, mm3 ; 30 20 10 00
|
||||
punpckhdq mm1, mm3 ; 31 21 11 01
|
||||
|
||||
punpckldq mm2, mm4 ; 32 22 12 02
|
||||
punpckhdq mm5, mm4 ; 33 23 13 03
|
||||
|
||||
pxor mm7, mm7
|
||||
|
||||
; row 0: load 4 pixels, widen, add residual, pack with saturation, store
movd mm4, [rdx]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm0, mm4
|
||||
packuswb mm0, mm7
|
||||
movd [rdx], mm0
|
||||
|
||||
; row 1
movd mm4, [rdx+rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm1, mm4
|
||||
packuswb mm1, mm7
|
||||
movd [rdx+rax], mm1
|
||||
|
||||
; row 2
movd mm4, [rdx+2*rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm2, mm4
|
||||
packuswb mm2, mm7
|
||||
movd [rdx+rax*2], mm2
|
||||
|
||||
; advance one row so that [rdx+2*rax] addresses row 3
add rdx, rax
|
||||
|
||||
movd mm4, [rdx+2*rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm5, mm4
|
||||
packuswb mm5, mm7
|
||||
movd [rdx+rax*2], mm5
|
||||
|
||||
; begin epilog
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
x_s1sqr2:
|
||||
|
@ -22,8 +22,6 @@
|
||||
#if HAVE_MMX
|
||||
extern prototype_dequant_block(vp8_dequantize_b_mmx);
|
||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
|
||||
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx);
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
|
||||
|
||||
@ -34,12 +32,6 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
|
||||
#undef vp8_dequant_idct_add
|
||||
#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
|
||||
|
||||
#undef vp8_dequant_dc_idct_add
|
||||
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
|
||||
|
||||
#undef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx
|
||||
|
||||
#undef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx
|
||||
|
||||
@ -50,14 +42,10 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx);
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2);
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_dequant_dc_idct_add_y_block
|
||||
#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2
|
||||
|
||||
#undef vp8_dequant_idct_add_y_block
|
||||
#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
|
||||
|
||||
|
@ -12,41 +12,6 @@
|
||||
#include "vp8/common/idct.h"
|
||||
#include "vp8/decoder/dequantize.h"
|
||||
|
||||
/* Process 16 4x4 blocks as 4 rows of 4 blocks, with the row unrolled.
 *
 * For block k of a row (k = 0..3): eobs[k] > 1 calls
 * vp8_dequant_dc_idct_add_mmx on q + 16*k and dst + 4*k with dc[k];
 * eobs[k] == 1 calls vp8_dc_only_idct_add_mmx with dc[k]; for any other
 * value (e.g. 0) nothing is called for that block. After each row q advances
 * by 64 coefficients, dc and eobs by 4 entries and dst by 4 pixel rows.
 */
void vp8_dequant_dc_idct_add_y_block_mmx
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs, short *dc)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]);
|
||||
else if (eobs[0] == 1)
|
||||
vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride);
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]);
|
||||
else if (eobs[1] == 1)
|
||||
vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride);
|
||||
|
||||
if (eobs[2] > 1)
|
||||
vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]);
|
||||
else if (eobs[2] == 1)
|
||||
vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride);
|
||||
|
||||
if (eobs[3] > 1)
|
||||
vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]);
|
||||
else if (eobs[3] == 1)
|
||||
vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride);
|
||||
|
||||
q += 64;
|
||||
dc += 4;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_mmx
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs)
|
||||
|
@ -12,13 +12,6 @@
|
||||
#include "vp8/common/idct.h"
|
||||
#include "vp8/decoder/dequantize.h"
|
||||
|
||||
void vp8_idct_dequant_dc_0_2x_sse2
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int dst_stride, short *dc);
|
||||
void vp8_idct_dequant_dc_full_2x_sse2
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int dst_stride, short *dc);
|
||||
|
||||
void vp8_idct_dequant_0_2x_sse2
|
||||
(short *q, short *dq ,
|
||||
unsigned char *dst, int dst_stride);
|
||||
@ -26,36 +19,6 @@ void vp8_idct_dequant_full_2x_sse2
|
||||
(short *q, short *dq ,
|
||||
unsigned char *dst, int dst_stride);
|
||||
|
||||
/* Process 16 4x4 blocks as 4 rows, two blocks at a time.
 *
 * Each row is handled as a left pair (q, dst, dc) and a right pair
 * (q + 32, dst + 8, dc + 2). The two eob bytes of a pair are read together as
 * one short: if it is zero nothing is called for the pair; if any bit other
 * than bit 0 of either byte is set (mask 0xfefe) the full 2x routine is
 * called; otherwise the DC-only 2x routine is called.
 * NOTE(review): eobs is a char * read through a short * cast - this assumes
 * a 2-byte short and suitable alignment, and relies on type punning; confirm.
 */
void vp8_dequant_dc_idct_add_y_block_sse2
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs, short *dc)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
/* left pair: blocks 0 and 1 of this row */
if (((short *)(eobs))[0])
|
||||
{
|
||||
if (((short *)(eobs))[0] & 0xfefe)
|
||||
vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc);
|
||||
else
|
||||
vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc);
|
||||
}
|
||||
|
||||
/* right pair: blocks 2 and 3 of this row */
if (((short *)(eobs))[1])
|
||||
{
|
||||
if (((short *)(eobs))[1] & 0xfefe)
|
||||
vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
|
||||
else
|
||||
vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
|
||||
}
|
||||
q += 64;
|
||||
dc += 4;
|
||||
dst += stride*4;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_sse2
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs)
|
||||
|
@ -43,8 +43,6 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
|
||||
{
|
||||
pbi->dequant.block = vp8_dequantize_b_mmx;
|
||||
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
|
||||
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
|
||||
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;
|
||||
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
|
||||
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
|
||||
}
|
||||
@ -52,8 +50,6 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
|
||||
#if HAVE_SSE2
|
||||
if (flags & HAS_SSE2)
|
||||
{
|
||||
pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2;
|
||||
pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
|
||||
pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
|
||||
}
|
||||
#endif
|
||||
|
@ -16,14 +16,11 @@ VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.c
|
||||
VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/dequantize_arm.h
|
||||
|
||||
#File list for armv6
|
||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c
|
||||
|
||||
#File list for neon
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
|
||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
|
||||
|
Loading…
Reference in New Issue
Block a user