Merge "update arm idct functions"

This commit is contained in:
Fritz Koenig 2010-07-26 06:05:39 -07:00 committed by Code Review
commit 1743f9486b
20 changed files with 675 additions and 527 deletions

View File

@ -0,0 +1,67 @@
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
; r0 input_dc
; r1 pred_ptr
; r2 dst_ptr
; r3 pitch
; [sp] stride (first stacked argument on entry)
|vp8_dc_only_idct_add_v6| PROC
    ; Adds the rounded DC term a1 = (input_dc + 4) >> 3 to each pixel of a
    ; four-row, four-byte-wide predictor block, clamps every sum to 0..255
    ; and writes the rows to dst_ptr.
    ;   r0 = input_dc, r1 = pred_ptr, r2 = dst_ptr, r3 = pitch (pred row step)
    ;   stride (dst row step) is the first stacked argument.
    stmdb   sp!, {r4 - r7, lr}          ; 5 registers pushed = 20 bytes
    ldr     r12, c0x0000FFFF            ; halfword mask
    add     r0, r0, #4                  ; input_dc + 4 (rounding)
    ldr     lr, [sp, #20]               ; lr = stride, just above the saved registers
    and     r0, r12, r0, asr #3         ; a1 = ((input_dc + 4) >> 3) & 0xFFFF
    orr     r0, r0, r0, lsl #16         ; r0 = a1 | a1

    ; Read all four predictor rows before the first destination store.
    ldr     r4, [r1], r3                ; row 0
    ldr     r5, [r1], r3                ; row 1
    ldr     r6, [r1], r3                ; row 2
    ldr     r7, [r1]                    ; row 3

    ; Row 0: even and odd bytes are widened to halfwords, biased by a1,
    ; saturated to 8 bits and re-interleaved.
    uxtab16 r12, r0, r4                 ; a1+p2 | a1+p0
    uxtab16 r4, r0, r4, ror #8          ; a1+p3 | a1+p1
    usat16  r12, #8, r12
    usat16  r4, #8, r4
    orr     r4, r12, r4, lsl #8         ; bytes p3' p2' p1' p0'
    str     r4, [r2], lr

    ; Row 1
    uxtab16 r12, r0, r5
    uxtab16 r5, r0, r5, ror #8
    usat16  r12, #8, r12
    usat16  r5, #8, r5
    orr     r5, r12, r5, lsl #8
    str     r5, [r2], lr

    ; Row 2
    uxtab16 r12, r0, r6
    uxtab16 r6, r0, r6, ror #8
    usat16  r12, #8, r12
    usat16  r6, #8, r6
    orr     r6, r12, r6, lsl #8
    str     r6, [r2], lr

    ; Row 3 (no pointer write-back needed after the last store)
    uxtab16 r12, r0, r7
    uxtab16 r7, r0, r7, ror #8
    usat16  r12, #8, r12
    usat16  r7, #8, r7
    orr     r7, r12, r7, lsl #8
    str     r7, [r2]

    ldmia   sp!, {r4 - r7, pc}          ; restore and return
    ENDP ; |vp8_dc_only_idct_add_v6|
; Constant Pool
c0x0000FFFF DCD 0x0000FFFF
END

View File

@ -15,8 +15,6 @@
EXPORT |vp8_short_idct4x4llm_v6_scott|
EXPORT |vp8_short_idct4x4llm_v6_dual|
EXPORT |vp8_dc_only_idct_armv6|
AREA |.text|, CODE, READONLY
;********************************************************************************
@ -344,34 +342,4 @@ loop2_dual
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
; sjl added 10/17/08
;void vp8_dc_only_idct_armv6(short input_dc, short *output, int pitch)
|vp8_dc_only_idct_armv6| PROC
    ; Fills four rows of four shorts with a1 = (input_dc + 4) >> 3.
    ;   r0 = input_dc, r1 = output, r2 = pitch (added to the row pointer as a
    ;   byte offset).
    ; Only the argument registers r0-r2 are used, so nothing is saved.
    add     r0, r0, #0x4                ; input_dc + 4 (rounding)
    mov     r0, r0, ASR #0x3            ; a1
    pkhbt   r0, r0, r0, lsl #16         ; a1 | a1

    ; Each row is two words (four shorts); the first word's store steps the
    ; row pointer forward by pitch.
    str     r0, [r1, #4]                ; row 0, shorts 2-3
    str     r0, [r1], r2                ; row 0, shorts 0-1; r1 -> row 1
    str     r0, [r1, #4]                ; row 1
    str     r0, [r1], r2
    str     r0, [r1, #4]                ; row 2
    str     r0, [r1], r2
    str     r0, [r1, #4]                ; row 3
    str     r0, [r1]

    bx      lr
    ENDP ; |vp8_dc_only_idct_armv6|
END

View File

@ -8,8 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_armv6|
EXPORT |vp8_short_inv_walsh4x4_1_armv6|
EXPORT |vp8_short_inv_walsh4x4_v6|
EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
@ -17,8 +17,8 @@
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_inv_walsh4x4_armv6(short *input, short *output)
|vp8_short_inv_walsh4x4_armv6| PROC
;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
|vp8_short_inv_walsh4x4_v6| PROC
stmdb sp!, {r4 - r11, lr}
@ -123,11 +123,11 @@
str r5, [r1]
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_short_inv_walsh4x4_armv6|
ENDP ; |vp8_short_inv_walsh4x4_v6|
;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output)
|vp8_short_inv_walsh4x4_1_armv6| PROC
;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
|vp8_short_inv_walsh4x4_1_v6| PROC
ldrsh r2, [r0] ; [0]
add r2, r2, #3 ; [0] + 3
@ -145,7 +145,7 @@
str r2, [r1]
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_armv6|
ENDP ; |vp8_short_inv_walsh4x4_1_v6|
; Constant Pool
c0x00030003 DCD 0x00030003

View File

@ -15,8 +15,9 @@
#if HAVE_ARMV6
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6);
extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#undef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
@ -24,16 +25,20 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
#endif
#if HAVE_ARMV7
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
extern prototype_idct(vp8_short_idct4x4llm_neon);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
@ -43,6 +48,9 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
#undef vp8_idct_idct1_scalar_add
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon

View File

@ -0,0 +1,49 @@
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
; r0 input_dc
; r1 pred_ptr
; r2 dst_ptr
; r3 pitch
; sp stride
|vp8_dc_only_idct_add_neon| PROC
    ; Adds a1 = (input_dc + 4) >> 3 to a four-row, four-byte-wide predictor
    ; block, saturates each sum to an unsigned byte and stores the rows to
    ; dst_ptr.
    ;   r0 = input_dc, r1 = pred_ptr, r2 = dst_ptr, r3 = pitch, [sp] = stride
    ldr         r12, [sp]               ; r12 = stride

    ; Gather the four predictor rows: rows 0-1 in d0, rows 2-3 in d1.
    vld1.32     {d0[0]}, [r1], r3
    vld1.32     {d0[1]}, [r1], r3
    vld1.32     {d1[0]}, [r1], r3
    vld1.32     {d1[1]}, [r1]

    add         r0, r0, #4              ; input_dc + 4 (rounding)
    asr         r0, r0, #3              ; a1
    vdup.16     q8, r0                  ; a1 in all eight 16-bit lanes

    vaddw.u8    q1, q8, d0              ; widen pred to 16 bits and add a1
    vaddw.u8    q2, q8, d1

    vqmovun.s16 d0, q1                  ; signed 16 -> unsigned 8, saturating
    vqmovun.s16 d1, q2

    vst1.32     {d0[0]}, [r2], r12
    vst1.32     {d0[1]}, [r2], r12
    vst1.32     {d1[0]}, [r2], r12
    vst1.32     {d1[1]}, [r2]

    bx          lr
    ENDP
END

View File

@ -0,0 +1,218 @@
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dequant_dc_idct_add_v6|
AREA |.text|, CODE, READONLY
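;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride, int Dc)
; r0 = input
; r1 = dq
; r2 = pred
; r3 = dest
; Stack arguments are listed as offsets valid after the 36-byte register push;
; the "+4" value applies once the function has reserved its one-word local.
; sp + 36 = pitch ; +4 = 40
; sp + 40 = stride ; +4 = 44
; sp + 44 = Dc ; +4 = 48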
|vp8_dequant_dc_idct_add_v6| PROC
; Multiplies the coefficients at input by dq in place (coefficient 0 is
; replaced by the Dc argument instead), runs a two-pass 4x4 inverse transform
; on the buffer, adds the rounded result to the predictor rows, saturates to
; 8 bits, writes the rows to dest, and finally zeroes the 32-byte input buffer.
stmdb sp!, {r4-r11, lr} ; push 9 registers (36 bytes)
ldr r6, [sp, #44] ; r6 = Dc (third stacked argument)
ldr r4, [r0] ; input[1] | input[0]
ldr r5, [r1], #4 ; dq[1] | dq[0]
sub sp, sp, #4 ; reserve one word of stack for the dest pointer
str r3, [sp] ; [sp] = dest
smultt r7, r4, r5 ; input[1] * dq[1]
ldr r4, [r0, #4] ; input[3] | input[2]
ldr r5, [r1], #4 ; dq[3] | dq[2]
strh r6, [r0], #2 ; input[0] = Dc
strh r7, [r0], #2 ; input[1] = dequantized value
smulbb r6, r4, r5 ; input[2] * dq[2]
smultt r7, r4, r5 ; input[3] * dq[3]
ldr r4, [r0, #4] ; next input pair (r0 already points at input[2])
ldr r5, [r1], #4 ; next dq pair
strh r6, [r0], #2
strh r7, [r0], #2
mov r12, #3 ; 3 passes x 4 coefficients = the remaining 12
vp8_dequant_dc_add_loop
smulbb r6, r4, r5 ; low halfword product
smultt r7, r4, r5 ; high halfword product
ldr r4, [r0, #4] ; preload the following input pair
ldr r5, [r1], #4 ; preload the following dq pair
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
subs r12, r12, #1 ; sets the flags consumed by ldrne/bne below
ldrne r4, [r0, #4] ; preload for the next pass; skipped on the last one
ldrne r5, [r1], #4
strh r6, [r0], #2
strh r7, [r0], #2
bne vp8_dequant_dc_add_loop
sub r0, r0, #32 ; rewind r0 to the start of input
mov r1, r0 ; first pass works in place: r1 = output = input
; First transform pass (same sequence as short_idct4x4llm_v6_dual):
; two packed 16-bit columns are processed per iteration.
ldr r3, cospi8sqrt2minus1 ; r3 = 0x4E7B
ldr r4, sinpi8sqrt2 ; r4 = 0x8A8C
ldr r6, [r0, #8] ; i5 | i4
mov r5, #2 ; two iterations
vp8_dequant_dc_idct_loop1_v6
ldr r12, [r0, #24] ; i13 | i12
ldr r14, [r0, #16] ; i9 | i8
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
subs r5, r5, #1 ; i--; flags are consumed by ldrne/bne below
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
ldr r11, [r0], #4 ; i1 | i0, then step to the next column pair
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c
uadd16 r6, r6, r10 ; d
uadd16 r10, r11, r14 ; a
usub16 r8, r11, r14 ; b
uadd16 r9, r10, r6 ; a+d
usub16 r10, r10, r6 ; a-d
uadd16 r6, r8, r7 ; b+c
usub16 r7, r8, r7 ; b-c
str r6, [r1, #8] ; o5 | o4
ldrne r6, [r0, #8] ; preload the next column pair; skipped on the last pass
str r7, [r1, #16] ; o9 | o8
str r10, [r1, #24] ; o13 | o12
str r9, [r1], #4 ; o1 | o0, then step the output pointer
bne vp8_dequant_dc_idct_loop1_v6
; Second pass: each iteration consumes four consecutive words (two rows of
; first-pass output) and emits two rows of pixels.
mov r5, #2 ; two iterations
sub r0, r1, #8 ; rewind to the start of the buffer
vp8_dequant_dc_idct_loop2_v6
ldr r6, [r0], #4 ; i1 | i0
ldr r7, [r0], #4 ; i3 | i2
ldr r8, [r0], #4 ; i5 | i4
ldr r9, [r0], #4 ; i7 | i6
smulwt r1, r3, r6 ; (ip[1] * cospi8sqrt2minus1) >> 16
smulwt r12, r4, r6 ; (ip[1] * sinpi8sqrt2) >> 16
smulwt lr, r3, r8 ; (ip[5] * cospi8sqrt2minus1) >> 16
smulwt r10, r4, r8 ; (ip[5] * sinpi8sqrt2) >> 16
pkhbt r11, r8, r6, lsl #16 ; i0 | i4
pkhbt r1, lr, r1, lsl #16 ; 1c | 5c
pkhbt r12, r10, r12, lsl #16 ; 1s | 5s
pkhtb r6, r6, r8, asr #16 ; i1 | i5
uadd16 r6, r1, r6 ; 1c+1 | 5c+5
pkhbt lr, r9, r7, lsl #16 ; i2 | i6
uadd16 r10, r11, lr ; a
usub16 lr, r11, lr ; b
pkhtb r8, r7, r9, asr #16 ; i3 | i7
subs r5, r5, #1 ; i--; no later instruction rewrites N/Z/C/V before bne
smulwt r1, r3, r8 ; (ip[3] * cospi8sqrt2minus1) >> 16
smulwb r7, r3, r8 ; (ip[7] * cospi8sqrt2minus1) >> 16
smulwt r11, r4, r8 ; (ip[3] * sinpi8sqrt2) >> 16
smulwb r9, r4, r8 ; (ip[7] * sinpi8sqrt2) >> 16
pkhbt r1, r7, r1, lsl #16 ; 3c | 7c
uadd16 r8, r1, r8 ; 3c+3 | 7c+7
pkhbt r11, r9, r11, lsl #16 ; 3s | 7s
usub16 r1, r12, r8 ; c
uadd16 r8, r11, r6 ; d
ldr r9, c0x00040004 ; rounding constant 4 | 4
ldr r12, [sp, #40] ; r12 = pitch (stacked arg, offset includes the local word)
uadd16 r6, r10, r8 ; a+d
usub16 r7, r10, r8 ; a-d
uadd16 r7, r7, r9 ; a-d+4 (outputs 3 | 7)
uadd16 r6, r6, r9 ; a+d+4 (outputs 0 | 4)
uadd16 r10, r14, r1 ; b+c
usub16 r1, r14, r1 ; b-c
uadd16 r10, r10, r9 ; b+c+4 (outputs 1 | 5)
uadd16 r1, r1, r9 ; b-c+4 (outputs 2 | 6)
ldr r11, [r2], r12 ; four predictor bytes for the first row; pred += pitch
mov r8, r7, asr #3 ; top halfword becomes o3 >> 3
pkhtb r9, r8, r10, asr #19 ; o3>>3 | o1>>3
mov r8, r1, asr #3 ; top halfword becomes o2 >> 3
pkhtb r8, r8, r6, asr #19 ; o2>>3 | o0>>3
uxtb16 lr, r11, ror #8 ; p3 | p1
qadd16 r9, r9, lr ; odd pixels + residual
uxtb16 lr, r11 ; p2 | p0
qadd16 r8, r8, lr ; even pixels + residual
usat16 r9, #8, r9 ; clamp to 0..255
usat16 r8, #8, r8
orr r9, r8, r9, lsl #8 ; interleave into four output bytes
ldr r11, [r2], r12 ; four predictor bytes for the second row; pred += pitch
ldr lr, [sp] ; lr = current dest pointer
ldr r12, [sp, #44] ; r12 = stride
mov r7, r7, lsl #16 ; move the second-row (low) halfwords to the top
mov r1, r1, lsl #16
mov r10, r10, lsl #16
mov r6, r6, lsl #16
mov r7, r7, asr #3 ; top halfword becomes o7 >> 3
pkhtb r7, r7, r10, asr #19 ; o7>>3 | o5>>3
mov r1, r1, asr #3 ; top halfword becomes o6 >> 3
pkhtb r1, r1, r6, asr #19 ; o6>>3 | o4>>3
uxtb16 r8, r11, ror #8 ; p3 | p1
qadd16 r7, r7, r8
uxtb16 r8, r11 ; p2 | p0
qadd16 r1, r1, r8
usat16 r7, #8, r7 ; clamp to 0..255
usat16 r1, #8, r1
orr r1, r1, r7, lsl #8 ; interleave into four output bytes
str r9, [lr], r12 ; store first row; dest += stride
str r1, [lr], r12 ; store second row; dest += stride
str lr, [sp] ; save the advanced dest pointer for the next pass
bne vp8_dequant_dc_idct_loop2_v6
; Clear the 32-byte coefficient buffer (inline equivalent of vpx_memset)
sub r0, r0, #32 ; rewind r0 to the start of input
add sp, sp, #4 ; release the local word
mov r12, #0
str r12, [r0]
str r12, [r0, #4]
str r12, [r0, #8]
str r12, [r0, #12]
str r12, [r0, #16]
str r12, [r0, #20]
str r12, [r0, #24]
str r12, [r0, #28]
ldmia sp!, {r4 - r11, pc} ; restore registers and return
ENDP ; |vp8_dequant_dc_idct_add_v6|
; Constant Pool
cospi8sqrt2minus1 DCD 0x00004E7B
sinpi8sqrt2 DCD 0x00008A8C
c0x00040004 DCD 0x00040004
END

View File

@ -0,0 +1,196 @@
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dequant_idct_add_v6|
AREA |.text|, CODE, READONLY
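;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride)
; r0 = input
; r1 = dq
; r2 = pred
; r3 = dest
; Stack arguments are listed as offsets valid after the 36-byte register push;
; the "+4" value applies once the function has reserved its one-word local.
; sp + 36 = pitch ; +4 = 40
; sp + 40 = stride ; +4 = 44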
|vp8_dequant_idct_add_v6| PROC
; Multiplies the 16 coefficients at input by dq in place, runs a two-pass 4x4
; inverse transform on the buffer, adds the rounded result to the predictor
; rows, saturates to 8 bits, writes the rows to dest, and finally zeroes the
; 32-byte input buffer.
stmdb sp!, {r4-r11, lr} ; push 9 registers (36 bytes)
ldr r4, [r0] ; input[1] | input[0]
ldr r5, [r1], #4 ; dq[1] | dq[0]
sub sp, sp, #4 ; reserve one word of stack for the dest pointer
str r3, [sp] ; [sp] = dest
mov r12, #4 ; 4 passes x 4 coefficients
vp8_dequant_add_loop
smulbb r6, r4, r5 ; low halfword product
smultt r7, r4, r5 ; high halfword product
ldr r4, [r0, #4] ; preload the following input pair
ldr r5, [r1], #4 ; preload the following dq pair
strh r6, [r0], #2 ; write the products back over the input
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
subs r12, r12, #1 ; sets the flags consumed by ldrne/bne below
ldrne r4, [r0, #4] ; preload for the next pass; skipped on the last one
ldrne r5, [r1], #4
strh r6, [r0], #2
strh r7, [r0], #2
bne vp8_dequant_add_loop
sub r0, r0, #32 ; rewind r0 to the start of input
mov r1, r0 ; first pass works in place: r1 = output = input
; First transform pass (same sequence as short_idct4x4llm_v6_dual):
; two packed 16-bit columns are processed per iteration.
ldr r3, cospi8sqrt2minus1 ; r3 = 0x4E7B
ldr r4, sinpi8sqrt2 ; r4 = 0x8A8C
ldr r6, [r0, #8] ; i5 | i4
mov r5, #2 ; two iterations
vp8_dequant_idct_loop1_v6
ldr r12, [r0, #24] ; i13 | i12
ldr r14, [r0, #16] ; i9 | i8
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
subs r5, r5, #1 ; i--; flags are consumed by ldrne/bne below
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
ldr r11, [r0], #4 ; i1 | i0, then step to the next column pair
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c
uadd16 r6, r6, r10 ; d
uadd16 r10, r11, r14 ; a
usub16 r8, r11, r14 ; b
uadd16 r9, r10, r6 ; a+d
usub16 r10, r10, r6 ; a-d
uadd16 r6, r8, r7 ; b+c
usub16 r7, r8, r7 ; b-c
str r6, [r1, #8] ; o5 | o4
ldrne r6, [r0, #8] ; preload the next column pair; skipped on the last pass
str r7, [r1, #16] ; o9 | o8
str r10, [r1, #24] ; o13 | o12
str r9, [r1], #4 ; o1 | o0, then step the output pointer
bne vp8_dequant_idct_loop1_v6
; Second pass: each iteration consumes four consecutive words (two rows of
; first-pass output) and emits two rows of pixels.
mov r5, #2 ; two iterations
sub r0, r1, #8 ; rewind to the start of the buffer
vp8_dequant_idct_loop2_v6
ldr r6, [r0], #4 ; i1 | i0
ldr r7, [r0], #4 ; i3 | i2
ldr r8, [r0], #4 ; i5 | i4
ldr r9, [r0], #4 ; i7 | i6
smulwt r1, r3, r6 ; (ip[1] * cospi8sqrt2minus1) >> 16
smulwt r12, r4, r6 ; (ip[1] * sinpi8sqrt2) >> 16
smulwt lr, r3, r8 ; (ip[5] * cospi8sqrt2minus1) >> 16
smulwt r10, r4, r8 ; (ip[5] * sinpi8sqrt2) >> 16
pkhbt r11, r8, r6, lsl #16 ; i0 | i4
pkhbt r1, lr, r1, lsl #16 ; 1c | 5c
pkhbt r12, r10, r12, lsl #16 ; 1s | 5s
pkhtb r6, r6, r8, asr #16 ; i1 | i5
uadd16 r6, r1, r6 ; 1c+1 | 5c+5
pkhbt lr, r9, r7, lsl #16 ; i2 | i6
uadd16 r10, r11, lr ; a
usub16 lr, r11, lr ; b
pkhtb r8, r7, r9, asr #16 ; i3 | i7
subs r5, r5, #1 ; i--; no later instruction rewrites N/Z/C/V before bne
smulwt r1, r3, r8 ; (ip[3] * cospi8sqrt2minus1) >> 16
smulwb r7, r3, r8 ; (ip[7] * cospi8sqrt2minus1) >> 16
smulwt r11, r4, r8 ; (ip[3] * sinpi8sqrt2) >> 16
smulwb r9, r4, r8 ; (ip[7] * sinpi8sqrt2) >> 16
pkhbt r1, r7, r1, lsl #16 ; 3c | 7c
uadd16 r8, r1, r8 ; 3c+3 | 7c+7
pkhbt r11, r9, r11, lsl #16 ; 3s | 7s
usub16 r1, r12, r8 ; c
uadd16 r8, r11, r6 ; d
ldr r9, c0x00040004 ; rounding constant 4 | 4
ldr r12, [sp, #40] ; r12 = pitch (stacked arg, offset includes the local word)
uadd16 r6, r10, r8 ; a+d
usub16 r7, r10, r8 ; a-d
uadd16 r7, r7, r9 ; a-d+4 (outputs 3 | 7)
uadd16 r6, r6, r9 ; a+d+4 (outputs 0 | 4)
uadd16 r10, r14, r1 ; b+c
usub16 r1, r14, r1 ; b-c
uadd16 r10, r10, r9 ; b+c+4 (outputs 1 | 5)
uadd16 r1, r1, r9 ; b-c+4 (outputs 2 | 6)
ldr r11, [r2], r12 ; four predictor bytes for the first row; pred += pitch
mov r8, r7, asr #3 ; top halfword becomes o3 >> 3
pkhtb r9, r8, r10, asr #19 ; o3>>3 | o1>>3
mov r8, r1, asr #3 ; top halfword becomes o2 >> 3
pkhtb r8, r8, r6, asr #19 ; o2>>3 | o0>>3
uxtb16 lr, r11, ror #8 ; p3 | p1
qadd16 r9, r9, lr ; odd pixels + residual
uxtb16 lr, r11 ; p2 | p0
qadd16 r8, r8, lr ; even pixels + residual
usat16 r9, #8, r9 ; clamp to 0..255
usat16 r8, #8, r8
orr r9, r8, r9, lsl #8 ; interleave into four output bytes
ldr r11, [r2], r12 ; four predictor bytes for the second row; pred += pitch
ldr lr, [sp] ; lr = current dest pointer
ldr r12, [sp, #44] ; r12 = stride
mov r7, r7, lsl #16 ; move the second-row (low) halfwords to the top
mov r1, r1, lsl #16
mov r10, r10, lsl #16
mov r6, r6, lsl #16
mov r7, r7, asr #3 ; top halfword becomes o7 >> 3
pkhtb r7, r7, r10, asr #19 ; o7>>3 | o5>>3
mov r1, r1, asr #3 ; top halfword becomes o6 >> 3
pkhtb r1, r1, r6, asr #19 ; o6>>3 | o4>>3
uxtb16 r8, r11, ror #8 ; p3 | p1
qadd16 r7, r7, r8
uxtb16 r8, r11 ; p2 | p0
qadd16 r1, r1, r8
usat16 r7, #8, r7 ; clamp to 0..255
usat16 r1, #8, r1
orr r1, r1, r7, lsl #8 ; interleave into four output bytes
str r9, [lr], r12 ; store first row; dest += stride
str r1, [lr], r12 ; store second row; dest += stride
str lr, [sp] ; save the advanced dest pointer for the next pass
bne vp8_dequant_idct_loop2_v6
; Clear the 32-byte coefficient buffer (inline equivalent of vpx_memset)
sub r0, r0, #32 ; rewind r0 to the start of input
add sp, sp, #4 ; release the local word
mov r12, #0
str r12, [r0]
str r12, [r0, #4]
str r12, [r0, #8]
str r12, [r0, #12]
str r12, [r0, #16]
str r12, [r0, #20]
str r12, [r0, #24]
str r12, [r0, #28]
ldmia sp!, {r4 - r11, pc} ; restore registers and return
ENDP ; |vp8_dequant_idct_add_v6|
; Constant Pool
cospi8sqrt2minus1 DCD 0x00004E7B
sinpi8sqrt2 DCD 0x00008A8C
c0x00040004 DCD 0x00040004
END

View File

@ -1,203 +0,0 @@
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_dc_idct_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc)
|vp8_dequant_dc_idct_v6| PROC
; Multiplies the coefficients at input by dq in place (coefficient 0 is
; replaced by the Dc argument instead), runs the two-pass 4x4 inverse
; transform writing 16-bit results to output with the given pitch, then
; zeroes the 32-byte input buffer.
; r0 = input, r1 = dq, r2 = output, r3 = pitch, first stacked argument = Dc
stmdb sp!, {r4-r11, lr} ; push 9 registers (36 bytes)
ldr r6, [sp, #36] ;load Dc
ldr r4, [r0] ;input
ldr r5, [r1], #4 ;dq
sub sp, sp, #4 ; reserve one word of stack
str r0, [sp] ; keep the input pointer for the final clear
smultt r7, r4, r5 ; input[1] * dq[1]
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2 ; input[0] = Dc
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
mov r12, #3 ; 3 passes x 4 coefficients = the remaining 12
dequant_dc_idct_loop
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
subs r12, r12, #1 ; sets the flags consumed by ldrne/bne below
ldrne r4, [r0, #4]
ldrne r5, [r1], #4
strh r6, [r0], #2
strh r7, [r0], #2
bne dequant_dc_idct_loop
sub r0, r0, #32 ; rewind r0 to the start of input
mov r1, r2 ; r1 = output
mov r2, r3 ; r2 = pitch
; short_idct4x4llm_v6_dual
mov r3, #0x00004E00 ; cos
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
mov r5, #0x2 ; i=2 i
loop1_dual_11
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
subs r5, r5, #0x1 ; i-- --
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c c
uadd16 r6, r6, r10 ; d d
uadd16 r10, r11, r14 ; a a
usub16 r8, r11, r14 ; b b
uadd16 r9, r10, r6 ; a+d a+d
usub16 r10, r10, r6 ; a-d a-d
uadd16 r6, r8, r7 ; b+c b+c
usub16 r7, r8, r7 ; b-c b-c
str r6, [r1, r2] ; o5 | o4
add r6, r2, r2 ; pitch * 2 p2
str r7, [r1, r6] ; o9 | o8
add r6, r6, r2 ; pitch * 3 p3
str r10, [r1, r6] ; o13 | o12
str r9, [r1], #0x4 ; o1 | o0 ++
bne loop1_dual_11 ;
mov r5, #0x2 ; i=2 i
sub r0, r1, #8 ; reset input/output i/o
loop2_dual_22
ldr r6, [r0, r2] ; i5 | i4 5|4
ldr r1, [r0] ; i1 | i0 1|0
ldr r12, [r0, #0x4] ; i3 | i2 3|2
add r14, r2, #0x4 ; pitch + 2 p+2
ldr r14, [r0, r14] ; i7 | i6 7|6
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
uadd16 r10, r11, r9 ; a a
usub16 r9, r11, r9 ; b b
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
subs r5, r5, #0x1 ; i-- --
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
usub16 r12, r8, r6 ; c (o1 | o5) c
uadd16 r6, r11, r1 ; d (o3 | o7) d
uadd16 r7, r10, r6 ; a+d a+d
mov r8, #0x4 ; set up 4's 4
orr r8, r8, #0x40000 ; 4|4
usub16 r6, r10, r6 ; a-d a-d
uadd16 r6, r6, r8 ; a-d+4 3|7
uadd16 r7, r7, r8 ; a+d+4 0|4
uadd16 r10, r9, r12 ; b+c b+c
usub16 r1, r9, r12 ; b-c b-c
uadd16 r10, r10, r8 ; b+c+4 1|5
uadd16 r1, r1, r8 ; b-c+4 2|6
mov r8, r10, asr #19 ; o1 >> 3
strh r8, [r0, #2] ; o1
mov r8, r1, asr #19 ; o2 >> 3
strh r8, [r0, #4] ; o2
mov r8, r6, asr #19 ; o3 >> 3
strh r8, [r0, #6] ; o3
mov r8, r7, asr #19 ; o0 >> 3
strh r8, [r0], r2 ; o0 +p
sxth r10, r10 ;
mov r8, r10, asr #3 ; o5 >> 3
strh r8, [r0, #2] ; o5
sxth r1, r1 ;
mov r8, r1, asr #3 ; o6 >> 3
strh r8, [r0, #4] ; o6
sxth r6, r6 ;
mov r8, r6, asr #3 ; o7 >> 3
strh r8, [r0, #6] ; o7
sxth r7, r7 ;
mov r8, r7, asr #3 ; o4 >> 3
strh r8, [r0], r2 ; o4 +p
;;;;; subs r5, r5, #0x1 ; i-- --
bne loop2_dual_22 ; flags still come from the subs earlier in the loop body
;vpx_memset: clear the 32-byte input buffer
ldr r0, [sp] ; reload the saved input pointer
add sp, sp, #4 ; release the local word
mov r12, #0
str r12, [r0]
str r12, [r0, #4]
str r12, [r0, #8]
str r12, [r0, #12]
str r12, [r0, #16]
str r12, [r0, #20]
str r12, [r0, #24]
str r12, [r0, #28]
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP ;|vp8_dequant_dc_idct_v6|
END

View File

@ -1,184 +0,0 @@
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_idct_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch)
|vp8_dequant_idct_v6| PROC
; Multiplies the 16 coefficients at input by dq in place, runs the two-pass
; 4x4 inverse transform writing 16-bit results to output with the given
; pitch, then zeroes the 32-byte input buffer.
; r0 = input, r1 = dq, r2 = output, r3 = pitch
stmdb sp!, {r4-r11, lr} ; push 9 registers (36 bytes)
ldr r4, [r0] ;input
ldr r5, [r1], #4 ;dq
sub sp, sp, #4 ; reserve one word of stack
str r0, [sp] ; keep the input pointer for the final clear
mov r12, #4 ; 4 passes x 4 coefficients
dequant_idct_loop
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
subs r12, r12, #1 ; sets the flags consumed by ldrne/bne below
ldrne r4, [r0, #4]
ldrne r5, [r1], #4
strh r6, [r0], #2
strh r7, [r0], #2
bne dequant_idct_loop
sub r0, r0, #32 ; rewind r0 to the start of input
mov r1, r2 ; r1 = output
mov r2, r3 ; r2 = pitch
; short_idct4x4llm_v6_dual
mov r3, #0x00004E00 ; cos
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
mov r5, #0x2 ; i=2 i
loop1_dual_1
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
subs r5, r5, #0x1 ; i-- --
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c c
uadd16 r6, r6, r10 ; d d
uadd16 r10, r11, r14 ; a a
usub16 r8, r11, r14 ; b b
uadd16 r9, r10, r6 ; a+d a+d
usub16 r10, r10, r6 ; a-d a-d
uadd16 r6, r8, r7 ; b+c b+c
usub16 r7, r8, r7 ; b-c b-c
str r6, [r1, r2] ; o5 | o4
add r6, r2, r2 ; pitch * 2 p2
str r7, [r1, r6] ; o9 | o8
add r6, r6, r2 ; pitch * 3 p3
str r10, [r1, r6] ; o13 | o12
str r9, [r1], #0x4 ; o1 | o0 ++
bne loop1_dual_1 ;
mov r5, #0x2 ; i=2 i
sub r0, r1, #8 ; reset input/output i/o
loop2_dual_2
ldr r6, [r0, r2] ; i5 | i4 5|4
ldr r1, [r0] ; i1 | i0 1|0
ldr r12, [r0, #0x4] ; i3 | i2 3|2
add r14, r2, #0x4 ; pitch + 2 p+2
ldr r14, [r0, r14] ; i7 | i6 7|6
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
uadd16 r10, r11, r9 ; a a
usub16 r9, r11, r9 ; b b
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
subs r5, r5, #0x1 ; i-- --
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
usub16 r12, r8, r6 ; c (o1 | o5) c
uadd16 r6, r11, r1 ; d (o3 | o7) d
uadd16 r7, r10, r6 ; a+d a+d
mov r8, #0x4 ; set up 4's 4
orr r8, r8, #0x40000 ; 4|4
usub16 r6, r10, r6 ; a-d a-d
uadd16 r6, r6, r8 ; a-d+4 3|7
uadd16 r7, r7, r8 ; a+d+4 0|4
uadd16 r10, r9, r12 ; b+c b+c
usub16 r1, r9, r12 ; b-c b-c
uadd16 r10, r10, r8 ; b+c+4 1|5
uadd16 r1, r1, r8 ; b-c+4 2|6
mov r8, r10, asr #19 ; o1 >> 3
strh r8, [r0, #2] ; o1
mov r8, r1, asr #19 ; o2 >> 3
strh r8, [r0, #4] ; o2
mov r8, r6, asr #19 ; o3 >> 3
strh r8, [r0, #6] ; o3
mov r8, r7, asr #19 ; o0 >> 3
strh r8, [r0], r2 ; o0 +p
sxth r10, r10 ;
mov r8, r10, asr #3 ; o5 >> 3
strh r8, [r0, #2] ; o5
sxth r1, r1 ;
mov r8, r1, asr #3 ; o6 >> 3
strh r8, [r0, #4] ; o6
sxth r6, r6 ;
mov r8, r6, asr #3 ; o7 >> 3
strh r8, [r0, #6] ; o7
sxth r7, r7 ;
mov r8, r7, asr #3 ; o4 >> 3
strh r8, [r0], r2 ; o4 +p
;;;;; subs r5, r5, #0x1 ; i-- --
bne loop2_dual_2 ; flags still come from the subs earlier in the loop body
;
;vpx_memset: clear the 32-byte input buffer
ldr r0, [sp] ; reload the saved input pointer
add sp, sp, #4 ; release the local word
mov r12, #0
str r12, [r0]
str r12, [r0, #4]
str r12, [r0, #8]
str r12, [r0, #12]
str r12, [r0, #16]
str r12, [r0, #20]
str r12, [r0, #24]
str r12, [r0, #28]
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP ;|vp8_dequant_idct_v6|
END

View File

@ -14,14 +14,32 @@
#if HAVE_ARMV6
extern prototype_dequant_block(vp8_dequantize_b_v6);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_v6
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
#undef vp8_dequant_dc_idct_add
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
#endif
#if HAVE_ARMV7
extern prototype_dequant_block(vp8_dequantize_b_neon);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
#undef vp8_dequant_dc_idct_add
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
#endif
#endif

View File

@ -9,31 +9,43 @@
;
EXPORT |vp8_dequant_dc_idct_neon|
EXPORT |vp8_dequant_dc_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc);
;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride,
; int Dc);
; r0 short *input,
; r1 short *dq,
; r2 short *output,
; r3 int pitch,
; (stack) int Dc
|vp8_dequant_dc_idct_neon| PROC
; r2 unsigned char *pred
; r3 unsigned char *dest
; sp int pitch
; sp+4 int stride
; sp+8 int Dc
|vp8_dequant_dc_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
ldr r1, [sp] ;load Dc from stack
ldr r1, [sp, #8] ;load Dc from stack
ldr r12, _dcidct_coeff_
ldr r12, _CONSTANTS_
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
vmov.16 d2[0], r1
ldr r1, [sp] ; pitch
vld1.32 {d14[0]}, [r2], r1
vld1.32 {d14[1]}, [r2], r1
vld1.32 {d15[0]}, [r2], r1
vld1.32 {d15[1]}, [r2]
ldr r1, [sp, #4] ; stride
;|short_idct4x4llm_neon| PROC
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
@ -47,14 +59,9 @@
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
;d6 - c1:temp1
;d7 - d1:temp2
;d8 - d1:temp1
;d9 - c1:temp2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
@ -83,7 +90,7 @@
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
@ -101,34 +108,29 @@
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
add r1, r2, r3
add r12, r1, r3
add r0, r12, r3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vst1.16 {d2}, [r2]
vst1.16 {d3}, [r1]
vst1.16 {d4}, [r12]
vst1.16 {d5}, [r0]
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r3], r1
vst1.32 {d0[1]}, [r3], r1
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r3]
bx lr
ENDP
ENDP ; |vp8_dequant_dc_idct_add_neon|
;-----------------
AREA dcidct4x4_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_dcidct_coeff_
DCD dcidct_coeff
dcidct_coeff
DCD 0x4e7b4e7b, 0x8a8c8a8c
;20091, 20091, 35468, 35468
; Constant Pool
_CONSTANTS_ DCD cospi8sqrt2minus1
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

View File

@ -9,22 +9,33 @@
;
EXPORT |vp8_dequant_idct_neon|
EXPORT |vp8_dequant_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch);
;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride)
; r0 short *input,
; r1 short *dq,
; r2 short *output,
; r3 int pitch,
|vp8_dequant_idct_neon| PROC
; r2 unsigned char *pred
; r3 unsigned char *dest
; sp int pitch
; sp+4 int stride
|vp8_dequant_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
ldr r1, [sp] ; pitch
vld1.32 {d14[0]}, [r2], r1
vld1.32 {d14[1]}, [r2], r1
vld1.32 {d15[0]}, [r2], r1
vld1.32 {d15[1]}, [r2]
ldr r12, _didct_coeff_
ldr r1, [sp, #4] ; stride
ldr r12, _CONSTANTS_
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
@ -42,14 +53,9 @@
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
;d6 - c1:temp1
;d7 - d1:temp2
;d8 - d1:temp1
;d9 - c1:temp2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
@ -78,7 +84,7 @@
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
@ -96,34 +102,29 @@
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
add r1, r2, r3
add r12, r1, r3
add r0, r12, r3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vst1.16 {d2}, [r2]
vst1.16 {d3}, [r1]
vst1.16 {d4}, [r12]
vst1.16 {d5}, [r0]
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r3], r1
vst1.32 {d0[1]}, [r3], r1
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r3]
bx lr
ENDP
ENDP ; |vp8_dequant_idct_add_neon|
;-----------------
AREA didct4x4_dat, DATA, READWRITE ;read/write by default
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_didct_coeff_
DCD didct_coeff
didct_coeff
DCD 0x4e7b4e7b, 0x8a8c8a8c
;20091, 20091, 35468, 35468
; Constant Pool
_CONSTANTS_ DCD cospi8sqrt2minus1
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

View File

@ -272,8 +272,10 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
if (b->eob > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_dc_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride,
xd->block[24].diff[i]);
DEQUANT_INVOKE(&pbi->dequant, dc_idct_add)
(b->qcoeff, &b->dequant[0][0], b->predictor,
*(b->base_dst) + b->dst, 16, b->dst_stride,
xd->block[24].diff[i]);
}
else
{

View File

@ -32,10 +32,10 @@ void vp8_dequantize_b_c(BLOCKD *d)
}
}
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride)
{
// output needs to be at least pitch * 4 for vp8_short_idct4x4llm_c to work properly
short output[16*4];
short output[16];
short *diff_ptr = output;
int r, c;
int i;
@ -45,7 +45,8 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsign
input[i] = dq[i] * input[i];
}
vp8_short_idct4x4llm_c(input, output, pitch*2);
// the idct halves ( >> 1) the pitch
vp8_short_idct4x4llm_c(input, output, 4 << 1);
vpx_memset(input, 0, 32);
@ -65,16 +66,17 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsign
}
dest += stride;
diff_ptr += pitch;
diff_ptr += 4;
pred += pitch;
}
}
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
unsigned char *dest, int pitch, int stride,
int Dc)
{
int i;
// output needs to be at least pitch * 4 for vp8_short_idct4x4llm_c to work properly
short output[16*4];
short output[16];
short *diff_ptr = output;
int r, c;
@ -85,7 +87,8 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, uns
input[i] = dq[i] * input[i];
}
vp8_short_idct4x4llm_c(input, output, pitch*2);
// the idct halves ( >> 1) the pitch
vp8_short_idct4x4llm_c(input, output, 4 << 1);
vpx_memset(input, 0, 32);
@ -105,7 +108,7 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, uns
}
dest += stride;
diff_ptr += pitch;
diff_ptr += 4;
pred += pitch;
}
}

View File

@ -21,7 +21,7 @@
unsigned char *pred, unsigned char *output, \
int pitch, int stride)
#define prototype_dequant_idct_dc_add(sym) \
#define prototype_dequant_dc_idct_add(sym) \
void sym(short *input, short *dq, \
unsigned char *pred, unsigned char *output, \
int pitch, int stride, \
@ -45,21 +45,21 @@ extern prototype_dequant_block(vp8_dequant_block);
#endif
extern prototype_dequant_idct_add(vp8_dequant_idct_add);
#ifndef vp8_dequant_idct_dc_add
#define vp8_dequant_idct_dc_add vp8_dequant_dc_idct_add_c
#ifndef vp8_dequant_dc_idct_add
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
#endif
extern prototype_dequant_idct_dc_add(vp8_dequant_idct_dc_add);
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
typedef prototype_dequant_idct_dc_add((*vp8_dequant_idct_dc_add_fn_t));
typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
typedef struct
{
vp8_dequant_block_fn_t block;
vp8_dequant_idct_add_fn_t idct_add;
vp8_dequant_idct_dc_add_fn_t idct_dc_add;
vp8_dequant_dc_idct_add_fn_t dc_idct_add;
} vp8_dequant_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT

View File

@ -22,7 +22,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
pbi->mb.rtcd = &pbi->common.rtcd;
pbi->dequant.block = vp8_dequantize_b_c;
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
pbi->dequant.idct_dc_add = vp8_dequant_dc_idct_add_c;
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
pbi->dboolhuff.start = vp8dx_start_decode_c;
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
#if 0 //For use with RTCD, when implemented

View File

@ -22,7 +22,7 @@
#if HAVE_MMX
extern prototype_dequant_block(vp8_dequantize_b_mmx);
extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
extern prototype_dequant_idct_dc_add(vp8_dequant_dc_idct_add_mmx);
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
@ -30,10 +30,10 @@ extern prototype_dequant_idct_dc_add(vp8_dequant_dc_idct_add_mmx);
#define vp8_dequant_block vp8_dequantize_b_mmx
#undef vp8_dequant_idct_add
#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
/* Static (non-RTCD) binding. This must name the function declared earlier in
 * this header with prototype_dequant_idct_add, i.e. vp8_dequant_idct_add_mmx;
 * no symbol called vp8_dequant_idct_mmx is declared. */
#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
#undef vp8_dequant_idct_dc
#define vp8_dequant_idct_add_dc vp8_dequant_dc_idct_add_mmx
#undef vp8_dequant_dc_idct_add
/* Static (non-RTCD) binding. This must name the function declared earlier in
 * this header with prototype_dequant_dc_idct_add, i.e.
 * vp8_dequant_dc_idct_add_mmx; no symbol called vp8_dequant_dc_idct_mmx is
 * declared. */
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
#endif
#endif

View File

@ -44,7 +44,7 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
{
pbi->dequant.block = vp8_dequantize_b_mmx;
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
pbi->dequant.idct_dc_add = vp8_dequant_dc_idct_add_mmx;
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
}
#endif

View File

@ -125,6 +125,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/systemdependent.c
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/vpx_asm_offsets.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/filter_c.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/idctllm.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/recon.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/reconintra4x4.c
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/generic/systemdependent.c
@ -134,6 +135,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dc_only_idct_add_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
@ -150,6 +152,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict16x16_neon$(ASM
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)

View File

@ -23,12 +23,12 @@ VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/generic/dsystemdependent.c
#File list for armv6
# decoder
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantdcidct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantidct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
#File list for neon
# decoder
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantdcidct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantidct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)