update arm idct functions
Jeff Muizelaar posted some changes to the idct/reconstruction C code. This is the equivalent update for the ARM assembly. It shows a good boost on v6 and a minor boost on NEON. Here are some numbers for highway in QCIF, 2641 frames: HEAD neon: ~161 fps; new neon: ~162 fps; HEAD v6: ~102 fps; new v6: ~106 fps. The following functions have been updated for armv6 and neon: vp8_dc_only_idct_add, vp8_dequant_idct_add, vp8_dequant_dc_idct_add. Conflicts: vp8/decoder/arm/armv6/dequantdcidct_v6.asm, vp8/decoder/arm/armv6/dequantidct_v6.asm. Resolved by removing these files. When I rewrote the functions, I also moved the files to dequant_dc_idct_v6.asm/dequant_idct_v6.asm. Change-Id: Ie3300df824d52474eca1a5134cf22d8b7809a5d4
This commit is contained in:
67
vp8/common/arm/armv6/dc_only_idct_add_v6.asm
Normal file
67
vp8/common/arm/armv6/dc_only_idct_add_v6.asm
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
;
|
||||||
|
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||||
|
;
|
||||||
|
; Use of this source code is governed by a BSD-style license and patent
|
||||||
|
; grant that can be found in the LICENSE file in the root of the source
|
||||||
|
; tree. All contributing project authors may be found in the AUTHORS
|
||||||
|
; file in the root of the source tree.
|
||||||
|
;
|
||||||
|
|
||||||
|
EXPORT |vp8_dc_only_idct_add_v6|
|
||||||
|
|
||||||
|
AREA |.text|, CODE, READONLY
|
||||||
|
|
||||||
|
; Adds one DC residual to a 4x4 block of predictor bytes and stores the
; saturated sums: dst = clamp(pred + ((input_dc + 4) >> 3), 0, 255).
; Two rows are processed at a time, twice.
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
|
||||||
|
; unsigned char *dst_ptr, int pitch, int stride)
|
||||||
|
; r0 input_dc
|
||||||
|
; r1 pred_ptr
|
||||||
|
; r2 dst_ptr
|
||||||
|
; r3 pitch (byte step between predictor rows)
|
||||||
|
; sp stride (byte step between destination rows; read from [sp, #20] after the push)
|
||||||
|
|
||||||
|
|vp8_dc_only_idct_add_v6| PROC
|
||||||
|
stmdb sp!, {r4 - r7, lr} ; save 5 registers (20 bytes)
|
||||||
|
|
||||||
|
add r0, r0, #4 ; input_dc += 4
|
||||||
|
ldr r12, c0x0000FFFF ; mask that keeps the low halfword
|
||||||
|
ldr r4, [r1], r3 ; 4 predictor bytes of row 0, pred_ptr += pitch
|
||||||
|
ldr r6, [r1], r3 ; 4 predictor bytes of row 1, pred_ptr += pitch
|
||||||
|
and r0, r12, r0, asr #3 ; a1 = ((input_dc + 4) >> 3) & 0xFFFF
|
||||||
|
ldr lr, [sp, #20] ; stride: first stack argument, above the 20 bytes pushed
|
||||||
|
orr r0, r0, r0, lsl #16 ; a1 | a1
|
||||||
|
|
||||||
|
uxtab16 r5, r0, r4 ; a1+2 | a1+0
|
||||||
|
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
|
||||||
|
uxtab16 r7, r0, r6 ; same for row 1: bytes 2 and 0
|
||||||
|
uxtab16 r6, r0, r6, ror #8 ; row 1: bytes 3 and 1
|
||||||
|
usat16 r5, #8, r5 ; clamp each halfword to 0..255
|
||||||
|
usat16 r4, #8, r4
|
||||||
|
usat16 r7, #8, r7
|
||||||
|
usat16 r6, #8, r6
|
||||||
|
orr r5, r5, r4, lsl #8 ; interleave even/odd bytes back into one word (row 0)
|
||||||
|
orr r7, r7, r6, lsl #8 ; row 1
|
||||||
|
ldr r4, [r1], r3 ; 4 predictor bytes of row 2, pred_ptr += pitch
|
||||||
|
ldr r6, [r1] ; 4 predictor bytes of row 3
|
||||||
|
str r5, [r2], lr ; store row 0, dst_ptr += stride
|
||||||
|
str r7, [r2], lr ; store row 1, dst_ptr += stride
|
||||||
|
|
||||||
|
uxtab16 r5, r0, r4 ; rows 2 and 3: same add / clamp / pack sequence
|
||||||
|
uxtab16 r4, r0, r4, ror #8
|
||||||
|
uxtab16 r7, r0, r6
|
||||||
|
uxtab16 r6, r0, r6, ror #8
|
||||||
|
usat16 r5, #8, r5
|
||||||
|
usat16 r4, #8, r4
|
||||||
|
usat16 r7, #8, r7
|
||||||
|
usat16 r6, #8, r6
|
||||||
|
orr r5, r5, r4, lsl #8
|
||||||
|
orr r7, r7, r6, lsl #8
|
||||||
|
str r5, [r2], lr ; store row 2, dst_ptr += stride
|
||||||
|
str r7, [r2] ; store row 3
|
||||||
|
|
||||||
|
ldmia sp!, {r4 - r7, pc} ; restore registers and return (saved lr popped into pc)
|
||||||
|
|
||||||
|
ENDP ; |vp8_dc_only_idct_add_v6|
|
||||||
|
|
||||||
|
; Constant Pool
|
||||||
|
c0x0000FFFF DCD 0x0000FFFF
|
||||||
|
END
|
||||||
@@ -15,8 +15,6 @@
|
|||||||
EXPORT |vp8_short_idct4x4llm_v6_scott|
|
EXPORT |vp8_short_idct4x4llm_v6_scott|
|
||||||
EXPORT |vp8_short_idct4x4llm_v6_dual|
|
EXPORT |vp8_short_idct4x4llm_v6_dual|
|
||||||
|
|
||||||
EXPORT |vp8_dc_only_idct_armv6|
|
|
||||||
|
|
||||||
AREA |.text|, CODE, READONLY
|
AREA |.text|, CODE, READONLY
|
||||||
|
|
||||||
;********************************************************************************
|
;********************************************************************************
|
||||||
@@ -344,34 +342,4 @@ loop2_dual
|
|||||||
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
|
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
|
||||||
ENDP
|
ENDP
|
||||||
|
|
||||||
|
|
||||||
; sjl added 10/17/08
|
|
||||||
;void dc_only_idct_armv6(short input_dc, short *output, int pitch)
|
|
||||||
|vp8_dc_only_idct_armv6| PROC
|
|
||||||
stmdb sp!, {r4 - r6, lr}
|
|
||||||
|
|
||||||
add r0, r0, #0x4
|
|
||||||
add r4, r1, r2 ; output + shortpitch
|
|
||||||
mov r0, r0, ASR #0x3 ;aka a1
|
|
||||||
add r5, r1, r2, LSL #1 ; output + shortpitch * 2
|
|
||||||
pkhbt r0, r0, r0, lsl #16 ; a1 | a1
|
|
||||||
add r6, r5, r2 ; output + shortpitch * 3
|
|
||||||
|
|
||||||
str r0, [r1, #0]
|
|
||||||
str r0, [r1, #4]
|
|
||||||
|
|
||||||
str r0, [r4, #0]
|
|
||||||
str r0, [r4, #4]
|
|
||||||
|
|
||||||
str r0, [r5, #0]
|
|
||||||
str r0, [r5, #4]
|
|
||||||
|
|
||||||
str r0, [r6, #0]
|
|
||||||
str r0, [r6, #4]
|
|
||||||
|
|
||||||
|
|
||||||
ldmia sp!, {r4 - r6, pc}
|
|
||||||
|
|
||||||
ENDP ; |vp8_dc_only_idct_armv6|
|
|
||||||
|
|
||||||
END
|
END
|
||||||
|
|||||||
@@ -8,8 +8,8 @@
|
|||||||
; be found in the AUTHORS file in the root of the source tree.
|
; be found in the AUTHORS file in the root of the source tree.
|
||||||
;
|
;
|
||||||
|
|
||||||
EXPORT |vp8_short_inv_walsh4x4_armv6|
|
EXPORT |vp8_short_inv_walsh4x4_v6|
|
||||||
EXPORT |vp8_short_inv_walsh4x4_1_armv6|
|
EXPORT |vp8_short_inv_walsh4x4_1_v6|
|
||||||
|
|
||||||
ARM
|
ARM
|
||||||
REQUIRE8
|
REQUIRE8
|
||||||
@@ -17,8 +17,8 @@
|
|||||||
|
|
||||||
AREA |.text|, CODE, READONLY ; name this block of code
|
AREA |.text|, CODE, READONLY ; name this block of code
|
||||||
|
|
||||||
;short vp8_short_inv_walsh4x4_armv6(short *input, short *output)
|
;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
|
||||||
|vp8_short_inv_walsh4x4_armv6| PROC
|
|vp8_short_inv_walsh4x4_v6| PROC
|
||||||
|
|
||||||
stmdb sp!, {r4 - r11, lr}
|
stmdb sp!, {r4 - r11, lr}
|
||||||
|
|
||||||
@@ -123,11 +123,11 @@
|
|||||||
str r5, [r1]
|
str r5, [r1]
|
||||||
|
|
||||||
ldmia sp!, {r4 - r11, pc}
|
ldmia sp!, {r4 - r11, pc}
|
||||||
ENDP ; |vp8_short_inv_walsh4x4_armv6|
|
ENDP ; |vp8_short_inv_walsh4x4_v6|
|
||||||
|
|
||||||
|
|
||||||
;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output)
|
;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
|
||||||
|vp8_short_inv_walsh4x4_1_armv6| PROC
|
|vp8_short_inv_walsh4x4_1_v6| PROC
|
||||||
|
|
||||||
ldrsh r2, [r0] ; [0]
|
ldrsh r2, [r0] ; [0]
|
||||||
add r2, r2, #3 ; [0] + 3
|
add r2, r2, #3 ; [0] + 3
|
||||||
@@ -145,7 +145,7 @@
|
|||||||
str r2, [r1]
|
str r2, [r1]
|
||||||
|
|
||||||
bx lr
|
bx lr
|
||||||
ENDP ; |vp8_short_inv_walsh4x4_1_armv6|
|
ENDP ; |vp8_short_inv_walsh4x4_1_v6|
|
||||||
|
|
||||||
; Constant Pool
|
; Constant Pool
|
||||||
c0x00030003 DCD 0x00030003
|
c0x00030003 DCD 0x00030003
|
||||||
|
|||||||
@@ -15,8 +15,9 @@
|
|||||||
#if HAVE_ARMV6
|
#if HAVE_ARMV6
|
||||||
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
|
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
|
||||||
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
|
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
|
||||||
extern prototype_second_order(vp8_short_inv_walsh4x4_1_armv6);
|
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
|
||||||
extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
|
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
|
||||||
|
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
|
||||||
|
|
||||||
#undef vp8_idct_idct1
|
#undef vp8_idct_idct1
|
||||||
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
|
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
|
||||||
@@ -24,16 +25,20 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_armv6);
|
|||||||
#undef vp8_idct_idct16
|
#undef vp8_idct_idct16
|
||||||
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
|
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
|
||||||
|
|
||||||
|
#undef vp8_idct_idct1_scalar_add
|
||||||
|
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6
|
||||||
|
|
||||||
#undef vp8_idct_iwalsh1
|
#undef vp8_idct_iwalsh1
|
||||||
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6
|
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6
|
||||||
|
|
||||||
#undef vp8_idct_iwalsh16
|
#undef vp8_idct_iwalsh16
|
||||||
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6
|
#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if HAVE_ARMV7
|
#if HAVE_ARMV7
|
||||||
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
|
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
|
||||||
extern prototype_idct(vp8_short_idct4x4llm_neon);
|
extern prototype_idct(vp8_short_idct4x4llm_neon);
|
||||||
|
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
|
||||||
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
|
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
|
||||||
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
|
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
|
||||||
|
|
||||||
@@ -43,6 +48,9 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
|
|||||||
#undef vp8_idct_idct16
|
#undef vp8_idct_idct16
|
||||||
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
|
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
|
||||||
|
|
||||||
|
#undef vp8_idct_idct1_scalar_add
|
||||||
|
#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon
|
||||||
|
|
||||||
#undef vp8_idct_iwalsh1
|
#undef vp8_idct_iwalsh1
|
||||||
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
|
#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon
|
||||||
|
|
||||||
|
|||||||
49
vp8/common/arm/neon/dc_only_idct_add_neon.asm
Normal file
49
vp8/common/arm/neon/dc_only_idct_add_neon.asm
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
;
|
||||||
|
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||||
|
;
|
||||||
|
; Use of this source code is governed by a BSD-style license and patent
|
||||||
|
; grant that can be found in the LICENSE file in the root of the source
|
||||||
|
; tree. All contributing project authors may be found in the AUTHORS
|
||||||
|
; file in the root of the source tree.
|
||||||
|
;
|
||||||
|
|
||||||
|
|
||||||
|
EXPORT |vp8_dc_only_idct_add_neon|
|
||||||
|
ARM
|
||||||
|
REQUIRE8
|
||||||
|
PRESERVE8
|
||||||
|
|
||||||
|
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||||
|
; Adds one DC residual to a 4x4 block of predictor bytes and stores the
; saturated sums: dst = clamp(pred + ((input_dc + 4) >> 3), 0, 255).
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
|
||||||
|
; unsigned char *dst_ptr, int pitch, int stride)
|
||||||
|
; r0 input_dc
|
||||||
|
; r1 pred_ptr
|
||||||
|
; r2 dst_ptr
|
||||||
|
; r3 pitch (byte step between predictor rows)
|
||||||
|
; sp stride (byte step between destination rows; nothing is pushed, so it is at [sp])
|
||||||
|
|vp8_dc_only_idct_add_neon| PROC
|
||||||
|
add r0, r0, #4 ; input_dc + 4 (rounding)
|
||||||
|
asr r0, r0, #3 ; a1 = (input_dc + 4) >> 3
|
||||||
|
ldr r12, [sp] ; stride
|
||||||
|
vdup.16 q0, r0 ; replicate a1 into all eight 16-bit lanes
|
||||||
|
|
||||||
|
vld1.32 {d2[0]}, [r1], r3 ; row 0: 4 predictor bytes, pred_ptr += pitch
|
||||||
|
vld1.32 {d2[1]}, [r1], r3 ; row 1
|
||||||
|
vld1.32 {d4[0]}, [r1], r3 ; row 2
|
||||||
|
vld1.32 {d4[1]}, [r1] ; row 3
|
||||||
|
|
||||||
|
vaddw.u8 q1, q0, d2 ; rows 0-1: widen predictor bytes to 16 bits and add a1
|
||||||
|
vaddw.u8 q2, q0, d4 ; rows 2-3
|
||||||
|
|
||||||
|
vqmovun.s16 d2, q1 ; saturate signed 16-bit sums to unsigned 8 bits
|
||||||
|
vqmovun.s16 d4, q2
|
||||||
|
|
||||||
|
vst1.32 {d2[0]}, [r2], r12 ; row 0: store 4 bytes, dst_ptr += stride
|
||||||
|
vst1.32 {d2[1]}, [r2], r12 ; row 1
|
||||||
|
vst1.32 {d4[0]}, [r2], r12 ; row 2
|
||||||
|
vst1.32 {d4[1]}, [r2] ; row 3
|
||||||
|
|
||||||
|
bx lr ; return; no registers were saved
|
||||||
|
|
||||||
|
ENDP
|
||||||
|
END
|
||||||
218
vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
Normal file
218
vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
;
|
||||||
|
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||||
|
;
|
||||||
|
; Use of this source code is governed by a BSD-style license and patent
|
||||||
|
; grant that can be found in the LICENSE file in the root of the source
|
||||||
|
; tree. All contributing project authors may be found in the AUTHORS
|
||||||
|
; file in the root of the source tree.
|
||||||
|
;
|
||||||
|
|
||||||
|
|
||||||
|
EXPORT |vp8_dequant_dc_idct_add_v6|
|
||||||
|
|
||||||
|
AREA |.text|, CODE, READONLY
|
||||||
|
|
||||||
|
; Dequantizes 16 coefficients in place (input[i] * dq[i], with input[0]
; replaced by Dc), runs the two-pass 4x4 inverse DCT on them, adds the
; result to the predictor, clamps to 0..255, writes 4 rows of 4 bytes to
; dest, and finally clears the 32-byte input buffer.
;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
|
||||||
|
; unsigned char *dest, int pitch, int stride, int Dc)
|
||||||
|
; r0 = input
|
||||||
|
; r1 = dq
|
||||||
|
; r2 = pred
|
||||||
|
; r3 = dest
|
||||||
|
; Offsets below are relative to sp after the 9-register push (36 bytes);
|
||||||
|
; the "+4" value applies once the 4-byte spill slot has been allocated.
; sp + 36 = pitch ; +4 = 40
|
||||||
|
; sp + 40 = stride ; +4 = 44
|
||||||
|
; sp + 44 = Dc ; +4 = 48
|
||||||
|
|
||||||
|
|
||||||
|
|vp8_dequant_dc_idct_add_v6| PROC
|
||||||
|
stmdb sp!, {r4-r11, lr} ; save 9 registers (36 bytes)
|
||||||
|
|
||||||
|
ldr r6, [sp, #44] ; r6 = Dc
|
||||||
|
|
||||||
|
ldr r4, [r0] ;input (first two coefficients)
|
||||||
|
ldr r5, [r1], #4 ;dq (first two factors), dq += 4
|
||||||
|
|
||||||
|
sub sp, sp, #4 ; allocate a spill slot; stack arguments move up by 4
|
||||||
|
str r3, [sp] ; spill dest so r3 can be reused as a constant register
|
||||||
|
|
||||||
|
smultt r7, r4, r5 ; product of the top halfwords only; the bottom one is replaced by Dc
|
||||||
|
|
||||||
|
ldr r4, [r0, #4] ;input (next word, loaded before the stores advance r0)
|
||||||
|
ldr r5, [r1], #4 ;dq
|
||||||
|
|
||||||
|
strh r6, [r0], #2 ; first coefficient = Dc
|
||||||
|
strh r7, [r0], #2 ; second coefficient = dequantized value
|
||||||
|
|
||||||
|
smulbb r6, r4, r5 ; bottom halfwords: input * dq
|
||||||
|
smultt r7, r4, r5 ; top halfwords: input * dq
|
||||||
|
|
||||||
|
ldr r4, [r0, #4] ;input
|
||||||
|
ldr r5, [r1], #4 ;dq
|
||||||
|
|
||||||
|
strh r6, [r0], #2
|
||||||
|
strh r7, [r0], #2
|
||||||
|
|
||||||
|
mov r12, #3 ; 3 iterations x 2 words cover the remaining 6 words
|
||||||
|
|
||||||
|
vp8_dequant_dc_add_loop
|
||||||
|
smulbb r6, r4, r5
|
||||||
|
smultt r7, r4, r5
|
||||||
|
|
||||||
|
ldr r4, [r0, #4] ;input
|
||||||
|
ldr r5, [r1], #4 ;dq
|
||||||
|
|
||||||
|
strh r6, [r0], #2
|
||||||
|
strh r7, [r0], #2
|
||||||
|
|
||||||
|
smulbb r6, r4, r5
|
||||||
|
smultt r7, r4, r5
|
||||||
|
|
||||||
|
subs r12, r12, #1 ; count down; flags drive the ldrne pair and the bne
|
||||||
|
|
||||||
|
ldrne r4, [r0, #4] ; preload the next pair only if another iteration follows
|
||||||
|
ldrne r5, [r1], #4
|
||||||
|
|
||||||
|
strh r6, [r0], #2
|
||||||
|
strh r7, [r0], #2
|
||||||
|
|
||||||
|
bne vp8_dequant_dc_add_loop
|
||||||
|
|
||||||
|
sub r0, r0, #32 ; rewind to the start of the 16 coefficients
|
||||||
|
mov r1, r0 ; first pass writes its output back over the input
|
||||||
|
|
||||||
|
; short_idct4x4llm_v6_dual
|
||||||
|
ldr r3, cospi8sqrt2minus1 ; constant 0x4E7B
|
||||||
|
ldr r4, sinpi8sqrt2 ; constant 0x8A8C
|
||||||
|
ldr r6, [r0, #8] ; i5 | i4 for the first iteration
|
||||||
|
mov r5, #2 ; two iterations, two columns each
|
||||||
|
vp8_dequant_dc_idct_loop1_v6
|
||||||
|
ldr r12, [r0, #24] ; i13 | i12
|
||||||
|
ldr r14, [r0, #16] ; i9 | i8
|
||||||
|
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
|
||||||
|
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
|
||||||
|
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
|
||||||
|
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
|
||||||
|
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
|
||||||
|
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
|
||||||
|
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
|
||||||
|
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
|
||||||
|
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
|
||||||
|
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
|
||||||
|
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
|
||||||
|
subs r5, r5, #1 ; i--; flags drive the ldrne and bne below
|
||||||
|
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
|
||||||
|
ldr r11, [r0], #4 ; i1 | i0, input += 4
|
||||||
|
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
|
||||||
|
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
|
||||||
|
usub16 r7, r8, r7 ; c
|
||||||
|
uadd16 r6, r6, r10 ; d
|
||||||
|
uadd16 r10, r11, r14 ; a
|
||||||
|
usub16 r8, r11, r14 ; b
|
||||||
|
uadd16 r9, r10, r6 ; a+d
|
||||||
|
usub16 r10, r10, r6 ; a-d
|
||||||
|
uadd16 r6, r8, r7 ; b+c
|
||||||
|
usub16 r7, r8, r7 ; b-c
|
||||||
|
str r6, [r1, #8] ; o5 | o4
|
||||||
|
ldrne r6, [r0, #8] ; preload i5 | i4 of the next column pair
|
||||||
|
str r7, [r1, #16] ; o9 | o8
|
||||||
|
str r10, [r1, #24] ; o13 | o12
|
||||||
|
str r9, [r1], #4 ; o1 | o0, output += 4
|
||||||
|
bne vp8_dequant_dc_idct_loop1_v6
|
||||||
|
|
||||||
|
mov r5, #2 ; second pass: two iterations, two output rows each
|
||||||
|
sub r0, r1, #8 ; rewind to the start of the pass-1 output
|
||||||
|
vp8_dequant_dc_idct_loop2_v6
|
||||||
|
ldr r6, [r0], #4 ; load four consecutive words (16 bytes) of pass-1 output
|
||||||
|
ldr r7, [r0], #4
|
||||||
|
ldr r8, [r0], #4
|
||||||
|
ldr r9, [r0], #4
|
||||||
|
smulwt r1, r3, r6
|
||||||
|
smulwt r12, r4, r6
|
||||||
|
smulwt lr, r3, r8
|
||||||
|
smulwt r10, r4, r8
|
||||||
|
pkhbt r11, r8, r6, lsl #16
|
||||||
|
pkhbt r1, lr, r1, lsl #16
|
||||||
|
pkhbt r12, r10, r12, lsl #16
|
||||||
|
pkhtb r6, r6, r8, asr #16
|
||||||
|
uadd16 r6, r1, r6
|
||||||
|
pkhbt lr, r9, r7, lsl #16
|
||||||
|
uadd16 r10, r11, lr
|
||||||
|
usub16 lr, r11, lr
|
||||||
|
pkhtb r8, r7, r9, asr #16
|
||||||
|
subs r5, r5, #1 ; i--; no later instruction in the loop rewrites these flags before the bne
|
||||||
|
smulwt r1, r3, r8
|
||||||
|
smulwb r7, r3, r8
|
||||||
|
smulwt r11, r4, r8
|
||||||
|
smulwb r9, r4, r8
|
||||||
|
pkhbt r1, r7, r1, lsl #16
|
||||||
|
uadd16 r8, r1, r8
|
||||||
|
pkhbt r11, r9, r11, lsl #16
|
||||||
|
usub16 r1, r12, r8
|
||||||
|
uadd16 r8, r11, r6
|
||||||
|
ldr r9, c0x00040004 ; rounding constant: 4 in each halfword
|
||||||
|
ldr r12, [sp, #40] ; pitch (36 bytes pushed + 4-byte spill slot)
|
||||||
|
uadd16 r6, r10, r8
|
||||||
|
usub16 r7, r10, r8
|
||||||
|
uadd16 r7, r7, r9 ; add the rounding term ahead of the >> 3 below
|
||||||
|
uadd16 r6, r6, r9
|
||||||
|
uadd16 r10, r14, r1
|
||||||
|
usub16 r1, r14, r1
|
||||||
|
uadd16 r10, r10, r9
|
||||||
|
uadd16 r1, r1, r9
|
||||||
|
ldr r11, [r2], r12 ; 4 predictor bytes for the first row, pred += pitch
|
||||||
|
mov r8, r7, asr #3 ; first row uses the top halfwords: shift each right by 3 and pack pairs
|
||||||
|
pkhtb r9, r8, r10, asr #19
|
||||||
|
mov r8, r1, asr #3
|
||||||
|
pkhtb r8, r8, r6, asr #19
|
||||||
|
uxtb16 lr, r11, ror #8 ; predictor bytes 1 and 3 as halfwords
|
||||||
|
qadd16 r9, r9, lr ; residual + predictor
|
||||||
|
uxtb16 lr, r11 ; predictor bytes 0 and 2 as halfwords
|
||||||
|
qadd16 r8, r8, lr
|
||||||
|
usat16 r9, #8, r9 ; clamp each halfword to 0..255
|
||||||
|
usat16 r8, #8, r8
|
||||||
|
orr r9, r8, r9, lsl #8 ; interleave into the 4 output bytes of the first row
|
||||||
|
ldr r11, [r2], r12 ; 4 predictor bytes for the second row, pred += pitch
|
||||||
|
ldr lr, [sp] ; reload dest from the spill slot
|
||||||
|
ldr r12, [sp, #44] ; stride
|
||||||
|
mov r7, r7, lsl #16 ; second row uses the low halfwords: move them to the top
|
||||||
|
mov r1, r1, lsl #16
|
||||||
|
mov r10, r10, lsl #16
|
||||||
|
mov r6, r6, lsl #16
|
||||||
|
mov r7, r7, asr #3 ; then the same arithmetic shift by 3 and packing
|
||||||
|
pkhtb r7, r7, r10, asr #19
|
||||||
|
mov r1, r1, asr #3
|
||||||
|
pkhtb r1, r1, r6, asr #19
|
||||||
|
uxtb16 r8, r11, ror #8
|
||||||
|
qadd16 r7, r7, r8
|
||||||
|
uxtb16 r8, r11
|
||||||
|
qadd16 r1, r1, r8
|
||||||
|
usat16 r7, #8, r7
|
||||||
|
usat16 r1, #8, r1
|
||||||
|
orr r1, r1, r7, lsl #8 ; 4 output bytes of the second row
|
||||||
|
str r9, [lr], r12 ; store first row, dest += stride
|
||||||
|
str r1, [lr], r12 ; store second row, dest += stride
|
||||||
|
str lr, [sp] ; save the advanced dest pointer for the next iteration
|
||||||
|
bne vp8_dequant_dc_idct_loop2_v6 ; tests the flags set by subs r5 above
|
||||||
|
|
||||||
|
; vpx_memset
|
||||||
|
sub r0, r0, #32 ; back to the start of the coefficient buffer
|
||||||
|
add sp, sp, #4 ; release the spill slot
|
||||||
|
|
||||||
|
mov r12, #0 ; zero all 8 words (16 shorts) of input
|
||||||
|
str r12, [r0]
|
||||||
|
str r12, [r0, #4]
|
||||||
|
str r12, [r0, #8]
|
||||||
|
str r12, [r0, #12]
|
||||||
|
str r12, [r0, #16]
|
||||||
|
str r12, [r0, #20]
|
||||||
|
str r12, [r0, #24]
|
||||||
|
str r12, [r0, #28]
|
||||||
|
|
||||||
|
ldmia sp!, {r4 - r11, pc} ; restore registers and return
|
||||||
|
ENDP ; |vp8_dequant_dc_idct_add_v6|
|
||||||
|
|
||||||
|
; Constant Pool
|
||||||
|
cospi8sqrt2minus1 DCD 0x00004E7B
|
||||||
|
sinpi8sqrt2 DCD 0x00008A8C
|
||||||
|
c0x00040004 DCD 0x00040004
|
||||||
|
|
||||||
|
END
|
||||||
196
vp8/decoder/arm/armv6/dequant_idct_v6.asm
Normal file
196
vp8/decoder/arm/armv6/dequant_idct_v6.asm
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
;
|
||||||
|
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
||||||
|
;
|
||||||
|
; Use of this source code is governed by a BSD-style license and patent
|
||||||
|
; grant that can be found in the LICENSE file in the root of the source
|
||||||
|
; tree. All contributing project authors may be found in the AUTHORS
|
||||||
|
; file in the root of the source tree.
|
||||||
|
;
|
||||||
|
|
||||||
|
EXPORT |vp8_dequant_idct_add_v6|
|
||||||
|
|
||||||
|
AREA |.text|, CODE, READONLY
|
||||||
|
; Dequantizes 16 coefficients in place (input[i] * dq[i]), runs the
; two-pass 4x4 inverse DCT on them, adds the result to the predictor,
; clamps to 0..255, writes 4 rows of 4 bytes to dest, and finally clears
; the 32-byte input buffer.
;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
|
||||||
|
; unsigned char *dest, int pitch, int stride)
|
||||||
|
; r0 = input
|
||||||
|
; r1 = dq
|
||||||
|
; r2 = pred
|
||||||
|
; r3 = dest
|
||||||
|
; Offsets below are relative to sp after the 9-register push (36 bytes);
|
||||||
|
; the "+4" value applies once the 4-byte spill slot has been allocated.
; sp + 36 = pitch ; +4 = 40
|
||||||
|
; sp + 40 = stride ; +4 = 44
|
||||||
|
|
||||||
|
|
||||||
|
|vp8_dequant_idct_add_v6| PROC
|
||||||
|
stmdb sp!, {r4-r11, lr} ; save 9 registers (36 bytes)
|
||||||
|
|
||||||
|
ldr r4, [r0] ;input (first two coefficients)
|
||||||
|
ldr r5, [r1], #4 ;dq (first two factors), dq += 4
|
||||||
|
|
||||||
|
sub sp, sp, #4 ; allocate a spill slot; stack arguments move up by 4
|
||||||
|
str r3, [sp] ; spill dest so r3 can be reused as a constant register
|
||||||
|
|
||||||
|
mov r12, #4 ; 4 iterations x 2 words = 16 coefficients
|
||||||
|
|
||||||
|
vp8_dequant_add_loop
|
||||||
|
smulbb r6, r4, r5 ; bottom halfwords: input * dq
|
||||||
|
smultt r7, r4, r5 ; top halfwords: input * dq
|
||||||
|
|
||||||
|
ldr r4, [r0, #4] ;input (next word, loaded before the stores advance r0)
|
||||||
|
ldr r5, [r1], #4 ;dq
|
||||||
|
|
||||||
|
strh r6, [r0], #2 ; write the products back over the input
|
||||||
|
strh r7, [r0], #2
|
||||||
|
|
||||||
|
smulbb r6, r4, r5
|
||||||
|
smultt r7, r4, r5
|
||||||
|
|
||||||
|
subs r12, r12, #1 ; count down; flags drive the ldrne pair and the bne
|
||||||
|
|
||||||
|
ldrne r4, [r0, #4] ; preload the next pair only if another iteration follows
|
||||||
|
ldrne r5, [r1], #4
|
||||||
|
|
||||||
|
strh r6, [r0], #2
|
||||||
|
strh r7, [r0], #2
|
||||||
|
|
||||||
|
bne vp8_dequant_add_loop
|
||||||
|
|
||||||
|
sub r0, r0, #32 ; rewind to the start of the 16 coefficients
|
||||||
|
mov r1, r0 ; first pass writes its output back over the input
|
||||||
|
|
||||||
|
; short_idct4x4llm_v6_dual
|
||||||
|
ldr r3, cospi8sqrt2minus1 ; constant 0x4E7B
|
||||||
|
ldr r4, sinpi8sqrt2 ; constant 0x8A8C
|
||||||
|
ldr r6, [r0, #8] ; i5 | i4 for the first iteration
|
||||||
|
mov r5, #2 ; two iterations, two columns each
|
||||||
|
vp8_dequant_idct_loop1_v6
|
||||||
|
ldr r12, [r0, #24] ; i13 | i12
|
||||||
|
ldr r14, [r0, #16] ; i9 | i8
|
||||||
|
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
|
||||||
|
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
|
||||||
|
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
|
||||||
|
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
|
||||||
|
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
|
||||||
|
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
|
||||||
|
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
|
||||||
|
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
|
||||||
|
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
|
||||||
|
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
|
||||||
|
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
|
||||||
|
subs r5, r5, #1 ; i--; flags drive the ldrne and bne below
|
||||||
|
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
|
||||||
|
ldr r11, [r0], #4 ; i1 | i0, input += 4
|
||||||
|
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
|
||||||
|
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
|
||||||
|
usub16 r7, r8, r7 ; c
|
||||||
|
uadd16 r6, r6, r10 ; d
|
||||||
|
uadd16 r10, r11, r14 ; a
|
||||||
|
usub16 r8, r11, r14 ; b
|
||||||
|
uadd16 r9, r10, r6 ; a+d
|
||||||
|
usub16 r10, r10, r6 ; a-d
|
||||||
|
uadd16 r6, r8, r7 ; b+c
|
||||||
|
usub16 r7, r8, r7 ; b-c
|
||||||
|
str r6, [r1, #8] ; o5 | o4
|
||||||
|
ldrne r6, [r0, #8] ; preload i5 | i4 of the next column pair
|
||||||
|
str r7, [r1, #16] ; o9 | o8
|
||||||
|
str r10, [r1, #24] ; o13 | o12
|
||||||
|
str r9, [r1], #4 ; o1 | o0, output += 4
|
||||||
|
bne vp8_dequant_idct_loop1_v6
|
||||||
|
|
||||||
|
mov r5, #2 ; second pass: two iterations, two output rows each
|
||||||
|
sub r0, r1, #8 ; rewind to the start of the pass-1 output
|
||||||
|
vp8_dequant_idct_loop2_v6
|
||||||
|
ldr r6, [r0], #4 ; load four consecutive words (16 bytes) of pass-1 output
|
||||||
|
ldr r7, [r0], #4
|
||||||
|
ldr r8, [r0], #4
|
||||||
|
ldr r9, [r0], #4
|
||||||
|
smulwt r1, r3, r6
|
||||||
|
smulwt r12, r4, r6
|
||||||
|
smulwt lr, r3, r8
|
||||||
|
smulwt r10, r4, r8
|
||||||
|
pkhbt r11, r8, r6, lsl #16
|
||||||
|
pkhbt r1, lr, r1, lsl #16
|
||||||
|
pkhbt r12, r10, r12, lsl #16
|
||||||
|
pkhtb r6, r6, r8, asr #16
|
||||||
|
uadd16 r6, r1, r6
|
||||||
|
pkhbt lr, r9, r7, lsl #16
|
||||||
|
uadd16 r10, r11, lr
|
||||||
|
usub16 lr, r11, lr
|
||||||
|
pkhtb r8, r7, r9, asr #16
|
||||||
|
subs r5, r5, #1 ; i--; no later instruction in the loop rewrites these flags before the bne
|
||||||
|
smulwt r1, r3, r8
|
||||||
|
smulwb r7, r3, r8
|
||||||
|
smulwt r11, r4, r8
|
||||||
|
smulwb r9, r4, r8
|
||||||
|
pkhbt r1, r7, r1, lsl #16
|
||||||
|
uadd16 r8, r1, r8
|
||||||
|
pkhbt r11, r9, r11, lsl #16
|
||||||
|
usub16 r1, r12, r8
|
||||||
|
uadd16 r8, r11, r6
|
||||||
|
ldr r9, c0x00040004 ; rounding constant: 4 in each halfword
|
||||||
|
ldr r12, [sp, #40] ; pitch (36 bytes pushed + 4-byte spill slot)
|
||||||
|
uadd16 r6, r10, r8
|
||||||
|
usub16 r7, r10, r8
|
||||||
|
uadd16 r7, r7, r9 ; add the rounding term ahead of the >> 3 below
|
||||||
|
uadd16 r6, r6, r9
|
||||||
|
uadd16 r10, r14, r1
|
||||||
|
usub16 r1, r14, r1
|
||||||
|
uadd16 r10, r10, r9
|
||||||
|
uadd16 r1, r1, r9
|
||||||
|
ldr r11, [r2], r12 ; 4 predictor bytes for the first row, pred += pitch
|
||||||
|
mov r8, r7, asr #3 ; first row uses the top halfwords: shift each right by 3 and pack pairs
|
||||||
|
pkhtb r9, r8, r10, asr #19
|
||||||
|
mov r8, r1, asr #3
|
||||||
|
pkhtb r8, r8, r6, asr #19
|
||||||
|
uxtb16 lr, r11, ror #8 ; predictor bytes 1 and 3 as halfwords
|
||||||
|
qadd16 r9, r9, lr ; residual + predictor
|
||||||
|
uxtb16 lr, r11 ; predictor bytes 0 and 2 as halfwords
|
||||||
|
qadd16 r8, r8, lr
|
||||||
|
usat16 r9, #8, r9 ; clamp each halfword to 0..255
|
||||||
|
usat16 r8, #8, r8
|
||||||
|
orr r9, r8, r9, lsl #8 ; interleave into the 4 output bytes of the first row
|
||||||
|
ldr r11, [r2], r12 ; 4 predictor bytes for the second row, pred += pitch
|
||||||
|
ldr lr, [sp] ; reload dest from the spill slot
|
||||||
|
ldr r12, [sp, #44] ; stride
|
||||||
|
mov r7, r7, lsl #16 ; second row uses the low halfwords: move them to the top
|
||||||
|
mov r1, r1, lsl #16
|
||||||
|
mov r10, r10, lsl #16
|
||||||
|
mov r6, r6, lsl #16
|
||||||
|
mov r7, r7, asr #3 ; then the same arithmetic shift by 3 and packing
|
||||||
|
pkhtb r7, r7, r10, asr #19
|
||||||
|
mov r1, r1, asr #3
|
||||||
|
pkhtb r1, r1, r6, asr #19
|
||||||
|
uxtb16 r8, r11, ror #8
|
||||||
|
qadd16 r7, r7, r8
|
||||||
|
uxtb16 r8, r11
|
||||||
|
qadd16 r1, r1, r8
|
||||||
|
usat16 r7, #8, r7
|
||||||
|
usat16 r1, #8, r1
|
||||||
|
orr r1, r1, r7, lsl #8 ; 4 output bytes of the second row
|
||||||
|
str r9, [lr], r12 ; store first row, dest += stride
|
||||||
|
str r1, [lr], r12 ; store second row, dest += stride
|
||||||
|
str lr, [sp] ; save the advanced dest pointer for the next iteration
|
||||||
|
bne vp8_dequant_idct_loop2_v6 ; tests the flags set by subs r5 above
|
||||||
|
|
||||||
|
; vpx_memset
|
||||||
|
sub r0, r0, #32 ; back to the start of the coefficient buffer
|
||||||
|
add sp, sp, #4 ; release the spill slot
|
||||||
|
|
||||||
|
mov r12, #0 ; zero all 8 words (16 shorts) of input
|
||||||
|
str r12, [r0]
|
||||||
|
str r12, [r0, #4]
|
||||||
|
str r12, [r0, #8]
|
||||||
|
str r12, [r0, #12]
|
||||||
|
str r12, [r0, #16]
|
||||||
|
str r12, [r0, #20]
|
||||||
|
str r12, [r0, #24]
|
||||||
|
str r12, [r0, #28]
|
||||||
|
|
||||||
|
ldmia sp!, {r4 - r11, pc} ; restore registers and return
|
||||||
|
ENDP ; |vp8_dequant_idct_add_v6|
|
||||||
|
|
||||||
|
; Constant Pool
|
||||||
|
cospi8sqrt2minus1 DCD 0x00004E7B
|
||||||
|
sinpi8sqrt2 DCD 0x00008A8C
|
||||||
|
c0x00040004 DCD 0x00040004
|
||||||
|
|
||||||
|
END
|
||||||
@@ -1,203 +0,0 @@
|
|||||||
;
|
|
||||||
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
|
||||||
;
|
|
||||||
; Use of this source code is governed by a BSD-style license
|
|
||||||
; that can be found in the LICENSE file in the root of the source
|
|
||||||
; tree. An additional intellectual property rights grant can be found
|
|
||||||
; in the file PATENTS. All contributing project authors may
|
|
||||||
; be found in the AUTHORS file in the root of the source tree.
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
EXPORT |vp8_dequant_dc_idct_v6|
|
|
||||||
; ARM
|
|
||||||
; REQUIRE8
|
|
||||||
; PRESERVE8
|
|
||||||
|
|
||||||
AREA |.text|, CODE, READONLY ; name this block of code
|
|
||||||
;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc)
|
|
||||||
|vp8_dequant_dc_idct_v6| PROC
|
|
||||||
stmdb sp!, {r4-r11, lr}
|
|
||||||
|
|
||||||
ldr r6, [sp, #36] ;load Dc
|
|
||||||
|
|
||||||
ldr r4, [r0] ;input
|
|
||||||
ldr r5, [r1], #4 ;dq
|
|
||||||
|
|
||||||
sub sp, sp, #4
|
|
||||||
str r0, [sp]
|
|
||||||
|
|
||||||
smultt r7, r4, r5
|
|
||||||
|
|
||||||
ldr r4, [r0, #4] ;input
|
|
||||||
ldr r5, [r1], #4 ;dq
|
|
||||||
|
|
||||||
strh r6, [r0], #2
|
|
||||||
strh r7, [r0], #2
|
|
||||||
|
|
||||||
smulbb r6, r4, r5
|
|
||||||
smultt r7, r4, r5
|
|
||||||
|
|
||||||
ldr r4, [r0, #4] ;input
|
|
||||||
ldr r5, [r1], #4 ;dq
|
|
||||||
|
|
||||||
strh r6, [r0], #2
|
|
||||||
strh r7, [r0], #2
|
|
||||||
|
|
||||||
mov r12, #3
|
|
||||||
|
|
||||||
dequant_dc_idct_loop
|
|
||||||
smulbb r6, r4, r5
|
|
||||||
smultt r7, r4, r5
|
|
||||||
|
|
||||||
ldr r4, [r0, #4] ;input
|
|
||||||
ldr r5, [r1], #4 ;dq
|
|
||||||
|
|
||||||
strh r6, [r0], #2
|
|
||||||
strh r7, [r0], #2
|
|
||||||
|
|
||||||
smulbb r6, r4, r5
|
|
||||||
smultt r7, r4, r5
|
|
||||||
|
|
||||||
subs r12, r12, #1
|
|
||||||
|
|
||||||
ldrne r4, [r0, #4]
|
|
||||||
ldrne r5, [r1], #4
|
|
||||||
|
|
||||||
strh r6, [r0], #2
|
|
||||||
strh r7, [r0], #2
|
|
||||||
|
|
||||||
bne dequant_dc_idct_loop
|
|
||||||
|
|
||||||
sub r0, r0, #32
|
|
||||||
mov r1, r2
|
|
||||||
mov r2, r3
|
|
||||||
|
|
||||||
; short_idct4x4llm_v6_dual
|
|
||||||
|
|
||||||
mov r3, #0x00004E00 ; cos
|
|
||||||
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
|
|
||||||
mov r4, #0x00008A00 ; sin
|
|
||||||
orr r4, r4, #0x0000008C ; sinpi8sqrt2
|
|
||||||
mov r5, #0x2 ; i=2 i
|
|
||||||
loop1_dual_11
|
|
||||||
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
|
|
||||||
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
|
|
||||||
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
|
|
||||||
|
|
||||||
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
|
|
||||||
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
|
|
||||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
|
|
||||||
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
|
|
||||||
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
|
|
||||||
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
|
|
||||||
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
|
|
||||||
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
|
|
||||||
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
|
|
||||||
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
|
|
||||||
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
|
|
||||||
subs r5, r5, #0x1 ; i-- --
|
|
||||||
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
|
|
||||||
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
|
|
||||||
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
|
|
||||||
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
|
|
||||||
usub16 r7, r8, r7 ; c c
|
|
||||||
uadd16 r6, r6, r10 ; d d
|
|
||||||
uadd16 r10, r11, r14 ; a a
|
|
||||||
usub16 r8, r11, r14 ; b b
|
|
||||||
uadd16 r9, r10, r6 ; a+d a+d
|
|
||||||
usub16 r10, r10, r6 ; a-d a-d
|
|
||||||
uadd16 r6, r8, r7 ; b+c b+c
|
|
||||||
usub16 r7, r8, r7 ; b-c b-c
|
|
||||||
str r6, [r1, r2] ; o5 | o4
|
|
||||||
add r6, r2, r2 ; pitch * 2 p2
|
|
||||||
str r7, [r1, r6] ; o9 | o8
|
|
||||||
add r6, r6, r2 ; pitch * 3 p3
|
|
||||||
str r10, [r1, r6] ; o13 | o12
|
|
||||||
str r9, [r1], #0x4 ; o1 | o0 ++
|
|
||||||
bne loop1_dual_11 ;
|
|
||||||
mov r5, #0x2 ; i=2 i
|
|
||||||
sub r0, r1, #8 ; reset input/output i/o
|
|
||||||
loop2_dual_22
|
|
||||||
ldr r6, [r0, r2] ; i5 | i4 5|4
|
|
||||||
ldr r1, [r0] ; i1 | i0 1|0
|
|
||||||
ldr r12, [r0, #0x4] ; i3 | i2 3|2
|
|
||||||
add r14, r2, #0x4 ; pitch + 2 p+2
|
|
||||||
ldr r14, [r0, r14] ; i7 | i6 7|6
|
|
||||||
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
|
|
||||||
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
|
|
||||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
|
|
||||||
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
|
|
||||||
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
|
|
||||||
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
|
|
||||||
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
|
|
||||||
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
|
|
||||||
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
|
|
||||||
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
|
|
||||||
uadd16 r10, r11, r9 ; a a
|
|
||||||
usub16 r9, r11, r9 ; b b
|
|
||||||
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
|
|
||||||
subs r5, r5, #0x1 ; i-- --
|
|
||||||
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
|
|
||||||
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
|
|
||||||
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
|
|
||||||
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
|
|
||||||
|
|
||||||
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
|
|
||||||
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
|
|
||||||
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
|
|
||||||
usub16 r12, r8, r6 ; c (o1 | o5) c
|
|
||||||
uadd16 r6, r11, r1 ; d (o3 | o7) d
|
|
||||||
uadd16 r7, r10, r6 ; a+d a+d
|
|
||||||
mov r8, #0x4 ; set up 4's 4
|
|
||||||
orr r8, r8, #0x40000 ; 4|4
|
|
||||||
usub16 r6, r10, r6 ; a-d a-d
|
|
||||||
uadd16 r6, r6, r8 ; a-d+4 3|7
|
|
||||||
uadd16 r7, r7, r8 ; a+d+4 0|4
|
|
||||||
uadd16 r10, r9, r12 ; b+c b+c
|
|
||||||
usub16 r1, r9, r12 ; b-c b-c
|
|
||||||
uadd16 r10, r10, r8 ; b+c+4 1|5
|
|
||||||
uadd16 r1, r1, r8 ; b-c+4 2|6
|
|
||||||
mov r8, r10, asr #19 ; o1 >> 3
|
|
||||||
strh r8, [r0, #2] ; o1
|
|
||||||
mov r8, r1, asr #19 ; o2 >> 3
|
|
||||||
strh r8, [r0, #4] ; o2
|
|
||||||
mov r8, r6, asr #19 ; o3 >> 3
|
|
||||||
strh r8, [r0, #6] ; o3
|
|
||||||
mov r8, r7, asr #19 ; o0 >> 3
|
|
||||||
strh r8, [r0], r2 ; o0 +p
|
|
||||||
sxth r10, r10 ;
|
|
||||||
mov r8, r10, asr #3 ; o5 >> 3
|
|
||||||
strh r8, [r0, #2] ; o5
|
|
||||||
sxth r1, r1 ;
|
|
||||||
mov r8, r1, asr #3 ; o6 >> 3
|
|
||||||
strh r8, [r0, #4] ; o6
|
|
||||||
sxth r6, r6 ;
|
|
||||||
mov r8, r6, asr #3 ; o7 >> 3
|
|
||||||
strh r8, [r0, #6] ; o7
|
|
||||||
sxth r7, r7 ;
|
|
||||||
mov r8, r7, asr #3 ; o4 >> 3
|
|
||||||
strh r8, [r0], r2 ; o4 +p
|
|
||||||
;;;;; subs r5, r5, #0x1 ; i-- --
|
|
||||||
bne loop2_dual_22 ;
|
|
||||||
|
|
||||||
|
|
||||||
;vpx_memset
|
|
||||||
ldr r0, [sp]
|
|
||||||
add sp, sp, #4
|
|
||||||
|
|
||||||
mov r12, #0
|
|
||||||
str r12, [r0]
|
|
||||||
str r12, [r0, #4]
|
|
||||||
str r12, [r0, #8]
|
|
||||||
str r12, [r0, #12]
|
|
||||||
str r12, [r0, #16]
|
|
||||||
str r12, [r0, #20]
|
|
||||||
str r12, [r0, #24]
|
|
||||||
str r12, [r0, #28]
|
|
||||||
|
|
||||||
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
|
|
||||||
|
|
||||||
ENDP ;|vp8_dequant_dc_idct_v6|
|
|
||||||
|
|
||||||
END
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
;
|
|
||||||
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
|
|
||||||
;
|
|
||||||
; Use of this source code is governed by a BSD-style license
|
|
||||||
; that can be found in the LICENSE file in the root of the source
|
|
||||||
; tree. An additional intellectual property rights grant can be found
|
|
||||||
; in the file PATENTS. All contributing project authors may
|
|
||||||
; be found in the AUTHORS file in the root of the source tree.
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
EXPORT |vp8_dequant_idct_v6|
|
|
||||||
; ARM
|
|
||||||
; REQUIRE8
|
|
||||||
; PRESERVE8
|
|
||||||
|
|
||||||
AREA |.text|, CODE, READONLY ; name this block of code
|
|
||||||
;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch)
|
|
||||||
|vp8_dequant_idct_v6| PROC
|
|
||||||
stmdb sp!, {r4-r11, lr}
|
|
||||||
|
|
||||||
ldr r4, [r0] ;input
|
|
||||||
ldr r5, [r1], #4 ;dq
|
|
||||||
|
|
||||||
sub sp, sp, #4
|
|
||||||
str r0, [sp]
|
|
||||||
|
|
||||||
mov r12, #4
|
|
||||||
|
|
||||||
dequant_idct_loop
|
|
||||||
smulbb r6, r4, r5
|
|
||||||
smultt r7, r4, r5
|
|
||||||
|
|
||||||
ldr r4, [r0, #4] ;input
|
|
||||||
ldr r5, [r1], #4 ;dq
|
|
||||||
|
|
||||||
strh r6, [r0], #2
|
|
||||||
strh r7, [r0], #2
|
|
||||||
|
|
||||||
smulbb r6, r4, r5
|
|
||||||
smultt r7, r4, r5
|
|
||||||
|
|
||||||
subs r12, r12, #1
|
|
||||||
|
|
||||||
ldrne r4, [r0, #4]
|
|
||||||
ldrne r5, [r1], #4
|
|
||||||
|
|
||||||
strh r6, [r0], #2
|
|
||||||
strh r7, [r0], #2
|
|
||||||
|
|
||||||
bne dequant_idct_loop
|
|
||||||
|
|
||||||
sub r0, r0, #32
|
|
||||||
mov r1, r2
|
|
||||||
mov r2, r3
|
|
||||||
|
|
||||||
; short_idct4x4llm_v6_dual
|
|
||||||
|
|
||||||
mov r3, #0x00004E00 ; cos
|
|
||||||
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
|
|
||||||
mov r4, #0x00008A00 ; sin
|
|
||||||
orr r4, r4, #0x0000008C ; sinpi8sqrt2
|
|
||||||
mov r5, #0x2 ; i=2 i
|
|
||||||
loop1_dual_1
|
|
||||||
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
|
|
||||||
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
|
|
||||||
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
|
|
||||||
|
|
||||||
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
|
|
||||||
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
|
|
||||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
|
|
||||||
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
|
|
||||||
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
|
|
||||||
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
|
|
||||||
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
|
|
||||||
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
|
|
||||||
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
|
|
||||||
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
|
|
||||||
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
|
|
||||||
subs r5, r5, #0x1 ; i-- --
|
|
||||||
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
|
|
||||||
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
|
|
||||||
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
|
|
||||||
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
|
|
||||||
usub16 r7, r8, r7 ; c c
|
|
||||||
uadd16 r6, r6, r10 ; d d
|
|
||||||
uadd16 r10, r11, r14 ; a a
|
|
||||||
usub16 r8, r11, r14 ; b b
|
|
||||||
uadd16 r9, r10, r6 ; a+d a+d
|
|
||||||
usub16 r10, r10, r6 ; a-d a-d
|
|
||||||
uadd16 r6, r8, r7 ; b+c b+c
|
|
||||||
usub16 r7, r8, r7 ; b-c b-c
|
|
||||||
str r6, [r1, r2] ; o5 | o4
|
|
||||||
add r6, r2, r2 ; pitch * 2 p2
|
|
||||||
str r7, [r1, r6] ; o9 | o8
|
|
||||||
add r6, r6, r2 ; pitch * 3 p3
|
|
||||||
str r10, [r1, r6] ; o13 | o12
|
|
||||||
str r9, [r1], #0x4 ; o1 | o0 ++
|
|
||||||
bne loop1_dual_1 ;
|
|
||||||
mov r5, #0x2 ; i=2 i
|
|
||||||
sub r0, r1, #8 ; reset input/output i/o
|
|
||||||
loop2_dual_2
|
|
||||||
ldr r6, [r0, r2] ; i5 | i4 5|4
|
|
||||||
ldr r1, [r0] ; i1 | i0 1|0
|
|
||||||
ldr r12, [r0, #0x4] ; i3 | i2 3|2
|
|
||||||
add r14, r2, #0x4 ; pitch + 2 p+2
|
|
||||||
ldr r14, [r0, r14] ; i7 | i6 7|6
|
|
||||||
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
|
|
||||||
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
|
|
||||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
|
|
||||||
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
|
|
||||||
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
|
|
||||||
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
|
|
||||||
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
|
|
||||||
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
|
|
||||||
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
|
|
||||||
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
|
|
||||||
uadd16 r10, r11, r9 ; a a
|
|
||||||
usub16 r9, r11, r9 ; b b
|
|
||||||
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
|
|
||||||
subs r5, r5, #0x1 ; i-- --
|
|
||||||
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
|
|
||||||
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
|
|
||||||
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
|
|
||||||
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
|
|
||||||
|
|
||||||
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
|
|
||||||
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
|
|
||||||
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
|
|
||||||
usub16 r12, r8, r6 ; c (o1 | o5) c
|
|
||||||
uadd16 r6, r11, r1 ; d (o3 | o7) d
|
|
||||||
uadd16 r7, r10, r6 ; a+d a+d
|
|
||||||
mov r8, #0x4 ; set up 4's 4
|
|
||||||
orr r8, r8, #0x40000 ; 4|4
|
|
||||||
usub16 r6, r10, r6 ; a-d a-d
|
|
||||||
uadd16 r6, r6, r8 ; a-d+4 3|7
|
|
||||||
uadd16 r7, r7, r8 ; a+d+4 0|4
|
|
||||||
uadd16 r10, r9, r12 ; b+c b+c
|
|
||||||
usub16 r1, r9, r12 ; b-c b-c
|
|
||||||
uadd16 r10, r10, r8 ; b+c+4 1|5
|
|
||||||
uadd16 r1, r1, r8 ; b-c+4 2|6
|
|
||||||
mov r8, r10, asr #19 ; o1 >> 3
|
|
||||||
strh r8, [r0, #2] ; o1
|
|
||||||
mov r8, r1, asr #19 ; o2 >> 3
|
|
||||||
strh r8, [r0, #4] ; o2
|
|
||||||
mov r8, r6, asr #19 ; o3 >> 3
|
|
||||||
strh r8, [r0, #6] ; o3
|
|
||||||
mov r8, r7, asr #19 ; o0 >> 3
|
|
||||||
strh r8, [r0], r2 ; o0 +p
|
|
||||||
sxth r10, r10 ;
|
|
||||||
mov r8, r10, asr #3 ; o5 >> 3
|
|
||||||
strh r8, [r0, #2] ; o5
|
|
||||||
sxth r1, r1 ;
|
|
||||||
mov r8, r1, asr #3 ; o6 >> 3
|
|
||||||
strh r8, [r0, #4] ; o6
|
|
||||||
sxth r6, r6 ;
|
|
||||||
mov r8, r6, asr #3 ; o7 >> 3
|
|
||||||
strh r8, [r0, #6] ; o7
|
|
||||||
sxth r7, r7 ;
|
|
||||||
mov r8, r7, asr #3 ; o4 >> 3
|
|
||||||
strh r8, [r0], r2 ; o4 +p
|
|
||||||
;;;;; subs r5, r5, #0x1 ; i-- --
|
|
||||||
bne loop2_dual_2 ;
|
|
||||||
;
|
|
||||||
|
|
||||||
;vpx_memset
|
|
||||||
ldr r0, [sp]
|
|
||||||
add sp, sp, #4
|
|
||||||
|
|
||||||
mov r12, #0
|
|
||||||
str r12, [r0]
|
|
||||||
str r12, [r0, #4]
|
|
||||||
str r12, [r0, #8]
|
|
||||||
str r12, [r0, #12]
|
|
||||||
str r12, [r0, #16]
|
|
||||||
str r12, [r0, #20]
|
|
||||||
str r12, [r0, #24]
|
|
||||||
str r12, [r0, #28]
|
|
||||||
|
|
||||||
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
|
|
||||||
|
|
||||||
ENDP ;|vp8_dequant_idct_v6|
|
|
||||||
|
|
||||||
END
|
|
||||||
@@ -14,14 +14,32 @@
|
|||||||
|
|
||||||
#if HAVE_ARMV6
|
#if HAVE_ARMV6
|
||||||
extern prototype_dequant_block(vp8_dequantize_b_v6);
|
extern prototype_dequant_block(vp8_dequantize_b_v6);
|
||||||
|
extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6);
|
||||||
|
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6);
|
||||||
|
|
||||||
#undef vp8_dequant_block
|
#undef vp8_dequant_block
|
||||||
#define vp8_dequant_block vp8_dequantize_b_v6
|
#define vp8_dequant_block vp8_dequantize_b_v6
|
||||||
|
|
||||||
|
#undef vp8_dequant_idct_add
|
||||||
|
#define vp8_dequant_idct_add vp8_dequant_idct_add_v6
|
||||||
|
|
||||||
|
#undef vp8_dequant_dc_idct_add
|
||||||
|
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6
|
||||||
|
#endif
|
||||||
|
|
||||||
#if HAVE_ARMV7
|
#if HAVE_ARMV7
|
||||||
extern prototype_dequant_block(vp8_dequantize_b_neon);
|
extern prototype_dequant_block(vp8_dequantize_b_neon);
|
||||||
|
extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon);
|
||||||
|
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon);
|
||||||
|
|
||||||
#undef vp8_dequant_block
|
#undef vp8_dequant_block
|
||||||
#define vp8_dequant_block vp8_dequantize_b_neon
|
#define vp8_dequant_block vp8_dequantize_b_neon
|
||||||
|
|
||||||
|
#undef vp8_dequant_idct_add
|
||||||
|
#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
|
||||||
|
|
||||||
|
#undef vp8_dequant_dc_idct_add
|
||||||
|
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -9,31 +9,43 @@
|
|||||||
;
|
;
|
||||||
|
|
||||||
|
|
||||||
EXPORT |vp8_dequant_dc_idct_neon|
|
EXPORT |vp8_dequant_dc_idct_add_neon|
|
||||||
ARM
|
ARM
|
||||||
REQUIRE8
|
REQUIRE8
|
||||||
PRESERVE8
|
PRESERVE8
|
||||||
|
|
||||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||||
;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc);
|
;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
|
||||||
|
; unsigned char *dest, int pitch, int stride,
|
||||||
|
; int Dc);
|
||||||
; r0 short *input,
|
; r0 short *input,
|
||||||
; r1 short *dq,
|
; r1 short *dq,
|
||||||
; r2 short *output,
|
; r2 unsigned char *pred
|
||||||
; r3 int pitch,
|
; r3 unsigned char *dest
|
||||||
; (stack) int Dc
|
; sp int pitch
|
||||||
|vp8_dequant_dc_idct_neon| PROC
|
; sp+4 int stride
|
||||||
|
; sp+8 int Dc
|
||||||
|
|vp8_dequant_dc_idct_add_neon| PROC
|
||||||
vld1.16 {q3, q4}, [r0]
|
vld1.16 {q3, q4}, [r0]
|
||||||
vld1.16 {q5, q6}, [r1]
|
vld1.16 {q5, q6}, [r1]
|
||||||
|
|
||||||
ldr r1, [sp] ;load Dc from stack
|
ldr r1, [sp, #8] ;load Dc from stack
|
||||||
|
|
||||||
ldr r12, _dcidct_coeff_
|
ldr r12, _CONSTANTS_
|
||||||
|
|
||||||
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
|
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
|
||||||
vmul.i16 q2, q4, q6
|
vmul.i16 q2, q4, q6
|
||||||
|
|
||||||
vmov.16 d2[0], r1
|
vmov.16 d2[0], r1
|
||||||
|
|
||||||
|
ldr r1, [sp] ; pitch
|
||||||
|
vld1.32 {d14[0]}, [r2], r1
|
||||||
|
vld1.32 {d14[1]}, [r2], r1
|
||||||
|
vld1.32 {d15[0]}, [r2], r1
|
||||||
|
vld1.32 {d15[1]}, [r2]
|
||||||
|
|
||||||
|
ldr r1, [sp, #4] ; stride
|
||||||
|
|
||||||
;|short_idct4x4llm_neon| PROC
|
;|short_idct4x4llm_neon| PROC
|
||||||
vld1.16 {d0}, [r12]
|
vld1.16 {d0}, [r12]
|
||||||
vswp d3, d4 ;q2(vp[4] vp[12])
|
vswp d3, d4 ;q2(vp[4] vp[12])
|
||||||
@@ -47,14 +59,9 @@
|
|||||||
vshr.s16 q3, q3, #1
|
vshr.s16 q3, q3, #1
|
||||||
vshr.s16 q4, q4, #1
|
vshr.s16 q4, q4, #1
|
||||||
|
|
||||||
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
|
vqadd.s16 q3, q3, q2
|
||||||
vqadd.s16 q4, q4, q2
|
vqadd.s16 q4, q4, q2
|
||||||
|
|
||||||
;d6 - c1:temp1
|
|
||||||
;d7 - d1:temp2
|
|
||||||
;d8 - d1:temp1
|
|
||||||
;d9 - c1:temp2
|
|
||||||
|
|
||||||
vqsub.s16 d10, d6, d9 ;c1
|
vqsub.s16 d10, d6, d9 ;c1
|
||||||
vqadd.s16 d11, d7, d8 ;d1
|
vqadd.s16 d11, d7, d8 ;d1
|
||||||
|
|
||||||
@@ -83,7 +90,7 @@
|
|||||||
vshr.s16 q3, q3, #1
|
vshr.s16 q3, q3, #1
|
||||||
vshr.s16 q4, q4, #1
|
vshr.s16 q4, q4, #1
|
||||||
|
|
||||||
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
|
vqadd.s16 q3, q3, q2
|
||||||
vqadd.s16 q4, q4, q2
|
vqadd.s16 q4, q4, q2
|
||||||
|
|
||||||
vqsub.s16 d10, d6, d9 ;c1
|
vqsub.s16 d10, d6, d9 ;c1
|
||||||
@@ -101,34 +108,29 @@
|
|||||||
vrshr.s16 d4, d4, #3
|
vrshr.s16 d4, d4, #3
|
||||||
vrshr.s16 d5, d5, #3
|
vrshr.s16 d5, d5, #3
|
||||||
|
|
||||||
add r1, r2, r3
|
|
||||||
add r12, r1, r3
|
|
||||||
add r0, r12, r3
|
|
||||||
|
|
||||||
vtrn.32 d2, d4
|
vtrn.32 d2, d4
|
||||||
vtrn.32 d3, d5
|
vtrn.32 d3, d5
|
||||||
vtrn.16 d2, d3
|
vtrn.16 d2, d3
|
||||||
vtrn.16 d4, d5
|
vtrn.16 d4, d5
|
||||||
|
|
||||||
vst1.16 {d2}, [r2]
|
vaddw.u8 q1, q1, d14
|
||||||
vst1.16 {d3}, [r1]
|
vaddw.u8 q2, q2, d15
|
||||||
vst1.16 {d4}, [r12]
|
|
||||||
vst1.16 {d5}, [r0]
|
vqmovun.s16 d0, q1
|
||||||
|
vqmovun.s16 d1, q2
|
||||||
|
|
||||||
|
vst1.32 {d0[0]}, [r3], r1
|
||||||
|
vst1.32 {d0[1]}, [r3], r1
|
||||||
|
vst1.32 {d1[0]}, [r3], r1
|
||||||
|
vst1.32 {d1[1]}, [r3]
|
||||||
|
|
||||||
bx lr
|
bx lr
|
||||||
|
|
||||||
ENDP
|
ENDP ; |vp8_dequant_dc_idct_add_neon|
|
||||||
|
|
||||||
;-----------------
|
; Constant Pool
|
||||||
AREA dcidct4x4_dat, DATA, READWRITE ;read/write by default
|
_CONSTANTS_ DCD cospi8sqrt2minus1
|
||||||
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
|
cospi8sqrt2minus1 DCD 0x4e7b4e7b
|
||||||
;One word each is reserved. Label filter_coeff can be used to access the data.
|
sinpi8sqrt2 DCD 0x8a8c8a8c
|
||||||
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
|
|
||||||
_dcidct_coeff_
|
|
||||||
DCD dcidct_coeff
|
|
||||||
dcidct_coeff
|
|
||||||
DCD 0x4e7b4e7b, 0x8a8c8a8c
|
|
||||||
|
|
||||||
;20091, 20091, 35468, 35468
|
|
||||||
|
|
||||||
END
|
END
|
||||||
@@ -9,22 +9,33 @@
|
|||||||
;
|
;
|
||||||
|
|
||||||
|
|
||||||
EXPORT |vp8_dequant_idct_neon|
|
EXPORT |vp8_dequant_idct_add_neon|
|
||||||
ARM
|
ARM
|
||||||
REQUIRE8
|
REQUIRE8
|
||||||
PRESERVE8
|
PRESERVE8
|
||||||
|
|
||||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||||
;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch);
|
;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
|
||||||
|
; unsigned char *dest, int pitch, int stride)
|
||||||
; r0 short *input,
|
; r0 short *input,
|
||||||
; r1 short *dq,
|
; r1 short *dq,
|
||||||
; r2 short *output,
|
; r2 unsigned char *pred
|
||||||
; r3 int pitch,
|
; r3 unsigned char *dest
|
||||||
|vp8_dequant_idct_neon| PROC
|
; sp int pitch
|
||||||
|
; sp+4 int stride
|
||||||
|
|
||||||
|
|vp8_dequant_idct_add_neon| PROC
|
||||||
vld1.16 {q3, q4}, [r0]
|
vld1.16 {q3, q4}, [r0]
|
||||||
vld1.16 {q5, q6}, [r1]
|
vld1.16 {q5, q6}, [r1]
|
||||||
|
ldr r1, [sp] ; pitch
|
||||||
|
vld1.32 {d14[0]}, [r2], r1
|
||||||
|
vld1.32 {d14[1]}, [r2], r1
|
||||||
|
vld1.32 {d15[0]}, [r2], r1
|
||||||
|
vld1.32 {d15[1]}, [r2]
|
||||||
|
|
||||||
ldr r12, _didct_coeff_
|
ldr r1, [sp, #4] ; stride
|
||||||
|
|
||||||
|
ldr r12, _CONSTANTS_
|
||||||
|
|
||||||
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
|
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
|
||||||
vmul.i16 q2, q4, q6
|
vmul.i16 q2, q4, q6
|
||||||
@@ -42,14 +53,9 @@
|
|||||||
vshr.s16 q3, q3, #1
|
vshr.s16 q3, q3, #1
|
||||||
vshr.s16 q4, q4, #1
|
vshr.s16 q4, q4, #1
|
||||||
|
|
||||||
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
|
vqadd.s16 q3, q3, q2
|
||||||
vqadd.s16 q4, q4, q2
|
vqadd.s16 q4, q4, q2
|
||||||
|
|
||||||
;d6 - c1:temp1
|
|
||||||
;d7 - d1:temp2
|
|
||||||
;d8 - d1:temp1
|
|
||||||
;d9 - c1:temp2
|
|
||||||
|
|
||||||
vqsub.s16 d10, d6, d9 ;c1
|
vqsub.s16 d10, d6, d9 ;c1
|
||||||
vqadd.s16 d11, d7, d8 ;d1
|
vqadd.s16 d11, d7, d8 ;d1
|
||||||
|
|
||||||
@@ -78,7 +84,7 @@
|
|||||||
vshr.s16 q3, q3, #1
|
vshr.s16 q3, q3, #1
|
||||||
vshr.s16 q4, q4, #1
|
vshr.s16 q4, q4, #1
|
||||||
|
|
||||||
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
|
vqadd.s16 q3, q3, q2
|
||||||
vqadd.s16 q4, q4, q2
|
vqadd.s16 q4, q4, q2
|
||||||
|
|
||||||
vqsub.s16 d10, d6, d9 ;c1
|
vqsub.s16 d10, d6, d9 ;c1
|
||||||
@@ -96,34 +102,29 @@
|
|||||||
vrshr.s16 d4, d4, #3
|
vrshr.s16 d4, d4, #3
|
||||||
vrshr.s16 d5, d5, #3
|
vrshr.s16 d5, d5, #3
|
||||||
|
|
||||||
add r1, r2, r3
|
|
||||||
add r12, r1, r3
|
|
||||||
add r0, r12, r3
|
|
||||||
|
|
||||||
vtrn.32 d2, d4
|
vtrn.32 d2, d4
|
||||||
vtrn.32 d3, d5
|
vtrn.32 d3, d5
|
||||||
vtrn.16 d2, d3
|
vtrn.16 d2, d3
|
||||||
vtrn.16 d4, d5
|
vtrn.16 d4, d5
|
||||||
|
|
||||||
vst1.16 {d2}, [r2]
|
vaddw.u8 q1, q1, d14
|
||||||
vst1.16 {d3}, [r1]
|
vaddw.u8 q2, q2, d15
|
||||||
vst1.16 {d4}, [r12]
|
|
||||||
vst1.16 {d5}, [r0]
|
vqmovun.s16 d0, q1
|
||||||
|
vqmovun.s16 d1, q2
|
||||||
|
|
||||||
|
vst1.32 {d0[0]}, [r3], r1
|
||||||
|
vst1.32 {d0[1]}, [r3], r1
|
||||||
|
vst1.32 {d1[0]}, [r3], r1
|
||||||
|
vst1.32 {d1[1]}, [r3]
|
||||||
|
|
||||||
bx lr
|
bx lr
|
||||||
|
|
||||||
ENDP
|
ENDP ; |vp8_dequant_idct_add_neon|
|
||||||
|
|
||||||
;-----------------
|
; Constant Pool
|
||||||
AREA didct4x4_dat, DATA, READWRITE ;read/write by default
|
_CONSTANTS_ DCD cospi8sqrt2minus1
|
||||||
;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
|
cospi8sqrt2minus1 DCD 0x4e7b4e7b
|
||||||
;One word each is reserved. Label filter_coeff can be used to access the data.
|
sinpi8sqrt2 DCD 0x8a8c8a8c
|
||||||
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
|
|
||||||
_didct_coeff_
|
|
||||||
DCD didct_coeff
|
|
||||||
didct_coeff
|
|
||||||
DCD 0x4e7b4e7b, 0x8a8c8a8c
|
|
||||||
|
|
||||||
;20091, 20091, 35468, 35468
|
|
||||||
|
|
||||||
END
|
END
|
||||||
@@ -272,7 +272,9 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
|
|||||||
|
|
||||||
if (b->eob > 1)
|
if (b->eob > 1)
|
||||||
{
|
{
|
||||||
DEQUANT_INVOKE(&pbi->dequant, idct_dc_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride,
|
DEQUANT_INVOKE(&pbi->dequant, dc_idct_add)
|
||||||
|
(b->qcoeff, &b->dequant[0][0], b->predictor,
|
||||||
|
*(b->base_dst) + b->dst, 16, b->dst_stride,
|
||||||
xd->block[24].diff[i]);
|
xd->block[24].diff[i]);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|||||||
@@ -32,10 +32,10 @@ void vp8_dequantize_b_c(BLOCKD *d)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
|
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
|
||||||
|
unsigned char *dest, int pitch, int stride)
|
||||||
{
|
{
|
||||||
// output needs to be at least pitch * 4 for vp8_short_idct4x4llm_c to work properly
|
short output[16];
|
||||||
short output[16*4];
|
|
||||||
short *diff_ptr = output;
|
short *diff_ptr = output;
|
||||||
int r, c;
|
int r, c;
|
||||||
int i;
|
int i;
|
||||||
@@ -45,7 +45,8 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsign
|
|||||||
input[i] = dq[i] * input[i];
|
input[i] = dq[i] * input[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
vp8_short_idct4x4llm_c(input, output, pitch*2);
|
// the idct halves ( >> 1) the pitch
|
||||||
|
vp8_short_idct4x4llm_c(input, output, 4 << 1);
|
||||||
|
|
||||||
vpx_memset(input, 0, 32);
|
vpx_memset(input, 0, 32);
|
||||||
|
|
||||||
@@ -65,16 +66,17 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsign
|
|||||||
}
|
}
|
||||||
|
|
||||||
dest += stride;
|
dest += stride;
|
||||||
diff_ptr += pitch;
|
diff_ptr += 4;
|
||||||
pred += pitch;
|
pred += pitch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
|
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
|
||||||
|
unsigned char *dest, int pitch, int stride,
|
||||||
|
int Dc)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
// output needs to be at least pitch * 4 for vp8_short_idct4x4llm_c to work properly
|
short output[16];
|
||||||
short output[16*4];
|
|
||||||
short *diff_ptr = output;
|
short *diff_ptr = output;
|
||||||
int r, c;
|
int r, c;
|
||||||
|
|
||||||
@@ -85,7 +87,8 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, uns
|
|||||||
input[i] = dq[i] * input[i];
|
input[i] = dq[i] * input[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
vp8_short_idct4x4llm_c(input, output, pitch*2);
|
// the idct halves ( >> 1) the pitch
|
||||||
|
vp8_short_idct4x4llm_c(input, output, 4 << 1);
|
||||||
|
|
||||||
vpx_memset(input, 0, 32);
|
vpx_memset(input, 0, 32);
|
||||||
|
|
||||||
@@ -105,7 +108,7 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, uns
|
|||||||
}
|
}
|
||||||
|
|
||||||
dest += stride;
|
dest += stride;
|
||||||
diff_ptr += pitch;
|
diff_ptr += 4;
|
||||||
pred += pitch;
|
pred += pitch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -21,7 +21,7 @@
|
|||||||
unsigned char *pred, unsigned char *output, \
|
unsigned char *pred, unsigned char *output, \
|
||||||
int pitch, int stride)
|
int pitch, int stride)
|
||||||
|
|
||||||
#define prototype_dequant_idct_dc_add(sym) \
|
#define prototype_dequant_dc_idct_add(sym) \
|
||||||
void sym(short *input, short *dq, \
|
void sym(short *input, short *dq, \
|
||||||
unsigned char *pred, unsigned char *output, \
|
unsigned char *pred, unsigned char *output, \
|
||||||
int pitch, int stride, \
|
int pitch, int stride, \
|
||||||
@@ -45,21 +45,21 @@ extern prototype_dequant_block(vp8_dequant_block);
|
|||||||
#endif
|
#endif
|
||||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add);
|
extern prototype_dequant_idct_add(vp8_dequant_idct_add);
|
||||||
|
|
||||||
#ifndef vp8_dequant_idct_dc_add
|
#ifndef vp8_dequant_dc_idct_add
|
||||||
#define vp8_dequant_idct_dc_add vp8_dequant_dc_idct_add_c
|
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c
|
||||||
#endif
|
#endif
|
||||||
extern prototype_dequant_idct_dc_add(vp8_dequant_idct_dc_add);
|
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add);
|
||||||
|
|
||||||
typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
|
typedef prototype_dequant_block((*vp8_dequant_block_fn_t));
|
||||||
|
|
||||||
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
|
typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t));
|
||||||
typedef prototype_dequant_idct_dc_add((*vp8_dequant_idct_dc_add_fn_t));
|
typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t));
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
vp8_dequant_block_fn_t block;
|
vp8_dequant_block_fn_t block;
|
||||||
vp8_dequant_idct_add_fn_t idct_add;
|
vp8_dequant_idct_add_fn_t idct_add;
|
||||||
vp8_dequant_idct_dc_add_fn_t idct_dc_add;
|
vp8_dequant_dc_idct_add_fn_t dc_idct_add;
|
||||||
} vp8_dequant_rtcd_vtable_t;
|
} vp8_dequant_rtcd_vtable_t;
|
||||||
|
|
||||||
#if CONFIG_RUNTIME_CPU_DETECT
|
#if CONFIG_RUNTIME_CPU_DETECT
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
|
|||||||
pbi->mb.rtcd = &pbi->common.rtcd;
|
pbi->mb.rtcd = &pbi->common.rtcd;
|
||||||
pbi->dequant.block = vp8_dequantize_b_c;
|
pbi->dequant.block = vp8_dequantize_b_c;
|
||||||
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
|
pbi->dequant.idct_add = vp8_dequant_idct_add_c;
|
||||||
pbi->dequant.idct_dc_add = vp8_dequant_dc_idct_add_c;
|
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c;
|
||||||
pbi->dboolhuff.start = vp8dx_start_decode_c;
|
pbi->dboolhuff.start = vp8dx_start_decode_c;
|
||||||
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
|
pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c;
|
||||||
#if 0 //For use with RTCD, when implemented
|
#if 0 //For use with RTCD, when implemented
|
||||||
|
|||||||
@@ -22,7 +22,7 @@
|
|||||||
#if HAVE_MMX
|
#if HAVE_MMX
|
||||||
extern prototype_dequant_block(vp8_dequantize_b_mmx);
|
extern prototype_dequant_block(vp8_dequantize_b_mmx);
|
||||||
extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
|
extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
|
||||||
extern prototype_dequant_idct_dc_add(vp8_dequant_dc_idct_add_mmx);
|
extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx);
|
||||||
|
|
||||||
|
|
||||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||||
@@ -30,10 +30,10 @@ extern prototype_dequant_idct_dc_add(vp8_dequant_dc_idct_add_mmx);
|
|||||||
#define vp8_dequant_block vp8_dequantize_b_mmx
|
#define vp8_dequant_block vp8_dequantize_b_mmx
|
||||||
|
|
||||||
#undef vp8_dequant_idct_add
|
#undef vp8_dequant_idct_add
|
||||||
#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
|
/* Non-RTCD builds bind the idct_add hook directly to the MMX routine; the name must match the extern prototype_dequant_idct_add() declaration (vp8_dequant_idct_add_mmx). */
#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
|
||||||
|
|
||||||
#undef vp8_dequant_idct_dc
|
#undef vp8_dequant_dc_idct_add
|
||||||
#define vp8_dequant_idct_add_dc vp8_dequant_dc_idct_add_mmx
|
/* Non-RTCD builds bind the dc_idct_add hook directly to the MMX routine; the name must match the extern prototype_dequant_dc_idct_add() declaration (vp8_dequant_dc_idct_add_mmx). */
#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
|
|||||||
{
|
{
|
||||||
pbi->dequant.block = vp8_dequantize_b_mmx;
|
pbi->dequant.block = vp8_dequantize_b_mmx;
|
||||||
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
|
pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
|
||||||
pbi->dequant.idct_dc_add = vp8_dequant_dc_idct_add_mmx;
|
pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -125,6 +125,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/systemdependent.c
|
|||||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/vpx_asm_offsets.c
|
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/vpx_asm_offsets.c
|
||||||
|
|
||||||
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/filter_c.c
|
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/filter_c.c
|
||||||
|
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/idctllm.c
|
||||||
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/recon.c
|
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/recon.c
|
||||||
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/reconintra4x4.c
|
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/reconintra4x4.c
|
||||||
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/generic/systemdependent.c
|
VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/generic/systemdependent.c
|
||||||
@@ -134,6 +135,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM)
|
|||||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM)
|
||||||
|
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dc_only_idct_add_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
|
||||||
@@ -150,6 +152,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict16x16_neon$(ASM
|
|||||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM)
|
||||||
|
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
|
||||||
|
|||||||
@@ -23,12 +23,12 @@ VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/generic/dsystemdependent.c
|
|||||||
|
|
||||||
#File list for armv6
|
#File list for armv6
|
||||||
# decoder
|
# decoder
|
||||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantdcidct_v6$(ASM)
|
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
|
||||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantidct_v6$(ASM)
|
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM)
|
||||||
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
|
VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM)
|
||||||
|
|
||||||
#File list for neon
|
#File list for neon
|
||||||
# decoder
|
# decoder
|
||||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantdcidct_neon$(ASM)
|
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
|
||||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantidct_neon$(ASM)
|
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM)
|
||||||
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
|
VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM)
|
||||||
|
|||||||
Reference in New Issue
Block a user