56f5a9a060
Jeff Muizelaar posted some changes to the idct/reconstruction c code. This is the equivalent update for the arm assembly. This shows a good boost on v6, and a minor boost on neon. Here are some numbers for highway in qcif, 2641 frames: HEAD neon: ~161 fps new neon: ~162 fps HEAD v6: ~102 fps new v6: ~106 fps The following functions have been updated for armv6 and neon: vp8_dc_only_idct_add vp8_dequant_idct_add vp8_dequant_dc_idct_add Conflicts: vp8/decoder/arm/armv6/dequantdcidct_v6.asm vp8/decoder/arm/armv6/dequantidct_v6.asm Resolved by removing these files. When I rewrote the functions, I also moved the files to dequant_dc_idct_v6.asm/dequant_idct_v6.asm Change-Id: Ie3300df824d52474eca1a5134cf22d8b7809a5d4
197 lines
4.6 KiB
NASM
197 lines
4.6 KiB
NASM
;
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;

; Export the entry point so other objects can link against it.
    EXPORT  |vp8_dequant_idct_add_v6|

; The routine and its constant pool are placed in a read-only code area.
    AREA    |.text|, CODE, READONLY
;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
;                             unsigned char *dest, int pitch, int stride)
;
; Multiplies the 16 coefficients at input by the matching entries of dq
; (the low 16 bits of each product are written back in place), applies the
; two-pass 4x4 inverse transform, rounds each result as (x + 4) >> 3, adds
; it to the corresponding prediction byte, saturates to 0..255 and stores
; four rows of four bytes to dest.  The 32 bytes at input are cleared
; before returning.
;
; r0 = input
; r1 = dq
; r2 = pred    (advanced by pitch after each row)
; r3 = dest    (advanced by stride after each row)
; pitch and stride are passed on the stack.  The prologue pushes nine
; registers (36 bytes) and then reserves one more word, so inside the
; routine they are read at:
;   [sp, #40] = pitch
;   [sp, #44] = stride
;
; r4-r11 are preserved.  r1-r3, r12 and the flags are overwritten; r0 again
; holds the input pointer on exit.
;
; Two 16-bit values are kept per register throughout, written below as
; {top : bottom}.  The bottom halfword is the one at the lower address,
; i.e. the code relies on little-endian data layout.
; NOTE(review): input, dq, pred and dest are all accessed with 32-bit
; loads/stores; presumably callers guarantee suitable alignment - confirm.


|vp8_dequant_idct_add_v6| PROC
    stmdb   sp!, {r4-r11, lr}           ; 9 registers = 36 bytes

    ldr     r4, [r0]                    ; {input[1] : input[0]}
    ldr     r5, [r1], #4                ; {dq[1] : dq[0]}, dq pointer += 4

    sub     sp, sp, #4                  ; one-word local at [sp]:
    str     r3, [sp]                    ; current dest row (r3 is reused)

    mov     r12, #4                     ; 4 passes x 4 coefficients = 16

; ---- dequantise in place: input[i] = low16(input[i] * dq[i]) -------------
; The loads for the next pair are issued before the stores of the current
; products, so r4/r5 always run one pair ahead of the write pointer r0.
vp8_dequant_add_loop
    smulbb  r6, r4, r5                  ; bottom coeff * bottom factor
    smultt  r7, r4, r5                  ; top coeff * top factor

    ldr     r4, [r0, #4]                ; next coefficient pair
    ldr     r5, [r1], #4                ; next factor pair

    strh    r6, [r0], #2                ; write back low halfword of each
    strh    r7, [r0], #2                ; product; r0 advances 4 bytes

    smulbb  r6, r4, r5                  ; second pair of this pass
    smultt  r7, r4, r5

    subs    r12, r12, #1                ; flags feed the ldrne pair and bne

    ldrne   r4, [r0, #4]                ; preload for the next pass; skipped
    ldrne   r5, [r1], #4                ; on the final pass (nothing left)

    strh    r6, [r0], #2
    strh    r7, [r0], #2

    bne     vp8_dequant_add_loop

    sub     r0, r0, #32                 ; rewind to the first coefficient
    mov     r1, r0                      ; r0 = read cursor, r1 = write cursor

; ---- first transform pass (short_idct4x4llm_v6_dual) ---------------------
; Combines coefficients spaced four apart: in[k], in[k+4], in[k+8],
; in[k+12].  Each pass handles k and k+1 together in the two halfword
; lanes, so two passes cover k = 0..3.  With
;   C(x) = (x * cospi8sqrt2minus1) >> 16,   S(x) = (x * sinpi8sqrt2) >> 16
; each lane computes
;   a1 = in[k] + in[k+8]              b1 = in[k] - in[k+8]
;   c1 = S(in[k+4]) - (in[k+12] + C(in[k+12]))
;   d1 = (in[k+4] + C(in[k+4])) + S(in[k+12])
; and writes back
;   in[k] = a1 + d1,  in[k+4] = b1 + c1,  in[k+8] = b1 - c1,  in[k+12] = a1 - d1
; The 16-bit lane adds/subtracts (uadd16/usub16) wrap modulo 2^16.
    ldr     r3, cospi8sqrt2minus1       ; r3 = C multiplier for both passes
    ldr     r4, sinpi8sqrt2             ; r4 = S multiplier for both passes
    ldr     r6, [r0, #8]                ; {in[5] : in[4]}
    mov     r5, #2                      ; two lane-pairs
vp8_dequant_idct_loop1_v6
    ldr     r12, [r0, #24]              ; {in[k+13] : in[k+12]}
    ldr     r14, [r0, #16]              ; {in[k+9]  : in[k+8]}   (r14 = lr)
    smulwt  r9, r3, r6                  ; C(in[k+5])
    smulwb  r7, r3, r6                  ; C(in[k+4])
    smulwt  r10, r4, r6                 ; S(in[k+5])
    smulwb  r8, r4, r6                  ; S(in[k+4])
    pkhbt   r7, r7, r9, lsl #16         ; {C(in[k+5]) : C(in[k+4])}
    smulwt  r11, r3, r12                ; C(in[k+13])
    pkhbt   r8, r8, r10, lsl #16        ; {S(in[k+5]) : S(in[k+4])}
    uadd16  r6, r6, r7                  ; in[k+4] + C(in[k+4])
    smulwt  r7, r4, r12                 ; S(in[k+13])
    smulwb  r9, r3, r12                 ; C(in[k+12])
    smulwb  r10, r4, r12                ; S(in[k+12])
    subs    r5, r5, #1                  ; flags survive to ldrne/bne below
    pkhbt   r9, r9, r11, lsl #16        ; {C(in[k+13]) : C(in[k+12])}
    ldr     r11, [r0], #4               ; {in[k+1] : in[k]}, read cursor += 4
    pkhbt   r10, r10, r7, lsl #16       ; {S(in[k+13]) : S(in[k+12])}
    uadd16  r7, r12, r9                 ; in[k+12] + C(in[k+12])
    usub16  r7, r8, r7                  ; c1
    uadd16  r6, r6, r10                 ; d1
    uadd16  r10, r11, r14               ; a1
    usub16  r8, r11, r14                ; b1
    uadd16  r9, r10, r6                 ; a1 + d1
    usub16  r10, r10, r6                ; a1 - d1
    uadd16  r6, r8, r7                  ; b1 + c1
    usub16  r7, r8, r7                  ; b1 - c1
    str     r6, [r1, #8]                ; -> in[k+4], in[k+5]
    ldrne   r6, [r0, #8]                ; next {in[k+5] : in[k+4]} pair;
                                        ; not needed after the final pass
    str     r7, [r1, #16]               ; -> in[k+8], in[k+9]
    str     r10, [r1, #24]              ; -> in[k+12], in[k+13]
    str     r9, [r1], #4                ; -> in[k], in[k+1]; write cursor += 4
    bne     vp8_dequant_idct_loop1_v6

; ---- second transform pass + add prediction + store ----------------------
; Combines four consecutive coefficients x0..x3 = in[4r .. 4r+3] using the
; same a1/b1/c1/d1 formulas (x0, x1, x2, x3 in place of in[k], in[k+4],
; in[k+8], in[k+12]):
;   out0 = a1 + d1,  out1 = b1 + c1,  out2 = b1 - c1,  out3 = a1 - d1
; Each pass loads two such groups, called A (lower address) and B, and
; processes them in parallel lanes {A : B}; two passes cover all 16 values.
; Every out is then rounded ((out + 4) >> 3), added to its prediction byte
; with signed saturation, clamped to 0..255, and the four bytes of each
; group are stored as one word to dest.
    mov     r5, #2                      ; two passes, two groups each
    sub     r0, r1, #8                  ; r1 advanced 8 bytes above: rewind
vp8_dequant_idct_loop2_v6
    ldr     r6, [r0], #4                ; {A.x1 : A.x0}
    ldr     r7, [r0], #4                ; {A.x3 : A.x2}
    ldr     r8, [r0], #4                ; {B.x1 : B.x0}
    ldr     r9, [r0], #4                ; {B.x3 : B.x2}
    smulwt  r1, r3, r6                  ; C(A.x1)   (r1 is scratch from here)
    smulwt  r12, r4, r6                 ; S(A.x1)
    smulwt  lr, r3, r8                  ; C(B.x1)
    smulwt  r10, r4, r8                 ; S(B.x1)
    pkhbt   r11, r8, r6, lsl #16        ; {A.x0 : B.x0}
    pkhbt   r1, lr, r1, lsl #16         ; {C(A.x1) : C(B.x1)}
    pkhbt   r12, r10, r12, lsl #16      ; {S(A.x1) : S(B.x1)}
    pkhtb   r6, r6, r8, asr #16         ; {A.x1 : B.x1}
    uadd16  r6, r1, r6                  ; x1 + C(x1)
    pkhbt   lr, r9, r7, lsl #16         ; {A.x2 : B.x2}
    uadd16  r10, r11, lr                ; a1 = x0 + x2
    usub16  lr, r11, lr                 ; b1 = x0 - x2  (kept in lr/r14)
    pkhtb   r8, r7, r9, asr #16         ; {A.x3 : B.x3}
    subs    r5, r5, #1                  ; nothing below alters N/Z/C/V
                                        ; before the closing bne
    smulwt  r1, r3, r8                  ; C(A.x3)
    smulwb  r7, r3, r8                  ; C(B.x3)
    smulwt  r11, r4, r8                 ; S(A.x3)
    smulwb  r9, r4, r8                  ; S(B.x3)
    pkhbt   r1, r7, r1, lsl #16         ; {C(A.x3) : C(B.x3)}
    uadd16  r8, r1, r8                  ; x3 + C(x3)
    pkhbt   r11, r9, r11, lsl #16       ; {S(A.x3) : S(B.x3)}
    usub16  r1, r12, r8                 ; c1
    uadd16  r8, r11, r6                 ; d1
    ldr     r9, c0x00040004             ; rounding term 4 in each lane
    ldr     r12, [sp, #40]              ; pitch
    uadd16  r6, r10, r8                 ; out0 = a1 + d1
    usub16  r7, r10, r8                 ; out3 = a1 - d1
    uadd16  r7, r7, r9                  ; out3 + 4
    uadd16  r6, r6, r9                  ; out0 + 4
    uadd16  r10, r14, r1                ; out1 = b1 + c1
    usub16  r1, r14, r1                 ; out2 = b1 - c1
    uadd16  r10, r10, r9                ; out1 + 4
    uadd16  r1, r1, r9                  ; out2 + 4

    ; Group A lives in the top halfwords: shift right arithmetically so the
    ; sign is kept, then pair odd and even outputs for byte interleaving.
    ldr     r11, [r2], r12              ; 4 prediction bytes for A; pred += pitch
    mov     r8, r7, asr #3              ; top = (A.out3 + 4) >> 3
    pkhtb   r9, r8, r10, asr #19        ; {A.out3 >> 3 : A.out1 >> 3}
    mov     r8, r1, asr #3              ; top = (A.out2 + 4) >> 3
    pkhtb   r8, r8, r6, asr #19         ; {A.out2 >> 3 : A.out0 >> 3}
    uxtb16  lr, r11, ror #8             ; {pred byte 3 : pred byte 1}
    qadd16  r9, r9, lr                  ; odd outputs + prediction
    uxtb16  lr, r11                     ; {pred byte 2 : pred byte 0}
    qadd16  r8, r8, lr                  ; even outputs + prediction
    usat16  r9, #8, r9                  ; clamp each lane to 0..255
    usat16  r8, #8, r8
    orr     r9, r8, r9, lsl #8          ; interleave: bytes 0..3 of row A

    ldr     r11, [r2], r12              ; 4 prediction bytes for B; pred += pitch
    ldr     lr, [sp]                    ; current dest row pointer
    ldr     r12, [sp, #44]              ; stride

    ; Group B lives in the bottom halfwords: move it to the top so the same
    ; asr #3 / pkhtb asr #19 sequence as for A applies.
    mov     r7, r7, lsl #16             ; B.out3 + 4 -> top
    mov     r1, r1, lsl #16             ; B.out2 + 4 -> top
    mov     r10, r10, lsl #16           ; B.out1 + 4 -> top
    mov     r6, r6, lsl #16             ; B.out0 + 4 -> top
    mov     r7, r7, asr #3
    pkhtb   r7, r7, r10, asr #19        ; {B.out3 >> 3 : B.out1 >> 3}
    mov     r1, r1, asr #3
    pkhtb   r1, r1, r6, asr #19         ; {B.out2 >> 3 : B.out0 >> 3}
    uxtb16  r8, r11, ror #8             ; {pred byte 3 : pred byte 1}
    qadd16  r7, r7, r8
    uxtb16  r8, r11                     ; {pred byte 2 : pred byte 0}
    qadd16  r1, r1, r8
    usat16  r7, #8, r7                  ; clamp each lane to 0..255
    usat16  r1, #8, r1
    orr     r1, r1, r7, lsl #8          ; interleave: bytes 0..3 of row B

    str     r9, [lr], r12               ; store row A; dest += stride
    str     r1, [lr], r12               ; store row B; dest += stride
    str     lr, [sp]                    ; remember dest for the next pass
    bne     vp8_dequant_idct_loop2_v6

; ---- clear the coefficient block (inline vpx_memset) ---------------------
    sub     r0, r0, #32                 ; loop 2 consumed 32 bytes: rewind
    add     sp, sp, #4                  ; drop the dest slot

    mov     r12, #0
    str     r12, [r0]
    str     r12, [r0, #4]
    str     r12, [r0, #8]
    str     r12, [r0, #12]
    str     r12, [r0, #16]
    str     r12, [r0, #20]
    str     r12, [r0, #24]
    str     r12, [r0, #28]

    ldmia   sp!, {r4 - r11, pc}         ; restore and return (saved lr -> pc)
    ENDP    ; |vp8_dequant_idct_add_v6|

; Constant Pool
; Loaded by label (PC-relative) from vp8_dequant_idct_add_v6.  The two
; transform factors are used as the 32-bit operand of smulwb/smulwt, which
; keep (factor * halfword) >> 16, so they act as fractions of 65536.
cospi8sqrt2minus1 DCD 0x00004E7B        ; 20091 (about 0.3066)
sinpi8sqrt2       DCD 0x00008A8C        ; 35468 (about 0.5412)
c0x00040004       DCD 0x00040004        ; 4 in each halfword: rounding term
                                        ; added before the >> 3

    END