; ed9c66f584
;
; Instead of using the predict buffer, the decoder now writes the predictor
; into the recon buffer. For blocks with eob=0, unnecessary idcts can be
; eliminated. This gave a performance boost of ~1.8% for the HD clips used.
;
; Tero: Added needed changes to ARM side and scheduled some assembly code to
; prevent interlocks.
;
; Patch Set 6: Merged (I1bcdca7a95aacc3a181b9faa6b10e3a71ee24df3) into this
; commit because of similarities in the idct functions.
;
; Patch Set 7: EC bug fix.
;
; Change-Id: Ie31d90b5d3522e1108163f2ac491e455e3f955e6
;
; 191 lines, 4.6 KiB, NASM
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;

EXPORT |vp8_dequant_idct_add_v6|
|
|
|
|
AREA |.text|, CODE, READONLY
|
|
;void vp8_dequant_idct_add_v6(short *input, short *dq,
;                             unsigned char *dest, int stride)
;
; For one 4x4 block:
;   1. multiplies the 16 coefficients at input by the 16 factors at dq,
;      storing the low 16 bits of each product back into input;
;   2. runs a two-pass 4x4 inverse DCT over input (in place);
;   3. adds (idct + 4) >> 3 to the bytes already present in dest, saturates
;      each sum to 0..255 and writes it back to dest;
;   4. zeroes the 32 bytes at input.
;
; r0 = q      (input: 16 shorts; read, used as scratch, left zeroed)
; r1 = dq     (16 shorts; read only)
; r2 = dst    (dest: 4 rows of 4 bytes, each row accessed as one word,
;              consecutive rows stride bytes apart)
; r3 = stride
;
; r4-r11 are saved on entry and restored on exit; the saved lr is popped
; straight into pc. r1-r3, r12 and the flags are clobbered; r0 holds its
; entry value again on return.
;
; NOTE(review): input, dq and dst are all accessed with word loads/stores;
; presumably callers guarantee suitable alignment - confirm.
; NOTE(review): the instruction order looks hand-scheduled (independent
; multiplies, packs and loads are interleaved); keep the order when editing.

|vp8_dequant_idct_add_v6| PROC
    stmdb   sp!, {r4-r11, lr}

    ldr     r4, [r0]                ;input
    ldr     r5, [r1], #4            ;dq

    ; Spill stride: r3 is reused for an IDCT constant below.
    ; 9 saved registers + this word = 40 bytes of stack in use.
    sub     sp, sp, #4
    str     r3, [sp]

    mov     r12, #4                 ; 4 iterations x 4 coefficients = 16

    ; Dequantize in place. r4/r5 each hold two packed shorts (input/dq).
    ; The next pair is loaded before the current products are stored, so
    ; [r0, #4] always refers to the pair after the one being written.
vp8_dequant_add_loop
    smulbb  r6, r4, r5              ; low  half x low  half
    smultt  r7, r4, r5              ; high half x high half

    ldr     r4, [r0, #4]            ;input
    ldr     r5, [r1], #4            ;dq

    strh    r6, [r0], #2            ; keep low 16 bits of each product
    strh    r7, [r0], #2

    smulbb  r6, r4, r5
    smultt  r7, r4, r5

    subs    r12, r12, #1

    ; Skipped on the last pass so nothing is read beyond the 16th element.
    ldrne   r4, [r0, #4]
    ldrne   r5, [r1], #4

    strh    r6, [r0], #2
    strh    r7, [r0], #2

    bne     vp8_dequant_add_loop

    sub     r0, r0, #32             ; rewind to the first coefficient
    mov     r1, r0                  ; pass 1 writes its output in place

; short_idct4x4llm_v6_dual
    ; r3/r4 are the 32-bit operand of SMULWx, which returns
    ; (constant * halfword) >> 16.
    ldr     r3, cospi8sqrt2minus1
    ldr     r4, sinpi8sqrt2
    ldr     r6, [r0, #8]            ; row 1 of the first two columns
    mov     r5, #2                  ; 2 iterations x 2 columns

    ; Pass 1: combines the four rows (byte offsets 0, 8, 16, 24) of two
    ; adjacent columns at a time; each register holds the two columns as
    ; packed halfwords. With x0..x3 the four row values of a column:
    ;   a1 = x0 + x2                    b1 = x0 - x2
    ;   c1 = sin*x1 - (x3 + cos*x3)     d1 = (x1 + cos*x1) + sin*x3
    ; where cos*x = (x * cospi8sqrt2minus1) >> 16 and
    ;       sin*x = (x * sinpi8sqrt2) >> 16.
vp8_dequant_idct_loop1_v6
    ldr     r12, [r0, #24]          ; x3 (row 3)
    ldr     r14, [r0, #16]          ; x2 (row 2)
    smulwt  r9, r3, r6
    smulwb  r7, r3, r6
    smulwt  r10, r4, r6
    smulwb  r8, r4, r6
    pkhbt   r7, r7, r9, lsl #16     ; cos*x1, both columns
    smulwt  r11, r3, r12
    pkhbt   r8, r8, r10, lsl #16    ; sin*x1, both columns
    uadd16  r6, r6, r7              ; x1 + cos*x1
    smulwt  r7, r4, r12
    smulwb  r9, r3, r12
    smulwb  r10, r4, r12
    subs    r5, r5, #1              ; nothing below changes N/Z/C/V before bne
    pkhbt   r9, r9, r11, lsl #16    ; cos*x3, both columns
    ldr     r11, [r0], #4           ; x0 (row 0); advance to next column pair
    pkhbt   r10, r10, r7, lsl #16   ; sin*x3, both columns
    uadd16  r7, r12, r9             ; x3 + cos*x3
    usub16  r7, r8, r7              ; c1
    uadd16  r6, r6, r10             ; d1
    uadd16  r10, r11, r14           ; a1
    usub16  r8, r11, r14            ; b1
    uadd16  r9, r10, r6             ; a1 + d1 -> row 0
    usub16  r10, r10, r6            ; a1 - d1 -> row 3
    uadd16  r6, r8, r7              ; b1 + c1 -> row 1
    usub16  r7, r8, r7              ; b1 - c1 -> row 2
    str     r6, [r1, #8]
    ldrne   r6, [r0, #8]            ; row 1 of the next column pair
    str     r7, [r1, #16]
    str     r10, [r1, #24]
    str     r9, [r1], #4
    bne     vp8_dequant_idct_loop1_v6

    mov     r5, #2                  ; 2 iterations x 2 rows
    sub     r0, r1, #8              ; r1 advanced 8 bytes; back to the start

    ; Pass 2: combines the four values within a row, two rows at a time.
    ; r6/r7 hold the first row read, r8/r9 the second. The values are
    ; repacked so that the high halfword of each working register belongs
    ; to the first row and the low halfword to the second. The same
    ; a1/b1/c1/d1 combination as in pass 1 is applied with x0..x3 now being
    ; the four values of a row. lr and r14 name the same register.
vp8_dequant_idct_loop2_v6
    ldr     r6, [r0], #4
    ldr     r7, [r0], #4
    ldr     r8, [r0], #4
    ldr     r9, [r0], #4
    smulwt  r1, r3, r6
    smulwt  r12, r4, r6
    smulwt  lr, r3, r8
    smulwt  r10, r4, r8
    pkhbt   r11, r8, r6, lsl #16    ; x0 of both rows
    pkhbt   r1, lr, r1, lsl #16     ; cos*x1
    pkhbt   r12, r10, r12, lsl #16  ; sin*x1
    pkhtb   r6, r6, r8, asr #16     ; x1 of both rows
    uadd16  r6, r1, r6              ; x1 + cos*x1
    pkhbt   lr, r9, r7, lsl #16     ; x2 of both rows
    uadd16  r10, r11, lr            ; a1
    usub16  lr, r11, lr             ; b1
    pkhtb   r8, r7, r9, asr #16     ; x3 of both rows
    subs    r5, r5, #1              ; nothing below changes N/Z/C/V before bne
    smulwt  r1, r3, r8
    smulwb  r7, r3, r8
    smulwt  r11, r4, r8
    smulwb  r9, r4, r8
    pkhbt   r1, r7, r1, lsl #16     ; cos*x3
    uadd16  r8, r1, r8              ; x3 + cos*x3
    pkhbt   r11, r9, r11, lsl #16   ; sin*x3
    usub16  r1, r12, r8             ; c1
    uadd16  r8, r11, r6             ; d1
    ldr     r9, c0x00040004         ; +4 in each halfword (rounding for >> 3)
    ldr     r12, [sp]               ; get stride from stack
    uadd16  r6, r10, r8             ; a1 + d1 -> pixel 0
    usub16  r7, r10, r8             ; a1 - d1 -> pixel 3
    uadd16  r7, r7, r9
    uadd16  r6, r6, r9
    uadd16  r10, r14, r1            ; b1 + c1 -> pixel 1
    usub16  r1, r14, r1             ; b1 - c1 -> pixel 2
    uadd16  r10, r10, r9
    uadd16  r1, r1, r9

    ; First row of the pair: taken from the high halfwords.
    ldr     r11, [r2]               ; load input from dst
    mov     r8, r7, asr #3
    pkhtb   r9, r8, r10, asr #19    ; r9 = pixel 3 : pixel 1, each >> 3
    mov     r8, r1, asr #3
    pkhtb   r8, r8, r6, asr #19     ; r8 = pixel 2 : pixel 0, each >> 3
    uxtb16  lr, r11, ror #8         ; dst bytes 3 and 1
    qadd16  r9, r9, lr
    uxtb16  lr, r11                 ; dst bytes 2 and 0
    qadd16  r8, r8, lr
    usat16  r9, #8, r9              ; saturate each sum to 0..255
    usat16  r8, #8, r8
    orr     r9, r8, r9, lsl #8      ; re-interleave into 4 bytes

    ; Second row of the pair: move the low halfwords up, then repeat.
    ldr     r11, [r2, r12]          ; load input from dst
    mov     r7, r7, lsl #16
    mov     r1, r1, lsl #16
    mov     r10, r10, lsl #16
    mov     r6, r6, lsl #16
    mov     r7, r7, asr #3
    pkhtb   r7, r7, r10, asr #19
    mov     r1, r1, asr #3
    pkhtb   r1, r1, r6, asr #19
    uxtb16  r8, r11, ror #8
    qadd16  r7, r7, r8
    uxtb16  r8, r11
    qadd16  r1, r1, r8
    usat16  r7, #8, r7
    usat16  r1, #8, r1
    orr     r1, r1, r7, lsl #8
    str     r9, [r2], r12           ; store output to dst
    str     r1, [r2], r12           ; store output to dst
    bne     vp8_dequant_idct_loop2_v6

; vpx_memset
    sub     r0, r0, #32             ; pass 2 advanced r0 by 32 bytes
    add     sp, sp, #4              ; release the stride slot

    ; Zero the 16 coefficients (8 words) at input.
    mov     r12, #0
    str     r12, [r0]
    str     r12, [r0, #4]
    str     r12, [r0, #8]
    str     r12, [r0, #12]
    str     r12, [r0, #16]
    str     r12, [r0, #20]
    str     r12, [r0, #24]
    str     r12, [r0, #28]

    ldmia   sp!, {r4 - r11, pc}
    ENDP    ; |vp8_dequant_idct_add_v6|

; Constant Pool
; Loaded PC-relative by vp8_dequant_idct_add_v6.
; 0x4E7B = 20091 ~ (sqrt(2)*cos(pi/8) - 1) * 65536
; 0x8A8C = 35468 ~  sqrt(2)*sin(pi/8)      * 65536
; 0x00040004 adds 4 to both packed halfwords (rounding before the >> 3).
cospi8sqrt2minus1 DCD 0x00004E7B
sinpi8sqrt2 DCD 0x00008A8C
c0x00040004 DCD 0x00040004

    END