vpx/vp8/common/arm/armv6/idct_v6.asm
Scott LaVarnway ed9c66f584 Remove usage of predict buffer for decode
Instead of using the predict buffer, the decoder now writes
the predictor into the recon buffer.  For blocks with eob=0,
unnecessary idcts can be eliminated.  This gave a performance
boost of ~1.8% for the HD clips used.

Tero: Added needed changes to ARM side and scheduled some
      assembly code to prevent interlocks.

Patch Set 6:  Merged (I1bcdca7a95aacc3a181b9faa6b10e3a71ee24df3)
into this commit because of similarities in the idct
functions.
Patch Set 7: EC bug fix.

Change-Id: Ie31d90b5d3522e1108163f2ac491e455e3f955e6
2011-10-18 12:06:50 -04:00

203 lines
6.7 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_v6_dual|
AREA |.text|, CODE, READONLY
;-----------------------------------------------------------------------
; void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred,
;                                   int pitch, unsigned char *dst,
;                                   int stride)
;
; Two-pass 4x4 inverse transform on 16-bit coefficients that works on two
; 16-bit lanes per register (hence "dual"). Each result is rounded as
; (x + 4) >> 3, added to the matching predictor byte, saturated to 8 bits
; and stored to dst, four bytes per row.
;
; In (at entry):
; r0 short* input           16 coefficients; OVERWRITTEN with the
;                           first-pass intermediate results
; r1 unsigned char* pred
; r2 int pitch              byte step between pred rows
; r3 unsigned char* dst
; sp int stride             byte step between dst rows (5th argument)
;
; Register roles inside the routine:
; r4        0x00008A8C = sinpi8sqrt2
; r5        bottom halfword 0x4E7B = cospi8sqrt2minus1,
;           bit 31 = two-pass loop counter
; r6-r12,lr scratch
; [sp]      (after the prologue) saved input pointer for the second pass
;
; r4-r11 are saved and restored; the routine returns by loading pc from
; the saved lr.
;
; Notation: "x | y" in a comment means x is in the top halfword and y is
; in the bottom halfword of the register.
;-----------------------------------------------------------------------
|vp8_short_idct4x4llm_v6_dual| PROC
stmdb sp!, {r4-r11, lr}
sub sp, sp, #4 ; 4-byte slot, later holds the input pointer
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
mov r5, #0x00004E00 ; cos
orr r5, r5, #0x0000007B ; cospi8sqrt2minus1
; Bit 31 of r5 is a two-pass loop counter. The first
; "subs r5, r5, #1<<31" clears it without a borrow (C = 1, loop again);
; the second one borrows (C = 0, fall through) and leaves bit 31 set
; again, which re-arms the counter for loop2_dual. The smulb* multiplies
; read only the bottom halfword of r5, so the counter bit does not
; affect the products.
orr r5, r5, #1<<31 ; loop counter on top bit
; ---- Pass 1 -------------------------------------------------------
; Each iteration transforms two adjacent columns at once: one word
; holds the same row position of both columns. The element indices in
; the comments are those of the first iteration; the second iteration
; works on the next column pair (r0 advanced by 4 bytes).
loop1_dual
ldr r6, [r0, #(4*2)] ; i5 | i4
ldr r12, [r0, #(12*2)] ; i13|i12
ldr r14, [r0, #(8*2)] ; i9 | i8
; The smulb* products are full 32-bit values; the ">> 16" in the
; comments is applied by the pkhtb ..., asr #16 that follows.
smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
pkhtb r7, r9, r7, asr #16 ; 5c | 4c
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
; Sets C for the bcs at the end of the loop. Nothing between here and
; that branch is a flag-setting (S-suffixed) instruction, so C survives.
subs r5, r5, #1<<31 ; i--
pkhtb r9, r11, r9, asr #16 ; 13c | 12c
ldr r11, [r0] ; i1 | i0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c = (5s|4s) - (13c+13|12c+12)
uadd16 r6, r6, r10 ; d = (5c+5|4c+4) + (13s|12s)
uadd16 r10, r11, r14 ; a = (i1|i0) + (i9|i8)
usub16 r8, r11, r14 ; b = (i1|i0) - (i9|i8)
uadd16 r9, r10, r6 ; a+d
usub16 r10, r10, r6 ; a-d
uadd16 r6, r8, r7 ; b+c
usub16 r7, r8, r7 ; b-c
; use input buffer to store intermediate results
str r6, [r0, #(4*2)] ; o5 | o4
str r7, [r0, #(8*2)] ; o9 | o8
str r10,[r0, #(12*2)] ; o13|o12
str r9, [r0], #4 ; o1 | o0, then step to the next column pair
bcs loop1_dual
sub r0, r0, #8 ; reset input/output (undo the two 4-byte steps)
str r0, [sp] ; keep the pointer: r0 is reused as scratch below
; ---- Pass 2 -------------------------------------------------------
; Each iteration transforms two rows at once (first row in the top
; halfwords, second row in the bottom halfwords) and reconstructs two
; rows of four pixels. The element indices in the comments are those of
; the first iteration; the second iteration starts 16 bytes further on.
loop2_dual
ldr r6, [r0, #(4*2)] ; i5 | i4
ldr r12,[r0, #(2*2)] ; i3 | i2
ldr r14,[r0, #(6*2)] ; i7 | i6
ldr r0, [r0, #(0*2)] ; i1 | i0 (overwrites the pointer in r0)
smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
pkhbt r11, r6, r0, lsl #16 ; i0 | i4
pkhtb r7, r7, r9, asr #16 ; 1c | 5c
pkhtb r0, r0, r6, asr #16 ; i1 | i5
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1
uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2
pkhbt r9, r14, r12, lsl #16 ; i2 | i6
uadd16 r10, r11, r9 ; a = (i0|i4) + (i2|i6)
usub16 r9, r11, r9 ; b = (i0|i4) - (i2|i6)
pkhtb r6, r12, r14, asr #16 ; i3 | i7
; Sets C for the bcs at the end of the loop; no flag-setting
; (S-suffixed) instruction follows before that branch.
subs r5, r5, #1<<31 ; i--
smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
pkhtb r7, r7, r12, asr #16 ; 3c | 7c
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2
usub16 r12, r8, r6 ; c = (1s|5s) - (3c+3|7c+7)
uadd16 r6, r11, r0 ; d = (3s|7s) + (1c+1|5c+5)
uadd16 r7, r10, r6 ; a+d
mov r8, #4 ; set up 4's
orr r8, r8, #0x40000 ; 4|4 (rounding term for the >> 3 below)
usub16 r6, r10, r6 ; a-d
uadd16 r6, r6, r8 ; a-d+4, 3|7
uadd16 r7, r7, r8 ; a+d+4, 0|4
uadd16 r10, r9, r12 ; b+c
usub16 r0, r9, r12 ; b-c
uadd16 r10, r10, r8 ; b+c+4, 1|5
uadd16 r8, r0, r8 ; b-c+4, 2|6
; stride is the 5th argument: it sits above the 9 saved registers
; (36 bytes) plus the 4-byte slot, i.e. at sp + 40.
ldr lr, [sp, #40] ; dst stride
; First row of the pair: its values are in the top halfwords, so
; "asr #19" is (top halfword) >> 3 with sign extension.
ldrb r0, [r1] ; pred p0
ldrb r11, [r1, #1] ; pred p1
ldrb r12, [r1, #2] ; pred p2
add r0, r0, r7, asr #19 ; p0 + o0
add r11, r11, r10, asr #19 ; p1 + o1
add r12, r12, r8, asr #19 ; p2 + o2
usat r0, #8, r0 ; d0 = clip8(p0 + o0)
usat r11, #8, r11 ; d1 = clip8(p1 + o1)
usat r12, #8, r12 ; d2 = clip8(p2 + o2)
; The bytes are packed with add rather than orr; the fields cannot
; overlap because usat limited every value to 0..255.
add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
ldrb r11, [r1, #3] ; pred p3
add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
add r11, r11, r6, asr #19 ; p3 + o3
; Second row of the pair: sign-extend the bottom halfwords so the
; "asr #3" below operates on the 16-bit values alone.
sxth r7, r7 ; a+d+4 of the second row (o4 term)
sxth r10, r10 ; b+c+4 of the second row (o5 term)
usat r11, #8, r11 ; d3 = clip8(p3 + o3)
sxth r8, r8 ; b-c+4 of the second row (o6 term)
sxth r6, r6 ; a-d+4 of the second row (o7 term)
add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
ldrb r12, [r1, r2]! ; pred p4 (writeback: r1 += pitch)
; NOTE(review): dst rows are written with 32-bit word stores; whether
; dst must be 4-byte aligned depends on the target configuration --
; confirm against the callers.
str r0, [r3], lr ; store the first row, then dst += stride
ldrb r11, [r1, #1] ; pred p5
add r12, r12, r7, asr #3 ; p4 + o4
add r11, r11, r10, asr #3 ; p5 + o5
usat r12, #8, r12 ; d4 = clip8(p4 + o4)
usat r11, #8, r11 ; d5 = clip8(p5 + o5)
ldrb r7, [r1, #2] ; pred p6
ldrb r10, [r1, #3] ; pred p7
add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
add r7, r7, r8, asr #3 ; p6 + o6
add r10, r10, r6, asr #3 ; p7 + o7
; [sp] always holds the original input pointer, so r0 becomes
; input + 16 bytes on both iterations; only the value produced by the
; first iteration is consumed (by the loads at the top of loop2_dual).
ldr r0, [sp] ; load input pointer
usat r7, #8, r7 ; d6 = clip8(p6 + o6)
usat r10, #8, r10 ; d7 = clip8(p7 + o7)
add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
str r12, [r3], lr ; store the second row, then dst += stride
add r0, r0, #16 ; step to the next pair of rows (8 shorts)
add r1, r1, r2 ; pred + pitch
bcs loop2_dual
add sp, sp, #4 ; release the slot that held the input pointer
ldmia sp!, {r4 - r11, pc} ; restore r4-r11 and return (saved lr -> pc)
ENDP
END