cleanup simple loop filter

move some things around, reorder some instructions

constant 0 is used several times. load it once per call in horiz,
once per loop in vert.

separate saturating instructions to avoid stalls.

just use one usub8 call to set GE flags, rather than uqsub8 followed by
usub8 w/ 0

document some stalls for further consideration

Change-Id: Ic3877e0ddbe314bb8a17fd5db73501a7d64570ec
This commit is contained in:
Johann 2010-08-19 13:37:40 -04:00
parent a522be2941
commit 52852da7c9

View File

@ -55,8 +55,8 @@ pstep RN r1
;stack const char *thresh,
;stack int count
;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
; All 16 elements in flimit are equal. So, in the code, only one load is needed
; for flimit. Same applies to limit. thresh is not used in simple looopfilter
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
@ -65,23 +65,19 @@ pstep RN r1
ldr r12, [r3] ; limit
ldr r3, [src, -pstep, lsl #1] ; p1
ldr r9, [sp, #40] ; count for 8-in-parallel
ldr r4, [src, -pstep] ; p0
ldr r7, [r2] ; flimit
ldr r5, [src] ; q0
ldr r2, c0x80808080
ldr r6, [src, pstep] ; q1
ldr r7, [r2] ; flimit
ldr r2, c0x80808080
ldr r9, [sp, #40] ; count for 8-in-parallel
uadd8 r7, r7, r7 ; flimit * 2
mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
uadd8 r12, r7, r12 ; flimit * 2 + limit
mov lr, #0
mov lr, #0 ; need 0 in a couple places
|simple_hnext8|
; vp8_simple_filter_mask() function
; vp8_simple_filter_mask()
uqsub8 r7, r3, r6 ; p1 - q1
uqsub8 r8, r6, r3 ; q1 - p1
@ -89,58 +85,50 @@ pstep RN r1
uqsub8 r11, r5, r4 ; q0 - p0
orr r8, r8, r7 ; abs(p1 - q1)
orr r10, r10, r11 ; abs(p0 - q0)
uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1
uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
; STALL waiting on r10
uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1
uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
; STALL waiting on r10
mvn r8, #0
uqsub8 r10, r10, r12 ; compare to flimit. need to do this twice because uqsub8 doesn't set GE flags
; and usub8 doesn't saturate
usub8 r10, lr, r10 ; set GE flags for each byte
usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags
sel r10, r8, lr ; filter mask: F or 0
cmp r10, #0
beq simple_hskip_filter ; skip filtering if we're &ing with 0s. would just write out the same values
beq simple_hskip_filter ; skip filtering if all masks are 0x00
;vp8_simple_filter() function
;vp8_simple_filter()
eor r3, r3, r2 ; p1 offset to convert to a signed value
eor r6, r6, r2 ; q1 offset to convert to a signed value
eor r4, r4, r2 ; p0 offset to convert to a signed value
eor r5, r5, r2 ; q0 offset to convert to a signed value
qsub8 r3, r3, r6 ; vp8_signed_char_clamp(p1-q1)
qsub8 r6, r5, r4 ; vp8_signed_char_clamp(q0-p0)
qadd8 r3, r3, r6 ; += q0-p0
qadd8 r3, r3, r6 ; += q0-p0
qadd8 r3, r3, r6 ; p1-q1 + 3*(q0-p0))
and r3, r3, r10 ; &= mask
qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
qsub8 r6, r5, r4 ; q0 - p0
qadd8 r3, r3, r6 ; += q0 - p0
ldr r7, c0x04040404
qadd8 r3, r3, r6 ; += q0 - p0
ldr r8, c0x03030303
qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
;STALL
and r3, r3, r10 ; vp8_filter &= mask
;save bottom 3 bits so that we round one side +4 and the other +3
qadd8 r7 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4
qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3
mov r3, #0
shadd8 r7 , r7 , r3
shadd8 r8 , r8 , r3
shadd8 r7 , r7 , r3
shadd8 r8 , r8 , r3
shadd8 r7 , r7 , r3 ; Filter1 >>= 3
shadd8 r8 , r8 , r3 ; Filter2 >>= 3
shadd8 r7 , r7 , lr
shadd8 r8 , r8 , lr
shadd8 r7 , r7 , lr
shadd8 r8 , r8 , lr
shadd8 r7 , r7 , lr ; Filter1 >>= 3
shadd8 r8 , r8 , lr ; Filter2 >>= 3
qsub8 r5 ,r5, r7 ; u = vp8_signed_char_clamp(q0 - Filter1)
qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
qsub8 r5 ,r5, r7 ; u = q0 - Filter1
qadd8 r4, r4, r8 ; u = p0 + Filter2
eor r5, r5, r2 ; *oq0 = u^0x80
str r5, [src] ; store oq0 result
eor r4, r4, r2 ; *op0 = u^0x80
str r4, [src, -pstep] ; store op0 result
|simple_hskip_filter|
subs r9, r9, #1
addne src, src, #4 ; next row
@ -204,9 +192,8 @@ pstep RN r1
uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
mvn r10, #0 ; r10 == -1
uqsub8 r7, r7, r12 ; compare to flimit
usub8 r7, r8, r7
usub8 r7, r12, r7 ; compare to flimit
sel lr, r10, r8 ; filter mask
cmp lr, #0
@ -218,35 +205,34 @@ pstep RN r1
eor r4, r4, r2 ; p0 offset to convert to a signed value
eor r5, r5, r2 ; q0 offset to convert to a signed value
qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
qsub8 r6, r5, r4 ; q0 - p0
qadd8 r3, r3, r6
ldr r8, c0x03030303 ; r8 = 3
qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
ldr r9, c0x03030303 ; r9 = 3
qadd8 r3, r3, r6
qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
ldr r7, c0x04040404
qadd8 r3, r3, r6
qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
;STALL
and r3, r3, lr ; vp8_filter &= mask
;save bottom 3 bits so that we round one side +4 and the other +3
qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3
qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4
mov r7, #0
shadd8 r8 , r8 , r7 ; Filter2 >>= 3
shadd8 r3 , r3 , r7 ; Filter1 >>= 3
shadd8 r8 , r8 , r7
shadd8 r3 , r3 , r7
shadd8 r8 , r8 , r7 ; r8: filter2
shadd8 r3 , r3 , r7 ; r7: filter1
shadd8 r9 , r9 , r8
shadd8 r3 , r3 , r8
shadd8 r9 , r9 , r8
shadd8 r3 , r3 , r8
shadd8 r9 , r9 , r8 ; Filter2 >>= 3
shadd8 r3 , r3 , r8 ; Filter1 >>= 3
;calculate output
sub src, src, pstep, lsl #2
qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
qsub8 r5, r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1)
qadd8 r4, r4, r9 ; u = p0 + Filter2
qsub8 r5, r5, r3 ; u = q0 - Filter1
eor r4, r4, r2 ; *op0 = u^0x80
eor r5, r5, r2 ; *oq0 = u^0x80