Merge "fix armv6 simpleloop filter"
This commit is contained in:
commit
a522be2941
@ -63,23 +63,22 @@ pstep RN r1
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
sub src, src, pstep, lsl #1 ; move src pointer down by 2 lines
|
||||
ldr r12, [r3] ; limit
|
||||
ldr r3, [src, -pstep, lsl #1] ; p1
|
||||
|
||||
ldr r12, [r3], #4 ; limit
|
||||
ldr r3, [src], pstep ; p1
|
||||
ldr r9, [sp, #40] ; count for 8-in-parallel
|
||||
ldr r4, [src, -pstep] ; p0
|
||||
|
||||
ldr r9, [sp, #36] ; count for 8-in-parallel
|
||||
ldr r4, [src], pstep ; p0
|
||||
|
||||
ldr r7, [r2], #4 ; flimit
|
||||
ldr r5, [src], pstep ; q0
|
||||
ldr r7, [r2] ; flimit
|
||||
ldr r5, [src] ; q0
|
||||
ldr r2, c0x80808080
|
||||
|
||||
ldr r6, [src] ; q1
|
||||
ldr r6, [src, pstep] ; q1
|
||||
|
||||
uadd8 r7, r7, r7 ; flimit * 2
|
||||
mov r9, r9, lsl #1 ; 4-in-parallel
|
||||
mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
|
||||
uadd8 r12, r7, r12 ; flimit * 2 + limit
|
||||
mov lr, #0
|
||||
|
||||
|simple_hnext8|
|
||||
; vp8_simple_filter_mask() function
|
||||
@ -89,22 +88,19 @@ pstep RN r1
|
||||
uqsub8 r10, r4, r5 ; p0 - q0
|
||||
uqsub8 r11, r5, r4 ; q0 - p0
|
||||
orr r8, r8, r7 ; abs(p1 - q1)
|
||||
ldr lr, c0x7F7F7F7F ; 01111111 mask
|
||||
orr r10, r10, r11 ; abs(p0 - q0)
|
||||
and r8, lr, r8, lsr #1 ; abs(p1 - q1) / 2
|
||||
uhadd8 r8, r8, lr ; abs(p1 - q1) >> 1
|
||||
uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
|
||||
mvn lr, #0 ; lr == -1
|
||||
; STALL waiting on r10
|
||||
uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
|
||||
; STALL waiting on r10 :(
|
||||
uqsub8 r10, r10, r12 ; compare to flimit
|
||||
mov r8, #0
|
||||
|
||||
usub8 r10, r8, r10 ; use usub8 instead of ssub8
|
||||
; STALL (maybe?) when are flags set? :/
|
||||
sel r10, lr, r8 ; filter mask: lr
|
||||
|
||||
; STALL waiting on r10
|
||||
mvn r8, #0
|
||||
uqsub8 r10, r10, r12 ; compare to flimit. need to do this twice because uqsub8 doesn't set GE flags
|
||||
; and usub8 doesn't saturate
|
||||
usub8 r10, lr, r10 ; set GE flags for each byte
|
||||
sel r10, r8, lr ; filter mask: F or 0
|
||||
cmp r10, #0
|
||||
beq simple_hskip_filter ; skip filtering
|
||||
beq simple_hskip_filter ; skip filtering if we're &ing with 0s. would just write out the same values
|
||||
|
||||
;vp8_simple_filter() function
|
||||
|
||||
@ -113,55 +109,45 @@ pstep RN r1
|
||||
eor r4, r4, r2 ; p0 offset to convert to a signed value
|
||||
eor r5, r5, r2 ; q0 offset to convert to a signed value
|
||||
|
||||
qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
|
||||
qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
|
||||
qsub8 r3, r3, r6 ; vp8_signed_char_clamp(p1-q1)
|
||||
qsub8 r6, r5, r4 ; vp8_signed_char_clamp(q0-p0)
|
||||
qadd8 r3, r3, r6 ; += q0-p0
|
||||
qadd8 r3, r3, r6 ; += q0-p0
|
||||
qadd8 r3, r3, r6 ; p1-q1 + 3*(q0-p0))
|
||||
and r3, r3, r10 ; &= mask
|
||||
|
||||
qadd8 r3, r3, r6
|
||||
ldr r8, c0x03030303 ; r8 = 3
|
||||
|
||||
qadd8 r3, r3, r6
|
||||
ldr r7, c0x04040404
|
||||
|
||||
qadd8 r3, r3, r6
|
||||
and r3, r3, lr ; vp8_filter &= mask;
|
||||
ldr r8, c0x03030303
|
||||
|
||||
;save bottom 3 bits so that we round one side +4 and the other +3
|
||||
qadd8 r7 , r3 , r7 ; Filter1 (r7) = vp8_signed_char_clamp(vp8_filter+4)
|
||||
qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
|
||||
qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
|
||||
|
||||
mov r7, #0
|
||||
shadd8 r8 , r8 , r7 ; Filter2 >>= 3
|
||||
shadd8 r3 , r3 , r7 ; Filter1 >>= 3
|
||||
shadd8 r8 , r8 , r7
|
||||
shadd8 r3 , r3 , r7
|
||||
shadd8 r8 , r8 , r7 ; r8: Filter2
|
||||
shadd8 r3 , r3 , r7 ; r3: Filter1
|
||||
mov r3, #0
|
||||
shadd8 r7 , r7 , r3
|
||||
shadd8 r8 , r8 , r3
|
||||
shadd8 r7 , r7 , r3
|
||||
shadd8 r8 , r8 , r3
|
||||
shadd8 r7 , r7 , r3 ; Filter1 >>= 3
|
||||
shadd8 r8 , r8 , r3 ; Filter2 >>= 3
|
||||
|
||||
;calculate output
|
||||
sub src, src, pstep, lsl #1
|
||||
|
||||
qsub8 r5 ,r5, r7 ; u = vp8_signed_char_clamp(q0 - Filter1)
|
||||
qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
|
||||
qsub8 r5 ,r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1)
|
||||
eor r4, r4, r2 ; *op0 = u^0x80
|
||||
str r4, [src], pstep ; store op0 result
|
||||
eor r5, r5, r2 ; *oq0 = u^0x80
|
||||
str r5, [src], pstep ; store oq0 result
|
||||
str r5, [src] ; store oq0 result
|
||||
eor r4, r4, r2 ; *op0 = u^0x80
|
||||
str r4, [src, -pstep] ; store op0 result
|
||||
|
||||
|simple_hskip_filter|
|
||||
add src, src, #4
|
||||
sub src, src, pstep
|
||||
sub src, src, pstep, lsl #1
|
||||
|
||||
subs r9, r9, #1
|
||||
addne src, src, #4 ; next row
|
||||
|
||||
;pld [src]
|
||||
;pld [src, pstep]
|
||||
;pld [src, pstep, lsl #1]
|
||||
|
||||
ldrne r3, [src], pstep ; p1
|
||||
ldrne r4, [src], pstep ; p0
|
||||
ldrne r5, [src], pstep ; q0
|
||||
ldrne r6, [src] ; q1
|
||||
ldrne r3, [src, -pstep, lsl #1] ; p1
|
||||
ldrne r4, [src, -pstep] ; p0
|
||||
ldrne r5, [src] ; q0
|
||||
ldrne r6, [src, pstep] ; q1
|
||||
|
||||
bne simple_hnext8
|
||||
|
||||
@ -174,9 +160,9 @@ pstep RN r1
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r12, [r2], #4 ; r12: flimit
|
||||
ldr r12, [r2] ; r12: flimit
|
||||
ldr r2, c0x80808080
|
||||
ldr r7, [r3], #4 ; limit
|
||||
ldr r7, [r3] ; limit
|
||||
|
||||
; load source data to r7, r8, r9, r10
|
||||
ldrh r3, [src, #-2]
|
||||
@ -213,16 +199,15 @@ pstep RN r1
|
||||
uqsub8 r10, r5, r4 ; q0 - p0
|
||||
orr r7, r7, r8 ; abs(p1 - q1)
|
||||
orr r9, r9, r10 ; abs(p0 - q0)
|
||||
ldr lr, c0x7F7F7F7F ; 0111 1111 mask
|
||||
uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
|
||||
and r7, lr, r7, lsr #1 ; abs(p1 - q1) / 2
|
||||
mov r8, #0
|
||||
uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
|
||||
uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
|
||||
uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
|
||||
mvn r10, #0 ; r10 == -1
|
||||
uqsub8 r7, r7, r12 ; compare to flimit
|
||||
|
||||
usub8 r7, r8, r7
|
||||
sel r7, r10, r8 ; filter mask: lr
|
||||
sel lr, r10, r8 ; filter mask
|
||||
|
||||
cmp lr, #0
|
||||
beq simple_vskip_filter ; skip filtering
|
||||
@ -286,10 +271,6 @@ pstep RN r1
|
||||
|simple_vskip_filter|
|
||||
subs r11, r11, #1
|
||||
|
||||
;pld [src]
|
||||
;pld [src, pstep]
|
||||
;pld [src, pstep, lsl #1]
|
||||
|
||||
; load source data to r7, r8, r9, r10
|
||||
ldrneh r3, [src, #-2]
|
||||
ldrneh r4, [src], pstep
|
||||
@ -316,7 +297,5 @@ pstep RN r1
|
||||
c0x80808080 DCD 0x80808080
|
||||
c0x03030303 DCD 0x03030303
|
||||
c0x04040404 DCD 0x04040404
|
||||
c0x01010101 DCD 0x01010101
|
||||
c0x7F7F7F7F DCD 0x7F7F7F7F
|
||||
|
||||
END
|
||||
|
Loading…
Reference in New Issue
Block a user