diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm index 2f090dbdd..011808430 100644 --- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -63,23 +63,22 @@ pstep RN r1 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - sub src, src, pstep, lsl #1 ; move src pointer down by 2 lines + ldr r12, [r3] ; limit + ldr r3, [src, -pstep, lsl #1] ; p1 - ldr r12, [r3], #4 ; limit - ldr r3, [src], pstep ; p1 + ldr r9, [sp, #40] ; count for 8-in-parallel + ldr r4, [src, -pstep] ; p0 - ldr r9, [sp, #36] ; count for 8-in-parallel - ldr r4, [src], pstep ; p0 - - ldr r7, [r2], #4 ; flimit - ldr r5, [src], pstep ; q0 + ldr r7, [r2] ; flimit + ldr r5, [src] ; q0 ldr r2, c0x80808080 - ldr r6, [src] ; q1 + ldr r6, [src, pstep] ; q1 uadd8 r7, r7, r7 ; flimit * 2 - mov r9, r9, lsl #1 ; 4-in-parallel + mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time uadd8 r12, r7, r12 ; flimit * 2 + limit + mov lr, #0 |simple_hnext8| ; vp8_simple_filter_mask() function @@ -89,22 +88,19 @@ pstep RN r1 uqsub8 r10, r4, r5 ; p0 - q0 uqsub8 r11, r5, r4 ; q0 - p0 orr r8, r8, r7 ; abs(p1 - q1) - ldr lr, c0x7F7F7F7F ; 01111111 mask orr r10, r10, r11 ; abs(p0 - q0) - and r8, lr, r8, lsr #1 ; abs(p1 - q1) / 2 + uhadd8 r8, r8, lr ; abs(p1 - q1) >> 1 uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2 - mvn lr, #0 ; r10 == -1 + ; STALL waiting on r10 uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 - ; STALL waiting on r10 :( - uqsub8 r10, r10, r12 ; compare to flimit - mov r8, #0 - - usub8 r10, r8, r10 ; use usub8 instead of ssub8 - ; STALL (maybe?) when are flags set? :/ - sel r10, lr, r8 ; filter mask: lr - + ; STALL waiting on r10 + mvn r8, #0 + uqsub8 r10, r10, r12 ; compare to flimit. 
need to do this twice because uqsub8 doesn't set GE flags + ; and usub8 doesn't saturate + usub8 r10, lr, r10 ; set GE flags for each byte + sel r10, r8, lr ; filter mask: F or 0 cmp r10, #0 - beq simple_hskip_filter ; skip filtering + beq simple_hskip_filter ; skip filtering if we're &ing with 0s. would just write out the same values ;vp8_simple_filter() function @@ -113,55 +109,45 @@ pstep RN r1 eor r4, r4, r2 ; p0 offset to convert to a signed value eor r5, r5, r2 ; q0 offset to convert to a signed value - qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1) - qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0)) + qsub8 r3, r3, r6 ; vp8_signed_char_clamp(p1-q1) + qsub8 r6, r5, r4 ; vp8_signed_char_clamp(q0-p0) + qadd8 r3, r3, r6 ; += q0-p0 + qadd8 r3, r3, r6 ; += q0-p0 + qadd8 r3, r3, r6 ; p1-q1 + 3*(q0-p0) + and r3, r3, r10 ; &= mask - qadd8 r3, r3, r6 - ldr r8, c0x03030303 ; r8 = 3 - - qadd8 r3, r3, r6 ldr r7, c0x04040404 - - qadd8 r3, r3, r6 - and r3, r3, lr ; vp8_filter &= mask; + ldr r8, c0x03030303 ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r7 , r3 , r7 ; Filter1 (r7) = vp8_signed_char_clamp(vp8_filter+4) qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) - qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4) - mov r7, #0 - shadd8 r8 , r8 , r7 ; Filter2 >>= 3 - shadd8 r3 , r3 , r7 ; Filter1 >>= 3 - shadd8 r8 , r8 , r7 - shadd8 r3 , r3 , r7 - shadd8 r8 , r8 , r7 ; r8: Filter2 - shadd8 r3 , r3 , r7 ; r7: filter1 + mov r3, #0 + shadd8 r7 , r7 , r3 + shadd8 r8 , r8 , r3 + shadd8 r7 , r7 , r3 + shadd8 r8 , r8 , r3 + shadd8 r7 , r7 , r3 ; Filter1 >>= 3 + shadd8 r8 , r8 , r3 ; Filter2 >>= 3 - ;calculate output - sub src, src, pstep, lsl #1 + qsub8 r5 ,r5, r7 ; u = vp8_signed_char_clamp(q0 - Filter1) qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2) - qsub8 r5 ,r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1) - eor r4, r4, r2 ; *op0 = u^0x80 
- str r4, [src], pstep ; store op0 result eor r5, r5, r2 ; *oq0 = u^0x80 - str r5, [src], pstep ; store oq0 result + str r5, [src] ; store oq0 result + eor r4, r4, r2 ; *op0 = u^0x80 + str r4, [src, -pstep] ; store op0 result |simple_hskip_filter| - add src, src, #4 - sub src, src, pstep - sub src, src, pstep, lsl #1 subs r9, r9, #1 + addne src, src, #4 ; next row - ;pld [src] - ;pld [src, pstep] - ;pld [src, pstep, lsl #1] - - ldrne r3, [src], pstep ; p1 - ldrne r4, [src], pstep ; p0 - ldrne r5, [src], pstep ; q0 - ldrne r6, [src] ; q1 + ldrne r3, [src, -pstep, lsl #1] ; p1 + ldrne r4, [src, -pstep] ; p0 + ldrne r5, [src] ; q0 + ldrne r6, [src, pstep] ; q1 bne simple_hnext8 @@ -174,9 +160,9 @@ pstep RN r1 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r2], #4 ; r12: flimit + ldr r12, [r2] ; r12: flimit ldr r2, c0x80808080 - ldr r7, [r3], #4 ; limit + ldr r7, [r3] ; limit ; load soure data to r7, r8, r9, r10 ldrh r3, [src, #-2] @@ -213,16 +199,15 @@ pstep RN r1 uqsub8 r10, r5, r4 ; q0 - p0 orr r7, r7, r8 ; abs(p1 - q1) orr r9, r9, r10 ; abs(p0 - q0) - ldr lr, c0x7F7F7F7F ; 0111 1111 mask - uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2 - and r7, lr, r7, lsr #1 ; abs(p1 - q1) / 2 mov r8, #0 + uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2 + uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2 uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 mvn r10, #0 ; r10 == -1 uqsub8 r7, r7, r12 ; compare to flimit usub8 r7, r8, r7 - sel r7, r10, r8 ; filter mask: lr + sel lr, r10, r8 ; filter mask cmp lr, #0 beq simple_vskip_filter ; skip filtering @@ -286,10 +271,6 @@ pstep RN r1 |simple_vskip_filter| subs r11, r11, #1 - ;pld [src] - ;pld [src, pstep] - ;pld [src, pstep, lsl #1] - ; load soure data to r7, r8, r9, r10 ldrneh r3, [src, #-2] ldrneh r4, [src], pstep @@ -316,7 +297,5 @@ pstep RN r1 c0x80808080 DCD 0x80808080 c0x03030303 DCD 0x03030303 c0x04040404 DCD 0x04040404 -c0x01010101 DCD 0x01010101 -c0x7F7F7F7F DCD 0x7F7F7F7F END