Add Neon horizontal and vertical vp9_mbloop_filter

- The vp9 mbfilter C code branches on flat and mask. This CL
  computes both branches and combines the results. A later CL will
  add a check to see whether the whole patch takes a single branch.
- These functions are about 1.75 times faster than the C code on
  Nexus 7.

PS #3
- Changed all functions to dup limit, blimit, and thresh using
  vld1.8 {dx[]}, freeing up r4-r6.
- Changed code to use vbif to reduce one instruction and free
  up a d register.

Change-Id: I028dae0e434dc9891c3677bdb182e201ffb04777
This commit is contained in:
Frank Galligan 2013-07-01 12:52:38 -07:00
parent 204d1b7058
commit 198fa6d0a0
2 changed files with 363 additions and 40 deletions

View File

@ -10,6 +10,8 @@
EXPORT |vp9_loop_filter_horizontal_edge_neon|
EXPORT |vp9_loop_filter_vertical_edge_neon|
EXPORT |vp9_mbloop_filter_horizontal_edge_neon|
EXPORT |vp9_mbloop_filter_vertical_edge_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
@ -33,50 +35,47 @@
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_loop_filter_horizontal_edge_neon| PROC
push {r4-r6, lr}
push {lr}
ldr r12, [sp,#20] ; load count
ldrb r4, [r2] ; load *blimit
ldrb r5, [r3] ; load *limit
ldr r12, [sp,#8] ; load count
cmp r12, #0
beq end_vp9_lf_h_edge
ldr r3, [sp, #16] ; load thresh
vdup.u8 d0, r4 ; duplicate blimit
ldrb r6, [r3] ; load *thresh
vdup.u8 d1, r5 ; duplicate limit
vdup.u8 d2, r6 ; duplicate thresh
vld1.8 {d0[]}, [r2] ; duplicate *blimit
ldr r2, [sp, #4] ; load thresh
vld1.8 {d1[]}, [r3] ; duplicate *limit
vld1.8 {d2[]}, [r2] ; duplicate *thresh
count_lf_h_loop
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
add r6, r2, r1
add r3, r2, r1
add r1, r1, r1
vld1.u8 {d3}, [r2@64], r1 ; p3
vld1.u8 {d4}, [r6@64], r1 ; p2
vld1.u8 {d4}, [r3@64], r1 ; p2
vld1.u8 {d5}, [r2@64], r1 ; p1
vld1.u8 {d6}, [r6@64], r1 ; p0
vld1.u8 {d6}, [r3@64], r1 ; p0
vld1.u8 {d7}, [r2@64], r1 ; q0
vld1.u8 {d16}, [r6@64], r1 ; q1
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r2@64] ; q2
vld1.u8 {d18}, [r6@64] ; q3
vld1.u8 {d18}, [r3@64] ; q3
sub r2, r2, r1, lsl #1
sub r6, r6, r1, lsl #1
sub r3, r3, r1, lsl #1
bl vp9_loop_filter_neon
vst1.u8 {d4}, [r2@64], r1 ; store op1
vst1.u8 {d5}, [r6@64], r1 ; store op0
vst1.u8 {d5}, [r3@64], r1 ; store op0
vst1.u8 {d6}, [r2@64], r1 ; store oq0
vst1.u8 {d7}, [r6@64], r1 ; store oq1
vst1.u8 {d7}, [r3@64], r1 ; store oq1
add r0, r0, #8
subs r12, r12, #1
bne count_lf_h_loop
end_vp9_lf_h_edge
pop {r4-r6, pc}
pop {pc}
ENDP ; |vp9_loop_filter_horizontal_edge_neon|
; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
@ -98,31 +97,28 @@ end_vp9_lf_h_edge
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_loop_filter_vertical_edge_neon| PROC
push {r4-r6, lr}
push {lr}
ldr r12, [sp,#20] ; load count
ldrb r4, [r2] ; load *blimit
ldrb r5, [r3] ; load *limit
ldr r12, [sp,#8] ; load count
cmp r12, #0
beq end_vp9_lf_v_edge
ldr r3, [sp, #16] ; load thresh
vdup.u8 d0, r4 ; duplicate blimit
ldrb r6, [r3] ; load *thresh
vdup.u8 d1, r5 ; duplicate limit
vdup.u8 d2, r6 ; duplicate thresh
vld1.8 {d0[]}, [r2] ; duplicate *blimit
ldr r2, [sp, #4] ; load thresh
vld1.8 {d1[]}, [r3] ; duplicate *limit
vld1.8 {d2[]}, [r2] ; duplicate *thresh
count_lf_v_loop
sub r6, r0, #4 ; move s pointer down by 4 columns
sub r2, r0, #4 ; move s pointer down by 4 columns
vld1.u8 {d3}, [r6], r1 ; load s data
vld1.u8 {d4}, [r6], r1
vld1.u8 {d5}, [r6], r1
vld1.u8 {d6}, [r6], r1
vld1.u8 {d7}, [r6], r1
vld1.u8 {d16}, [r6], r1
vld1.u8 {d17}, [r6], r1
vld1.u8 {d18}, [r6]
vld1.u8 {d3}, [r2], r1 ; load s data
vld1.u8 {d4}, [r2], r1
vld1.u8 {d5}, [r2], r1
vld1.u8 {d6}, [r2], r1
vld1.u8 {d7}, [r2], r1
vld1.u8 {d16}, [r2], r1
vld1.u8 {d17}, [r2], r1
vld1.u8 {d18}, [r2]
;transpose to 8x16 matrix
vtrn.32 d3, d7
@ -159,7 +155,7 @@ count_lf_v_loop
bne count_lf_v_loop
end_vp9_lf_v_edge
pop {r4-r6, pc}
pop {pc}
ENDP ; |vp9_loop_filter_vertical_edge_neon|
; void vp9_loop_filter_neon();
@ -224,7 +220,7 @@ end_vp9_lf_v_edge
vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
vcge.u8 d17, d0, d17 ; (a > blimit * 2 + limit) * -1
vcge.u8 d17, d0, d17 ; a > blimit
vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
vorr d22, d21, d22 ; hevmask
@ -267,4 +263,331 @@ end_vp9_lf_v_edge
bx lr
ENDP ; |vp9_loop_filter_neon|
; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p,
;                                             const uint8_t *blimit,
;                                             const uint8_t *limit,
;                                             const uint8_t *thresh,
;                                             int count)
;
; Filters a horizontal edge: for each group of 8 pixels it loads the four
; rows above and the four rows below s (p3..q3), calls
; vp9_mbloop_filter_neon and stores the six modified rows (op2..oq2).
;
; Arguments at entry:
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
;
; Register roles inside the loop:
; r1    2 * pitch (doubled once, before the loop)
; r2    odd-row pointer  (p2, p0, q1, q3)
; r3    even-row pointer (p3, p1, q0, q2)
; r4-r6 blimit / limit / thresh pointers, kept for the per-iteration reload
; r12   remaining count
;
; vp9_mbloop_filter_neon returns its results in d2-d7 and uses d0/d1 as
; temporaries, so blimit/limit/thresh must be re-broadcast into d0-d2 on
; every iteration; otherwise any iteration after the first would filter
; with stale values.
|vp9_mbloop_filter_horizontal_edge_neon| PROC
    push        {r4-r6, lr}

    ldr         r12, [sp, #20]             ; load count (sp+4 at entry)
    cmp         r12, #0
    beq         end_vp9_mblf_h_edge

    mov         r4, r2                     ; keep blimit pointer
    mov         r5, r3                     ; keep limit pointer
    ldr         r6, [sp, #16]              ; keep thresh pointer (sp at entry)
    add         r1, r1, r1                 ; r1 = 2 * pitch; must only be
                                           ; doubled once, not per iteration

count_mblf_h_loop
    vld1.8      {d0[]}, [r4]               ; duplicate *blimit
    vld1.8      {d1[]}, [r5]               ; duplicate *limit
    vld1.8      {d2[]}, [r6]               ; duplicate *thresh

    sub         r3, r0, r1, lsl #1         ; r3 = s - 4 * pitch (p3 row)
    add         r2, r3, r1, asr #1         ; r2 = s - 3 * pitch (p2 row)

    vld1.u8     {d3}, [r3@64], r1          ; p3
    vld1.u8     {d4}, [r2@64], r1          ; p2
    vld1.u8     {d5}, [r3@64], r1          ; p1
    vld1.u8     {d6}, [r2@64], r1          ; p0
    vld1.u8     {d7}, [r3@64], r1          ; q0
    vld1.u8     {d16}, [r2@64], r1         ; q1
    vld1.u8     {d17}, [r3@64]             ; q2
    vld1.u8     {d18}, [r2@64], r1         ; q3

    sub         r3, r3, r1, lsl #1         ; r3 = s - 2 * pitch (op1 row)
    sub         r2, r2, r1, lsl #2         ; r2 = s - 3 * pitch (op2 row)

    bl          vp9_mbloop_filter_neon

    vst1.u8     {d2}, [r2@64], r1          ; store op2
    vst1.u8     {d3}, [r3@64], r1          ; store op1
    vst1.u8     {d4}, [r2@64], r1          ; store op0
    vst1.u8     {d5}, [r3@64], r1          ; store oq0
    vst1.u8     {d6}, [r2@64], r1          ; store oq1
    vst1.u8     {d7}, [r3@64], r1          ; store oq2

    add         r0, r0, #8                 ; advance to the next 8 pixels
    subs        r12, r12, #1
    bne         count_mblf_h_loop

end_vp9_mblf_h_edge
    pop         {r4-r6, pc}

    ENDP        ; |vp9_mbloop_filter_horizontal_edge_neon|
; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s,
;                                           int pitch,
;                                           const uint8_t *blimit,
;                                           const uint8_t *limit,
;                                           const uint8_t *thresh,
;                                           int count)
;
; Filters a vertical edge: for each group of 8 rows it loads 8 bytes per row
; starting 4 columns left of s, transposes them so that each d register
; holds one column (p3..q3), calls vp9_mbloop_filter_neon and scatters the
; six modified columns (op2..oq2) back with lane stores.
;
; Arguments at entry:
; r0    uint8_t *s,
; r1    int pitch,
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
;
; Register roles inside the loop:
; r2    load pointer, then store pointer for op2..oq0 (s - 3)
; r3    store pointer for oq1, oq2 (s + 1)
; r4-r6 blimit / limit / thresh pointers, kept for the per-iteration reload
; r12   remaining count
;
; vp9_mbloop_filter_neon returns its results in d2-d7 and uses d0/d1 as
; temporaries, so blimit/limit/thresh must be re-broadcast into d0-d2 on
; every iteration; otherwise any iteration after the first would filter
; with stale values.
|vp9_mbloop_filter_vertical_edge_neon| PROC
    push        {r4-r6, lr}

    ldr         r12, [sp, #20]             ; load count (sp+4 at entry)
    cmp         r12, #0
    beq         end_vp9_mblf_v_edge

    mov         r4, r2                     ; keep blimit pointer
    mov         r5, r3                     ; keep limit pointer
    ldr         r6, [sp, #16]              ; keep thresh pointer (sp at entry)

count_mblf_v_loop
    vld1.8      {d0[]}, [r4]               ; duplicate *blimit
    vld1.8      {d1[]}, [r5]               ; duplicate *limit
    vld1.8      {d2[]}, [r6]               ; duplicate *thresh

    sub         r2, r0, #4                 ; move s pointer back by 4 columns

    vld1.u8     {d3}, [r2], r1             ; load 8 rows of 8 bytes
    vld1.u8     {d4}, [r2], r1
    vld1.u8     {d5}, [r2], r1
    vld1.u8     {d6}, [r2], r1
    vld1.u8     {d7}, [r2], r1
    vld1.u8     {d16}, [r2], r1
    vld1.u8     {d17}, [r2], r1
    vld1.u8     {d18}, [r2]

    ; transpose the 8x8 block: afterwards d3..d7, d16..d18 hold the columns
    ; p3, p2, p1, p0, q0, q1, q2, q3
    vtrn.32     d3, d7
    vtrn.32     d4, d16
    vtrn.32     d5, d17
    vtrn.32     d6, d18

    vtrn.16     d3, d5
    vtrn.16     d4, d6
    vtrn.16     d7, d17
    vtrn.16     d16, d18

    vtrn.8      d3, d4
    vtrn.8      d5, d6
    vtrn.8      d7, d16
    vtrn.8      d17, d18

    sub         r2, r0, #3                 ; op2 column (p2 position)
    add         r3, r0, #1                 ; oq1 column (q1 position)

    bl          vp9_mbloop_filter_neon

    ; store op2, op1, op0, oq0: one 4-byte group per row
    vst4.8      {d2[0], d3[0], d4[0], d5[0]}, [r2], r1
    vst4.8      {d2[1], d3[1], d4[1], d5[1]}, [r2], r1
    vst4.8      {d2[2], d3[2], d4[2], d5[2]}, [r2], r1
    vst4.8      {d2[3], d3[3], d4[3], d5[3]}, [r2], r1
    vst4.8      {d2[4], d3[4], d4[4], d5[4]}, [r2], r1
    vst4.8      {d2[5], d3[5], d4[5], d5[5]}, [r2], r1
    vst4.8      {d2[6], d3[6], d4[6], d5[6]}, [r2], r1
    vst4.8      {d2[7], d3[7], d4[7], d5[7]}, [r2]

    ; store oq1, oq2: one 2-byte group per row
    vst2.8      {d6[0], d7[0]}, [r3], r1
    vst2.8      {d6[1], d7[1]}, [r3], r1
    vst2.8      {d6[2], d7[2]}, [r3], r1
    vst2.8      {d6[3], d7[3]}, [r3], r1
    vst2.8      {d6[4], d7[4]}, [r3], r1
    vst2.8      {d6[5], d7[5]}, [r3], r1
    vst2.8      {d6[6], d7[6]}, [r3], r1
    vst2.8      {d6[7], d7[7]}, [r3]

    add         r0, r0, r1, lsl #3         ; s += pitch * 8
    subs        r12, r12, #1
    bne         count_mblf_v_loop

end_vp9_mblf_v_edge
    pop         {r4-r6, pc}

    ENDP        ; |vp9_mbloop_filter_vertical_edge_neon|
; void vp9_mbloop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do
; the necessary load, transpose (if necessary) and store. The function does
; not use registers d8-d15.
;
; It computes both the 4-tap filter() result and the 7-tap flat result for
; every lane and then selects per lane with the (flat & mask) bit mask, so
; there is no data-dependent branch.
;
; Inputs:
; r0-r3 PRESERVE
; d0 blimit
; d1 limit
; d2 thresh
; d3 p3
; d4 p2
; d5 p1
; d6 p0
; d7 q0
; d16 q1
; d17 q2
; d18 q3
;
; Outputs:
; d2 op2
; d3 op1
; d4 op0
; d5 oq0
; d6 oq1
; d7 oq2
;
; Clobbers: d0, d1, d19-d31. Because d0-d2 are overwritten, blimit, limit and
; thresh are NOT preserved across a call; a caller that loops must reload
; them before calling again.
|vp9_mbloop_filter_neon| PROC
; filter_mask
vabd.u8 d19, d3, d4 ; abs(p3 - p2)
vabd.u8 d20, d4, d5 ; abs(p2 - p1)
vabd.u8 d21, d5, d6 ; abs(p1 - p0)
vabd.u8 d22, d16, d7 ; abs(q1 - q0)
vabd.u8 d23, d17, d16 ; abs(q2 - q1)
vabd.u8 d24, d18, d17 ; abs(q3 - q2)
; only compare the largest value to limit
vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1))
vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0))
vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2))
vmax.u8 d19, d19, d20
vabd.u8 d24, d6, d7 ; abs(p0 - q0)
vmax.u8 d19, d19, d23
vabd.u8 d23, d5, d16 ; a = abs(p1 - q1)
vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
; lane is all ones where limit >= largest abs difference
vcge.u8 d19, d1, d19
; flatmask4
vabd.u8 d25, d6, d4 ; abs(p0 - p2)
vabd.u8 d26, d7, d17 ; abs(q0 - q2)
vabd.u8 d27, d3, d6 ; abs(p3 - p0)
vabd.u8 d28, d18, d7 ; abs(q3 - q0)
; only compare the largest value to the flat threshold (1)
vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2))
vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0))
vmax.u8 d25, d25, d26
vmax.u8 d20, d20, d25 ; also folds in abs(p1 - p0), abs(q1 - q0)
vshr.u8 d23, d23, #1 ; a = a / 2
vqadd.u8 d24, d24, d23 ; a = b + a
vmov.u8 d23, #1
vcge.u8 d24, d0, d24 ; blimit >= a
vcge.u8 d20, d23, d20 ; flat: 1 >= largest flat difference
vand d19, d19, d24 ; mask
; hevmask
vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
vorr d21, d21, d22 ; hev
vmov.u8 d22, #0x80
; mbfilter() function
; filter() function
; convert to signed
veor d23, d7, d22 ; qs0
veor d24, d6, d22 ; ps0
veor d25, d5, d22 ; ps1
veor d26, d16, d22 ; qs1
vmov.u8 d27, #3
vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
vand d29, d29, d21 ; filter &= hev
vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
vmov.u8 d29, #4
; filter = clamp(filter + 3 * ( qs0 - ps0))
vqmovn.s16 d28, q15
vand d28, d28, d19 ; filter &= mask
vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
vshr.s8 d30, d30, #3 ; filter2 >>= 3
vshr.s8 d29, d29, #3 ; filter1 >>= 3
vand d20, d20, d19 ; flat & mask
vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1)
; outer tap adjustments: ++filter1 >> 1
vrshr.s8 d29, d29, #1
vbic d29, d29, d21 ; filter &= ~hev
vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
; convert the filter() results back to unsigned
veor d24, d24, d22 ; *f_op0 = u^0x80
veor d23, d23, d22 ; *f_oq0 = u^0x80
veor d25, d25, d22 ; *f_op1 = u^0x80
veor d26, d26, d22 ; *f_oq1 = u^0x80
; mbfilter flat && mask branch
; q14 holds a running 16-bit sum; each output is derived from the previous
; one by subtracting the two taps that leave and adding the two that enter,
; then rounding and narrowing with a shift by 3.
; TODO(fgalligan): Can I decrease the cycles shifting to consecutive d's
; and using vbit on the q's?
vmov.u8 d21, #2
vaddl.u8 q14, d6, d7 ; op2 = p0 + q0
vmlal.u8 q14, d3, d27 ; op2 += p3 * 3
vmlal.u8 q14, d4, d21 ; op2 += p2 * 2
vaddw.u8 q14, d5 ; op2 += p1
vqrshrn.u16 d30, q14, #3 ; r_op2
vsubw.u8 q14, d3 ; op1 = op2 - p3
vsubw.u8 q14, d4 ; op1 -= p2
vaddw.u8 q14, d5 ; op1 += p1
vaddw.u8 q14, d16 ; op1 += q1
vqrshrn.u16 d31, q14, #3 ; r_op1
vsubw.u8 q14, d3 ; op0 = op1 - p3
vsubw.u8 q14, d5 ; op0 -= p1
vaddw.u8 q14, d6 ; op0 += p0
vaddw.u8 q14, d17 ; op0 += q2
vqrshrn.u16 d21, q14, #3 ; r_op0
vsubw.u8 q14, d3 ; oq0 = op0 - p3
vsubw.u8 q14, d6 ; oq0 -= p0
vaddw.u8 q14, d7 ; oq0 += q0
vaddw.u8 q14, d18 ; oq0 += q3
vqrshrn.u16 d22, q14, #3 ; r_oq0
vsubw.u8 q14, d4 ; oq1 = oq0 - p2
vsubw.u8 q14, d7 ; oq1 -= q0
vaddw.u8 q14, d16 ; oq1 += q1
vaddw.u8 q14, d18 ; oq1 += q3
vqrshrn.u16 d0, q14, #3 ; r_oq1 (overwrites blimit)
vsubw.u8 q14, d5 ; oq2 = oq1 - p1
vsubw.u8 q14, d16 ; oq2 -= q1
vaddw.u8 q14, d17 ; oq2 += q2
vaddw.u8 q14, d18 ; oq2 += q3
vqrshrn.u16 d1, q14, #3 ; r_oq2 (overwrites limit)
; Per-lane select: vbit takes the flat result where (flat & mask) is set,
; vbif takes the other value where it is clear.
; Filter does not set op2 or oq2, so use p2 and q2.
vbit d2, d30, d20 ; op2 |= r_op2 & (flat & mask) (overwrites thresh)
vbif d2, d4, d20 ; op2 |= p2 & ~(flat & mask)
vbit d3, d31, d20 ; op1 |= r_op1 & (flat & mask)
vbif d3, d25, d20 ; op1 |= f_op1 & ~(flat & mask)
vbit d4, d21, d20 ; op0 |= r_op0 & (flat & mask)
vbif d4, d24, d20 ; op0 |= f_op0 & ~(flat & mask)
vbit d5, d22, d20 ; oq0 |= r_oq0 & (flat & mask)
vbif d5, d23, d20 ; oq0 |= f_oq0 & ~(flat & mask)
vbit d6, d0, d20 ; oq1 |= r_oq1 & (flat & mask)
vbif d6, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask)
vbit d7, d1, d20 ; oq2 |= r_oq2 & (flat & mask)
vbif d7, d17, d20 ; oq2 |= q2 & ~(flat & mask)
bx lr
ENDP ; |vp9_mbloop_filter_neon|
END

View File

@ -109,7 +109,7 @@ prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t
specialize vp9_mb_lpf_vertical_edge_w sse2
prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_vertical_edge sse2
specialize vp9_mbloop_filter_vertical_edge sse2 neon
prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_loop_filter_vertical_edge mmx neon
@ -118,7 +118,7 @@ prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_
specialize vp9_mb_lpf_horizontal_edge_w sse2
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_horizontal_edge sse2
specialize vp9_mbloop_filter_horizontal_edge sse2 neon
prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_loop_filter_horizontal_edge mmx neon