Save NEON registers in VP8 NEON functions
Recent compilers can generate optimized code that uses NEON registers for operations other than floating-point arithmetic. Therefore, saving the callee-saved registers d8-d15 only once at the beginning of the encoder/decoder is no longer sufficient. This patch adds register-saving code (vpush/vpop) to the VP8 NEON functions that use those registers, and adjusts the stack offsets of stack-passed arguments by the size of the saved registers. Change-Id: Ie9e44f5188cf410990c8aaaac68faceee9dffd31
This commit is contained in:
parent
5ba44e37a4
commit
33df6d1fc1
@ -26,6 +26,7 @@
|
||||
|
||||
|vp8_build_intra_predictors_mby_neon_func| PROC
|
||||
push {r4-r8, lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
cmp r3, #0
|
||||
beq case_dc_pred
|
||||
@ -37,8 +38,8 @@
|
||||
beq case_tm_pred
|
||||
|
||||
case_dc_pred
|
||||
ldr r4, [sp, #24] ; Up
|
||||
ldr r5, [sp, #28] ; Left
|
||||
ldr r4, [sp, #88] ; Up
|
||||
ldr r5, [sp, #92] ; Left
|
||||
|
||||
; Default the DC average to 128
|
||||
mov r12, #128
|
||||
@ -143,6 +144,7 @@ skip_dc_pred_up_left
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r8,pc}
|
||||
case_v_pred
|
||||
; Copy down above row
|
||||
@ -165,6 +167,7 @@ case_v_pred
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vpop {d8-d15}
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_h_pred
|
||||
@ -224,6 +227,7 @@ case_h_pred
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_tm_pred
|
||||
@ -293,6 +297,7 @@ case_tm_pred_loop
|
||||
subs r12, r12, #1
|
||||
bne case_tm_pred_loop
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r8,pc}
|
||||
|
||||
ENDP
|
||||
@ -307,6 +312,7 @@ case_tm_pred_loop
|
||||
|
||||
|vp8_build_intra_predictors_mby_s_neon_func| PROC
|
||||
push {r4-r8, lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
|
||||
|
||||
@ -320,8 +326,8 @@ case_tm_pred_loop
|
||||
beq case_tm_pred_s
|
||||
|
||||
case_dc_pred_s
|
||||
ldr r4, [sp, #24] ; Up
|
||||
ldr r5, [sp, #28] ; Left
|
||||
ldr r4, [sp, #88] ; Up
|
||||
ldr r5, [sp, #92] ; Left
|
||||
|
||||
; Default the DC average to 128
|
||||
mov r12, #128
|
||||
@ -426,6 +432,7 @@ skip_dc_pred_up_left_s
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r8,pc}
|
||||
case_v_pred_s
|
||||
; Copy down above row
|
||||
@ -448,6 +455,8 @@ case_v_pred_s
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_h_pred_s
|
||||
@ -507,6 +516,7 @@ case_h_pred_s
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_tm_pred_s
|
||||
@ -576,6 +586,7 @@ case_tm_pred_loop_s
|
||||
subs r12, r12, #1
|
||||
bne case_tm_pred_loop_s
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r8,pc}
|
||||
|
||||
ENDP
|
||||
|
@ -22,6 +22,7 @@
|
||||
; r3 stride
|
||||
|idct_dequant_0_2x_neon| PROC
|
||||
push {r4, r5}
|
||||
vpush {d8-d15}
|
||||
|
||||
add r12, r2, #4
|
||||
vld1.32 {d2[0]}, [r2], r3
|
||||
@ -72,6 +73,7 @@
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d10[1]}, [r0]
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4, r5}
|
||||
bx lr
|
||||
|
||||
|
@ -22,6 +22,8 @@
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
|idct_dequant_full_2x_neon| PROC
|
||||
vpush {d8-d15}
|
||||
|
||||
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
|
||||
vld1.16 {q2, q3}, [r0] ; l q
|
||||
add r0, r0, #32
|
||||
@ -184,6 +186,7 @@
|
||||
vst1.32 {d3[0]}, [r2]
|
||||
vst1.32 {d3[1]}, [r1]
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_full_2x_neon|
|
||||
|
@ -24,10 +24,12 @@
|
||||
; sp unsigned char thresh,
|
||||
|vp8_loop_filter_horizontal_edge_y_neon| PROC
|
||||
push {lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
|
||||
ldr r3, [sp, #4] ; load thresh
|
||||
ldr r3, [sp, #68] ; load thresh
|
||||
add r12, r2, r1
|
||||
add r1, r1, r1
|
||||
|
||||
@ -52,6 +54,7 @@
|
||||
vst1.u8 {q7}, [r2@128], r1 ; store oq0
|
||||
vst1.u8 {q8}, [r12@128], r1 ; store oq1
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
|
||||
|
||||
@ -64,10 +67,12 @@
|
||||
; sp+4 unsigned char *v
|
||||
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
ldr r2, [sp, #8] ; load v ptr
|
||||
ldr r12, [sp, #68] ; load thresh
|
||||
ldr r2, [sp, #72] ; load v ptr
|
||||
vdup.u8 q2, r12 ; duplicate thresh
|
||||
|
||||
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
|
||||
@ -104,6 +109,7 @@
|
||||
vst1.u8 {d16}, [r0@64] ; store u oq1
|
||||
vst1.u8 {d17}, [r2@64] ; store v oq1
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
|
||||
|
||||
@ -120,11 +126,13 @@
|
||||
|
||||
|vp8_loop_filter_vertical_edge_y_neon| PROC
|
||||
push {lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r2, r0, #4 ; src ptr down by 4 columns
|
||||
add r1, r1, r1
|
||||
ldr r3, [sp, #4] ; load thresh
|
||||
ldr r3, [sp, #68] ; load thresh
|
||||
add r12, r2, r1, asr #1
|
||||
|
||||
vld1.u8 {d6}, [r2], r1
|
||||
@ -194,6 +202,7 @@
|
||||
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
|
||||
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
|
||||
|
||||
@ -210,9 +219,11 @@
|
||||
; sp+4 unsigned char *v
|
||||
|vp8_loop_filter_vertical_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
sub r12, r0, #4 ; move u pointer down by 4 columns
|
||||
ldr r2, [sp, #8] ; load v ptr
|
||||
ldr r2, [sp, #72] ; load v ptr
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r3, r2, #4 ; move v pointer down by 4 columns
|
||||
|
||||
@ -233,7 +244,7 @@
|
||||
vld1.u8 {d20}, [r12]
|
||||
vld1.u8 {d21}, [r3]
|
||||
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
ldr r12, [sp, #68] ; load thresh
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
@ -281,6 +292,7 @@
|
||||
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
|
||||
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
|
||||
|
||||
|
@ -9,7 +9,6 @@
|
||||
;
|
||||
|
||||
|
||||
;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
|
||||
EXPORT |vp8_loop_filter_bhs_neon|
|
||||
EXPORT |vp8_loop_filter_mbhs_neon|
|
||||
ARM
|
||||
@ -22,7 +21,7 @@
|
||||
; q1 limit, PRESERVE
|
||||
|
||||
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
|
||||
|
||||
vpush {d8-d15}
|
||||
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
|
||||
|
||||
vld1.u8 {q7}, [r0@128], r1 ; q0
|
||||
@ -82,6 +81,7 @@
|
||||
vst1.u8 {q6}, [r3@128] ; store op0
|
||||
vst1.u8 {q7}, [r0@128] ; store oq0
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
|
||||
|
||||
|
@ -9,7 +9,6 @@
|
||||
;
|
||||
|
||||
|
||||
;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
|
||||
EXPORT |vp8_loop_filter_bvs_neon|
|
||||
EXPORT |vp8_loop_filter_mbvs_neon|
|
||||
ARM
|
||||
@ -22,6 +21,8 @@
|
||||
; q1 limit, PRESERVE
|
||||
|
||||
|vp8_loop_filter_simple_vertical_edge_neon| PROC
|
||||
vpush {d8-d15}
|
||||
|
||||
sub r0, r0, #2 ; move src pointer down by 2 columns
|
||||
add r12, r1, r1
|
||||
add r3, r0, r1
|
||||
@ -120,6 +121,7 @@
|
||||
vst2.8 {d14[6], d15[6]}, [r0], r12
|
||||
vst2.8 {d14[7], d15[7]}, [r3]
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
|
||||
|
||||
|
@ -28,8 +28,10 @@
|
||||
; sp unsigned char thresh,
|
||||
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
|
||||
push {lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
add r1, r1, r1 ; double stride
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
ldr r12, [sp, #68] ; load thresh
|
||||
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
|
||||
@ -55,6 +57,7 @@
|
||||
vst1.u8 {q8}, [r12@128] ; store oq1
|
||||
vst1.u8 {q9}, [r0@128] ; store oq2
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
|
||||
|
||||
@ -72,10 +75,12 @@
|
||||
|
||||
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
vpush {d8-d15}
|
||||
|
||||
ldr r12, [sp, #68] ; load thresh
|
||||
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
ldr r12, [sp, #8] ; load v ptr
|
||||
ldr r12, [sp, #72] ; load v ptr
|
||||
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
|
||||
|
||||
vld1.u8 {d6}, [r0@64], r1 ; p3
|
||||
@ -116,6 +121,7 @@
|
||||
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
|
||||
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
|
||||
|
||||
@ -130,7 +136,9 @@
|
||||
; sp unsigned char thresh,
|
||||
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
vpush {d8-d15}
|
||||
|
||||
ldr r12, [sp, #68] ; load thresh
|
||||
sub r0, r0, #4 ; move src pointer down by 4 columns
|
||||
vdup.s8 q2, r12 ; thresh
|
||||
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
|
||||
@ -208,6 +216,7 @@
|
||||
vst1.8 {d20}, [r0]
|
||||
vst1.8 {d21}, [r12]
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
|
||||
|
||||
@ -224,10 +233,12 @@
|
||||
; sp+4 unsigned char *v
|
||||
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
vpush {d8-d15}
|
||||
|
||||
ldr r12, [sp, #68] ; load thresh
|
||||
sub r0, r0, #4 ; move u pointer down by 4 columns
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
ldr r12, [sp, #8] ; load v ptr
|
||||
ldr r12, [sp, #72] ; load v ptr
|
||||
sub r12, r12, #4 ; move v pointer down by 4 columns
|
||||
|
||||
vld1.u8 {d6}, [r0], r1 ;load u data
|
||||
@ -303,6 +314,7 @@
|
||||
vst1.8 {d20}, [r0]
|
||||
vst1.8 {d21}, [r12]
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
|
||||
|
||||
|
@ -24,6 +24,7 @@
|
||||
; r3 int ref_stride
|
||||
|vp8_sad16x16_neon| PROC
|
||||
;;
|
||||
vpush {d8-d15}
|
||||
vld1.8 {q0}, [r0], r1
|
||||
vld1.8 {q4}, [r2], r3
|
||||
|
||||
@ -132,6 +133,7 @@
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
@ -143,6 +145,8 @@
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
|vp8_sad16x8_neon| PROC
|
||||
vpush {d8-d15}
|
||||
|
||||
vld1.8 {q0}, [r0], r1
|
||||
vld1.8 {q4}, [r2], r3
|
||||
|
||||
@ -200,6 +204,7 @@
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
@ -25,6 +25,7 @@
|
||||
; int ref_stride)
|
||||
|
||||
|vp8_sad8x8_neon| PROC
|
||||
vpush {d8-d15}
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
@ -70,6 +71,7 @@
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
@ -82,6 +84,7 @@
|
||||
; int ref_stride)
|
||||
|
||||
|vp8_sad8x16_neon| PROC
|
||||
vpush {d8-d15}
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
@ -167,6 +170,7 @@
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
@ -179,6 +183,7 @@
|
||||
; int ref_stride)
|
||||
|
||||
|vp8_sad4x4_neon| PROC
|
||||
vpush {d8-d15}
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
@ -202,6 +207,7 @@
|
||||
vpaddl.u32 d0, d1
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
@ -37,12 +37,14 @@
|
||||
; result of the multiplication that is needed in IDCT.
|
||||
|
||||
|vp8_short_idct4x4llm_neon| PROC
|
||||
vpush {d8-d15}
|
||||
|
||||
adr r12, idct_coeff
|
||||
vld1.16 {q1, q2}, [r0]
|
||||
vld1.16 {d0}, [r12]
|
||||
|
||||
vswp d3, d4 ;q2(vp[4] vp[12])
|
||||
ldr r0, [sp] ; stride
|
||||
ldr r0, [sp, #64] ; stride
|
||||
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
@ -125,6 +127,7 @@
|
||||
vst1.32 d2[0], [r3], r0
|
||||
vst1.32 d2[1], [r3], r0
|
||||
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
@ -43,10 +43,11 @@ filter16_coeff
|
||||
|
||||
|vp8_sixtap_predict16x16_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
adr r12, filter16_coeff
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
ldr r4, [sp, #76] ;load parameters from stack
|
||||
ldr r5, [sp, #80] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter16x16_only
|
||||
@ -291,6 +292,8 @@ secondpass_inner_loop_neon
|
||||
bne filt_blk2d_sp16x16_outloop_neon
|
||||
|
||||
add sp, sp, #336
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
@ -384,6 +387,7 @@ filt_blk2d_fpo16x16_loop_neon
|
||||
|
||||
bne filt_blk2d_fpo16x16_loop_neon
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
@ -482,6 +486,7 @@ secondpass_only_inner_loop_neon
|
||||
|
||||
bne filt_blk2d_spo16x16_outloop_neon
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
@ -35,10 +35,11 @@ filter4_coeff
|
||||
|
||||
|vp8_sixtap_predict4x4_neon| PROC
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
adr r12, filter4_coeff
|
||||
ldr r4, [sp, #8] ;load parameters from stack
|
||||
ldr lr, [sp, #12] ;load parameters from stack
|
||||
ldr r4, [sp, #72] ;load parameters from stack
|
||||
ldr lr, [sp, #76] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter4x4_only
|
||||
@ -261,6 +262,7 @@ filter4_coeff
|
||||
vst1.32 {d4[0]}, [r1]
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
|
||||
|
||||
@ -348,6 +350,7 @@ firstpass_filter4x4_only
|
||||
vst1.32 {d28[0]}, [r1]
|
||||
vst1.32 {d28[1]}, [r2]
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
|
||||
|
||||
@ -413,6 +416,7 @@ secondpass_filter4x4_only
|
||||
vst1.32 {d4[0]}, [r1]
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
|
||||
ENDP
|
||||
|
@ -35,10 +35,11 @@ filter8_coeff
|
||||
|
||||
|vp8_sixtap_predict8x4_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
adr r12, filter8_coeff
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
ldr r4, [sp, #76] ;load parameters from stack
|
||||
ldr r5, [sp, #80] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter8x4_only
|
||||
@ -297,6 +298,8 @@ filter8_coeff
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
add sp, sp, #32
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
@ -392,6 +395,7 @@ firstpass_filter8x4_only
|
||||
vst1.u8 {d24}, [r4], r5
|
||||
vst1.u8 {d25}, [r4], r5
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
@ -464,6 +468,7 @@ secondpass_filter8x4_only
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
@ -35,11 +35,11 @@ filter8_coeff
|
||||
|
||||
|vp8_sixtap_predict8x8_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
vpush {d8-d15}
|
||||
adr r12, filter8_coeff
|
||||
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
ldr r4, [sp, #76] ;load parameters from stack
|
||||
ldr r5, [sp, #80] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter8x8_only
|
||||
@ -324,6 +324,8 @@ filt_blk2d_sp8x8_loop_neon
|
||||
bne filt_blk2d_sp8x8_loop_neon
|
||||
|
||||
add sp, sp, #64
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
@ -428,6 +430,7 @@ filt_blk2d_fpo8x8_loop_neon
|
||||
|
||||
bne filt_blk2d_fpo8x8_loop_neon
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
@ -515,6 +518,7 @@ filt_blk2d_spo8x8_loop_neon
|
||||
|
||||
bne filt_blk2d_spo8x8_loop_neon
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
@ -26,6 +26,7 @@
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp8_variance16x16_neon| PROC
|
||||
vpush {q5}
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
@ -67,7 +68,7 @@ variance16x16_neon_loop
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
ldr r12, [sp, #16] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
@ -87,6 +88,8 @@ variance16x16_neon_loop
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {q5}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
@ -99,6 +102,8 @@ variance16x16_neon_loop
|
||||
; int recon_stride,
|
||||
; unsigned int *sse)
|
||||
|vp8_variance16x8_neon| PROC
|
||||
vpush {q5}
|
||||
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
@ -137,7 +142,7 @@ variance16x8_neon_loop
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
ldr r12, [sp, #16] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
@ -149,6 +154,8 @@ variance16x8_neon_loop
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {q5}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
@ -162,6 +169,8 @@ variance16x8_neon_loop
|
||||
; unsigned int *sse)
|
||||
|
||||
|vp8_variance8x16_neon| PROC
|
||||
vpush {q5}
|
||||
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
@ -192,7 +201,7 @@ variance8x16_neon_loop
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
ldr r12, [sp, #16] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
@ -204,6 +213,8 @@ variance8x16_neon_loop
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {q5}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
@ -215,6 +226,8 @@ variance8x16_neon_loop
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp8_variance8x8_neon| PROC
|
||||
vpush {q5}
|
||||
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
@ -257,7 +270,7 @@ variance8x8_neon_loop
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
ldr r12, [sp, #16] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
@ -269,6 +282,8 @@ variance8x8_neon_loop
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {q5}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
@ -31,11 +31,12 @@ bilinear_taps_coeff
|
||||
|
||||
|vp8_sub_pixel_variance16x16_neon_func| PROC
|
||||
push {r4-r6, lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
adr r12, bilinear_taps_coeff
|
||||
ldr r4, [sp, #16] ;load *dst_ptr from stack
|
||||
ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
|
||||
ldr r6, [sp, #24] ;load *sse from stack
|
||||
ldr r4, [sp, #80] ;load *dst_ptr from stack
|
||||
ldr r5, [sp, #84] ;load dst_pixels_per_line from stack
|
||||
ldr r6, [sp, #88] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_bfilter16x16_only
|
||||
@ -416,6 +417,7 @@ sub_pixel_variance16x16_neon_loop
|
||||
add sp, sp, #528
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r6,pc}
|
||||
|
||||
ENDP
|
||||
|
@ -31,9 +31,10 @@
|
||||
;================================================
|
||||
|vp8_variance_halfpixvar16x16_h_neon| PROC
|
||||
push {lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
mov r12, #4 ;loop counter
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
ldr lr, [sp, #68] ;load *sse from stack
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
@ -116,6 +117,8 @@ vp8_filt_fpo16x16s_4_0_loop_neon
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
@ -131,11 +134,12 @@ vp8_filt_fpo16x16s_4_0_loop_neon
|
||||
;================================================
|
||||
|vp8_variance_halfpixvar16x16_v_neon| PROC
|
||||
push {lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
mov r12, #4 ;loop counter
|
||||
|
||||
vld1.u8 {q0}, [r0], r1 ;load src data
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
ldr lr, [sp, #68] ;load *sse from stack
|
||||
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
@ -212,6 +216,8 @@ vp8_filt_spo16x16s_0_4_loop_neon
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
@ -227,10 +233,11 @@ vp8_filt_spo16x16s_0_4_loop_neon
|
||||
;================================================
|
||||
|vp8_variance_halfpixvar16x16_hv_neon| PROC
|
||||
push {lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
ldr lr, [sp, #68] ;load *sse from stack
|
||||
vmov.i8 q13, #0 ;q8 - sum
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
|
||||
@ -331,6 +338,8 @@ vp8_filt16x16s_4_4_loop_neon
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
@ -349,10 +358,11 @@ vp8_filt16x16s_4_4_loop_neon
|
||||
|
||||
|vp8_sub_pixel_variance16x16s_neon| PROC
|
||||
push {r4, lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
ldr r4, [sp, #8] ;load *dst_ptr from stack
|
||||
ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
|
||||
ldr lr, [sp, #16] ;load *sse from stack
|
||||
ldr r4, [sp, #72] ;load *dst_ptr from stack
|
||||
ldr r12, [sp, #76] ;load dst_pixels_per_line from stack
|
||||
ldr lr, [sp, #80] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_bfilter16x16s_only
|
||||
@ -566,6 +576,7 @@ sub_pixel_variance16x16s_neon_loop
|
||||
add sp, sp, #256
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4, pc}
|
||||
ENDP
|
||||
|
||||
|
@ -26,11 +26,12 @@
|
||||
|
||||
|vp8_sub_pixel_variance8x8_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
vpush {d8-d15}
|
||||
|
||||
adr r12, bilinear_taps_coeff
|
||||
ldr r4, [sp, #12] ;load *dst_ptr from stack
|
||||
ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
|
||||
ldr lr, [sp, #20] ;load *sse from stack
|
||||
ldr r4, [sp, #76] ;load *dst_ptr from stack
|
||||
ldr r5, [sp, #80] ;load dst_pixels_per_line from stack
|
||||
ldr lr, [sp, #84] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq skip_firstpass_filter
|
||||
@ -210,6 +211,8 @@ sub_pixel_variance8x8_neon_loop
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r5, pc}
|
||||
|
||||
ENDP
|
||||
|
@ -65,8 +65,10 @@
|
||||
; unsigned char *pred, int pred_stride)
|
||||
|vp8_subtract_mby_neon| PROC
|
||||
push {r4-r7}
|
||||
vpush {d8-d15}
|
||||
|
||||
mov r12, #4
|
||||
ldr r4, [sp, #16] ; pred_stride
|
||||
ldr r4, [sp, #80] ; pred_stride
|
||||
mov r6, #32 ; "diff" stride x2
|
||||
add r5, r0, #16 ; second diff pointer
|
||||
|
||||
@ -101,6 +103,7 @@ subtract_mby_loop
|
||||
subs r12, r12, #1
|
||||
bne subtract_mby_loop
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
ENDP
|
||||
@ -112,9 +115,11 @@ subtract_mby_loop
|
||||
|
||||
|vp8_subtract_mbuv_neon| PROC
|
||||
push {r4-r7}
|
||||
ldr r4, [sp, #16] ; upred
|
||||
ldr r5, [sp, #20] ; vpred
|
||||
ldr r6, [sp, #24] ; pred_stride
|
||||
vpush {d8-d15}
|
||||
|
||||
ldr r4, [sp, #80] ; upred
|
||||
ldr r5, [sp, #84] ; vpred
|
||||
ldr r6, [sp, #88] ; pred_stride
|
||||
add r0, r0, #512 ; short *udiff = diff + 256;
|
||||
mov r12, #32 ; "diff" stride x2
|
||||
add r7, r0, #16 ; second diff pointer
|
||||
@ -191,6 +196,7 @@ subtract_mby_loop
|
||||
vst1.16 {q14}, [r0], r12
|
||||
vst1.16 {q15}, [r7], r12
|
||||
|
||||
vpop {d8-d15}
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
|
||||
; int sz);
|
||||
|vp8_memcpy_partial_neon| PROC
|
||||
vpush {d8-d15}
|
||||
;pld [r1] ;preload pred data
|
||||
;pld [r1, #128]
|
||||
;pld [r1, #256]
|
||||
@ -64,6 +65,7 @@ extra_copy_neon_loop
|
||||
bne extra_copy_neon_loop
|
||||
|
||||
done_copy_neon_loop
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
|
@ -27,6 +27,8 @@
|
||||
;from vp8_variance().
|
||||
|
||||
|vp8_mse16x16_neon| PROC
|
||||
vpush {q7}
|
||||
|
||||
vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
|
||||
vmov.i8 q8, #0
|
||||
vmov.i8 q9, #0
|
||||
@ -62,7 +64,7 @@ mse16x16_neon_loop
|
||||
vadd.u32 q7, q7, q8
|
||||
vadd.u32 q9, q9, q10
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
ldr r12, [sp, #16] ;load *sse from stack
|
||||
|
||||
vadd.u32 q10, q7, q9
|
||||
vpaddl.u32 q1, q10
|
||||
@ -71,6 +73,7 @@ mse16x16_neon_loop
|
||||
vst1.32 {d0[0]}, [r12]
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
vpop {q7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
@ -82,6 +85,8 @@ mse16x16_neon_loop
|
||||
; r2 unsigned char *ref_ptr,
|
||||
; r3 int recon_stride
|
||||
|vp8_get4x4sse_cs_neon| PROC
|
||||
vpush {q7}
|
||||
|
||||
vld1.8 {d0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d1}, [r0], r1
|
||||
@ -109,6 +114,8 @@ mse16x16_neon_loop
|
||||
vadd.u64 d0, d2, d3
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
vpop {q7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
Loading…
Reference in New Issue
Block a user