Neon version of vp8_build_intra_predictors_mby_s() and
vp8_build_intra_predictors_mbuv_s()

This patch replaces the assembly versions with intrinsic versions. On a
Nexus 7, vpxenc (in realtime mode, speed -12) reported a performance
improvement of ~2.6%.

Change-Id: I9ef65bad929450c0215253fdae1c16c8b4a8f26f

parent 9293d267d2, commit dcbfacbb98
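For reference, the core of the DC predictor is a horizontal sum of the 16 above-row pixels. It maps almost one-for-one from the old assembly's vpaddl.u8/vpaddl.u16/vpaddl.u32 chain to the intrinsics in the new file; a minimal standalone sketch of that reduction (hypothetical helper name):

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical helper: pairwise-widening sum of 16 bytes, as used by the
 * DC_PRED path in both the removed assembly and the new intrinsic code. */
static unsigned int neon_sum16_u8(const uint8_t *row) {
  const uint8x16_t v = vld1q_u8(row);           /* 16 x u8  */
  const uint16x8_t a = vpaddlq_u8(v);           /*  8 x u16 */
  const uint32x4_t b = vpaddlq_u16(a);          /*  4 x u32 */
  const uint64x2_t c = vpaddlq_u32(b);          /*  2 x u64 */
  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                                vreinterpret_u32_u64(vget_high_u64(c)));
  return vget_lane_u32(d, 0);
}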
test/intrapred_test.cc
@@ -294,6 +294,11 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest,
                         ::testing::Values(
                             vp8_build_intra_predictors_mby_s_ssse3));
 #endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_neon));
+#endif
 
 typedef void (*IntraPredUvFunc)(MACROBLOCKD *x,
                                 uint8_t *uabove_row,
@@ -382,5 +387,10 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest,
                         ::testing::Values(
                             vp8_build_intra_predictors_mbuv_s_ssse3));
 #endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_neon));
+#endif
 
 }  // namespace
vp8/common/arm/neon/buildintrapredictorsmby_neon.asm (deleted, 595 lines)
@@ -1,595 +0,0 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_neon_func| PROC
    push            {r4-r8, lr}
    vpush           {d8-d15}

    cmp             r3, #0
    beq             case_dc_pred
    cmp             r3, #1
    beq             case_v_pred
    cmp             r3, #2
    beq             case_h_pred
    cmp             r3, #3
    beq             case_tm_pred

case_dc_pred
    ldr             r4, [sp, #88]       ; Up
    ldr             r5, [sp, #92]       ; Left

    ; Default the DC average to 128
    mov             r12, #128
    vdup.u8         q0, r12

    ; Zero out running sum
    mov             r12, #0

    ; compute shift and jump
    adds            r7, r4, r5
    beq             skip_dc_pred_up_left

    ; Load above row, if it exists
    cmp             r4, #0
    beq             skip_dc_pred_up

    sub             r6, r0, r2
    vld1.8          {q1}, [r6]
    vpaddl.u8       q2, q1
    vpaddl.u16      q3, q2
    vpaddl.u32      q4, q3

    ; Move back to integer registers
    vmov.32         r4, d8[0]
    vmov.32         r6, d9[0]

    add             r12, r4, r6

skip_dc_pred_up

    cmp             r5, #0
    beq             skip_dc_pred_left

    sub             r0, r0, #1

    ; Load left row, if it exists
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0]

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

skip_dc_pred_left
    add             r7, r7, #3          ; Shift
    sub             r4, r7, #1
    mov             r5, #1
    add             r12, r12, r5, lsl r4
    mov             r5, r12, lsr r7     ; expected_dc

    vdup.u8         q0, r5

skip_dc_pred_up_left
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!

    vpop            {d8-d15}
    pop             {r4-r8,pc}

case_v_pred
    ; Copy down above row
    sub             r6, r0, r2
    vld1.8          {q0}, [r6]

    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!

    vpop            {d8-d15}
    pop             {r4-r8,pc}

case_h_pred
    ; Load 4x yleft_col
    sub             r0, r0, #1

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    vpop            {d8-d15}
    pop             {r4-r8,pc}

case_tm_pred
    ; Load yabove_row
    sub             r3, r0, r2
    vld1.8          {q8}, [r3]

    ; Load ytop_left
    sub             r3, r3, #1
    ldrb            r7, [r3]

    vdup.u16        q7, r7

    ; Compute yabove_row - ytop_left
    mov             r3, #1
    vdup.u8         q0, r3

    vmull.u8        q4, d16, d0
    vmull.u8        q5, d17, d0

    vsub.s16        q4, q4, q7
    vsub.s16        q5, q5, q7

    ; Load 4x yleft_col
    sub             r0, r0, #1
    mov             r12, #4

case_tm_pred_loop
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u16        q0, r3
    vdup.u16        q1, r4
    vdup.u16        q2, r5
    vdup.u16        q3, r6

    vqadd.s16       q8, q0, q4
    vqadd.s16       q9, q0, q5

    vqadd.s16       q10, q1, q4
    vqadd.s16       q11, q1, q5

    vqadd.s16       q12, q2, q4
    vqadd.s16       q13, q2, q5

    vqadd.s16       q14, q3, q4
    vqadd.s16       q15, q3, q5

    vqshrun.s16     d0, q8, #0
    vqshrun.s16     d1, q9, #0

    vqshrun.s16     d2, q10, #0
    vqshrun.s16     d3, q11, #0

    vqshrun.s16     d4, q12, #0
    vqshrun.s16     d5, q13, #0

    vqshrun.s16     d6, q14, #0
    vqshrun.s16     d7, q15, #0

    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    subs            r12, r12, #1
    bne             case_tm_pred_loop

    vpop            {d8-d15}
    pop             {r4-r8,pc}

    ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_s_neon_func| PROC
    push            {r4-r8, lr}
    vpush           {d8-d15}

    mov             r1, r0      ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;

    cmp             r3, #0
    beq             case_dc_pred_s
    cmp             r3, #1
    beq             case_v_pred_s
    cmp             r3, #2
    beq             case_h_pred_s
    cmp             r3, #3
    beq             case_tm_pred_s

case_dc_pred_s
    ldr             r4, [sp, #88]       ; Up
    ldr             r5, [sp, #92]       ; Left

    ; Default the DC average to 128
    mov             r12, #128
    vdup.u8         q0, r12

    ; Zero out running sum
    mov             r12, #0

    ; compute shift and jump
    adds            r7, r4, r5
    beq             skip_dc_pred_up_left_s

    ; Load above row, if it exists
    cmp             r4, #0
    beq             skip_dc_pred_up_s

    sub             r6, r0, r2
    vld1.8          {q1}, [r6]
    vpaddl.u8       q2, q1
    vpaddl.u16      q3, q2
    vpaddl.u32      q4, q3

    ; Move back to integer registers
    vmov.32         r4, d8[0]
    vmov.32         r6, d9[0]

    add             r12, r4, r6

skip_dc_pred_up_s

    cmp             r5, #0
    beq             skip_dc_pred_left_s

    sub             r0, r0, #1

    ; Load left row, if it exists
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0]

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

skip_dc_pred_left_s
    add             r7, r7, #3          ; Shift
    sub             r4, r7, #1
    mov             r5, #1
    add             r12, r12, r5, lsl r4
    mov             r5, r12, lsr r7     ; expected_dc

    vdup.u8         q0, r5

skip_dc_pred_up_left_s
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2

    vpop            {d8-d15}
    pop             {r4-r8,pc}

case_v_pred_s
    ; Copy down above row
    sub             r6, r0, r2
    vld1.8          {q0}, [r6]

    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2

    vpop            {d8-d15}
    pop             {r4-r8,pc}

case_h_pred_s
    ; Load 4x yleft_col
    sub             r0, r0, #1

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    vpop            {d8-d15}
    pop             {r4-r8,pc}

case_tm_pred_s
    ; Load yabove_row
    sub             r3, r0, r2
    vld1.8          {q8}, [r3]

    ; Load ytop_left
    sub             r3, r3, #1
    ldrb            r7, [r3]

    vdup.u16        q7, r7

    ; Compute yabove_row - ytop_left
    mov             r3, #1
    vdup.u8         q0, r3

    vmull.u8        q4, d16, d0
    vmull.u8        q5, d17, d0

    vsub.s16        q4, q4, q7
    vsub.s16        q5, q5, q7

    ; Load 4x yleft_col
    sub             r0, r0, #1
    mov             r12, #4

case_tm_pred_loop_s
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u16        q0, r3
    vdup.u16        q1, r4
    vdup.u16        q2, r5
    vdup.u16        q3, r6

    vqadd.s16       q8, q0, q4
    vqadd.s16       q9, q0, q5

    vqadd.s16       q10, q1, q4
    vqadd.s16       q11, q1, q5

    vqadd.s16       q12, q2, q4
    vqadd.s16       q13, q2, q5

    vqadd.s16       q14, q3, q4
    vqadd.s16       q15, q3, q5

    vqshrun.s16     d0, q8, #0
    vqshrun.s16     d1, q9, #0

    vqshrun.s16     d2, q10, #0
    vqshrun.s16     d3, q11, #0

    vqshrun.s16     d4, q12, #0
    vqshrun.s16     d5, q13, #0

    vqshrun.s16     d6, q14, #0
    vqshrun.s16     d7, q15, #0

    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    subs            r12, r12, #1
    bne             case_tm_pred_loop_s

    vpop            {d8-d15}
    pop             {r4-r8,pc}

    ENDP

    END
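The rounding sequence at skip_dc_pred_left above (add r7, r7, #3 / add r12, r12, r5, lsl r4 / mov r5, r12, lsr r7) is a round-to-nearest division of the pixel sum by 16 or 32, the same computation the new intrinsic file expresses in C. A scalar sketch (hypothetical helper name):

/* Hypothetical scalar equivalent of the DC average above: shift is 4 when
 * only one neighbor row is available and 5 when both are (16 or 32 pixels). */
static int dc_expected_16x16(unsigned int sum, int up_available,
                             int left_available) {
  int shift = up_available + left_available;
  if (shift == 0)
    return 128;                       /* no neighbors: default mid-gray */
  shift += 3;
  return (int)((sum + (1u << (shift - 1))) >> shift);  /* rounded divide */
}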
vp8/common/arm/neon/reconintra_neon.c (new file, 210 lines)
@@ -0,0 +1,210 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "vp8/common/blockd.h"

void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
                                           unsigned char * yabove_row,
                                           unsigned char * yleft,
                                           int left_stride,
                                           unsigned char * ypred_ptr,
                                           int y_stride) {
  const int mode = x->mode_info_context->mbmi.mode;
  int i;

  switch (mode) {
    case DC_PRED:
    {
      int shift = x->up_available + x->left_available;
      uint8x16_t v_expected_dc = vdupq_n_u8(128);

      if (shift) {
        unsigned int average = 0;
        int expected_dc;
        if (x->up_available) {
          const uint8x16_t v_above = vld1q_u8(yabove_row);
          const uint16x8_t a = vpaddlq_u8(v_above);
          const uint32x4_t b = vpaddlq_u16(a);
          const uint64x2_t c = vpaddlq_u32(b);
          const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                                        vreinterpret_u32_u64(vget_high_u64(c)));
          average = vget_lane_u32(d, 0);
        }
        if (x->left_available) {
          for (i = 0; i < 16; ++i) {
            average += yleft[0];
            yleft += left_stride;
          }
        }
        shift += 3;
        expected_dc = (average + (1 << (shift - 1))) >> shift;
        v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
      }
      for (i = 0; i < 16; ++i) {
        vst1q_u8(ypred_ptr, v_expected_dc);
        ypred_ptr += y_stride;
      }
    }
    break;
    case V_PRED:
    {
      const uint8x16_t v_above = vld1q_u8(yabove_row);
      for (i = 0; i < 16; ++i) {
        vst1q_u8(ypred_ptr, v_above);
        ypred_ptr += y_stride;
      }
    }
    break;
    case H_PRED:
    {
      for (i = 0; i < 16; ++i) {
        const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
        yleft += left_stride;
        vst1q_u8(ypred_ptr, v_yleft);
        ypred_ptr += y_stride;
      }
    }
    break;
    case TM_PRED:
    {
      const uint16x8_t v_ytop_left = vmovq_n_u16((uint16_t)yabove_row[-1]);
      const uint8x16_t v_above = vld1q_u8(yabove_row);
      for (i = 0; i < 16; ++i) {
        const uint8x8_t v_yleft = vmov_n_u8((uint8_t)yleft[0]);
        const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
        const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
        const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
                                         vreinterpretq_s16_u16(v_ytop_left));
        const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
                                         vreinterpretq_s16_u16(v_ytop_left));
        const uint8x8_t pred_lo = vqmovun_s16(b_lo);
        const uint8x8_t pred_hi = vqmovun_s16(b_hi);

        vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
        ypred_ptr += y_stride;
        yleft += left_stride;
      }
    }
    break;
  }
}

void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
                                            unsigned char * uabove_row,
                                            unsigned char * vabove_row,
                                            unsigned char * uleft,
                                            unsigned char * vleft,
                                            int left_stride,
                                            unsigned char * upred_ptr,
                                            unsigned char * vpred_ptr,
                                            int pred_stride) {
  const int mode = x->mode_info_context->mbmi.uv_mode;
  int i;

  switch (mode) {
    case DC_PRED:
    {
      int shift = x->up_available + x->left_available;
      uint8x8_t v_expected_udc = vdup_n_u8(128);
      uint8x8_t v_expected_vdc = vdup_n_u8(128);

      if (shift) {
        unsigned int average_u = 0;
        unsigned int average_v = 0;
        int expected_udc;
        int expected_vdc;
        if (x->up_available) {
          const uint8x8_t v_uabove = vld1_u8(uabove_row);
          const uint8x8_t v_vabove = vld1_u8(vabove_row);
          const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
          const uint32x4_t b = vpaddlq_u16(a);
          const uint64x2_t c = vpaddlq_u32(b);
          average_u = vgetq_lane_u32(vreinterpretq_u32_u64(c), 0);
          average_v = vgetq_lane_u32(vreinterpretq_u32_u64(c), 2);
        }
        if (x->left_available) {
          for (i = 0; i < 8; ++i) {
            average_u += uleft[0];
            uleft += left_stride;
            average_v += vleft[0];
            vleft += left_stride;
          }
        }
        shift += 2;
        expected_udc = (average_u + (1 << (shift - 1))) >> shift;
        expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
        v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
        v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
      }
      for (i = 0; i < 8; ++i) {
        vst1_u8(upred_ptr, v_expected_udc);
        upred_ptr += pred_stride;
        vst1_u8(vpred_ptr, v_expected_vdc);
        vpred_ptr += pred_stride;
      }
    }
    break;
    case V_PRED:
    {
      const uint8x8_t v_uabove = vld1_u8(uabove_row);
      const uint8x8_t v_vabove = vld1_u8(vabove_row);
      for (i = 0; i < 8; ++i) {
        vst1_u8(upred_ptr, v_uabove);
        upred_ptr += pred_stride;
        vst1_u8(vpred_ptr, v_vabove);
        vpred_ptr += pred_stride;
      }
    }
    break;
    case H_PRED:
    {
      for (i = 0; i < 8; ++i) {
        const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
        const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
        uleft += left_stride;
        vleft += left_stride;
        vst1_u8(upred_ptr, v_uleft);
        upred_ptr += pred_stride;
        vst1_u8(vpred_ptr, v_vleft);
        vpred_ptr += pred_stride;
      }
    }
    break;
    case TM_PRED:
    {
      const uint16x8_t v_utop_left = vmovq_n_u16((uint16_t)uabove_row[-1]);
      const uint16x8_t v_vtop_left = vmovq_n_u16((uint16_t)vabove_row[-1]);
      const uint8x8_t v_uabove = vld1_u8(uabove_row);
      const uint8x8_t v_vabove = vld1_u8(vabove_row);
      for (i = 0; i < 8; ++i) {
        const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
        const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
        const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
        const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
        const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
                                        vreinterpretq_s16_u16(v_utop_left));
        const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
                                        vreinterpretq_s16_u16(v_vtop_left));
        const uint8x8_t pred_u = vqmovun_s16(b_u);
        const uint8x8_t pred_v = vqmovun_s16(b_v);

        vst1_u8(upred_ptr, pred_u);
        vst1_u8(vpred_ptr, pred_v);
        upred_ptr += pred_stride;
        vpred_ptr += pred_stride;
        uleft += left_stride;
        vleft += left_stride;
      }
    }
    break;
  }
}
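The TM_PRED arms above are the least self-explanatory: each output pixel is left + above - top_left, clamped to [0, 255], with the vqmovun_s16 narrowing supplying the clamp. A scalar reference sketch (hypothetical names, not part of the patch):

#include <stdint.h>

static uint8_t clamp_255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Hypothetical scalar equivalent of the TM_PRED arms above; size is 16 for
 * the luma plane and 8 for each chroma plane. */
static void tm_pred_ref(uint8_t *dst, int dst_stride, const uint8_t *above,
                        const uint8_t *left, int left_stride, int size) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c)
      dst[c] = clamp_255(left[0] + above[c] - top_left);
    dst += dst_stride;
    left += left_stride;
  }
}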
vp8/common/arm/reconintra_arm.c (deleted, 58 lines)
@@ -1,58 +0,0 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_NEON_ASM
extern void vp8_build_intra_predictors_mby_neon_func(
    unsigned char *y_buffer,
    unsigned char *ypred_ptr,
    int y_stride,
    int mode,
    int Up,
    int Left);

void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
{
    unsigned char *y_buffer = x->dst.y_buffer;
    unsigned char *ypred_ptr = x->predictor;
    int y_stride = x->dst.y_stride;
    int mode = x->mode_info_context->mbmi.mode;
    int Up = x->up_available;
    int Left = x->left_available;

    vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}

extern void vp8_build_intra_predictors_mby_s_neon_func(
    unsigned char *y_buffer,
    unsigned char *ypred_ptr,
    int y_stride,
    int mode,
    int Up,
    int Left);

void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
{
    unsigned char *y_buffer = x->dst.y_buffer;
    unsigned char *ypred_ptr = x->predictor;
    int y_stride = x->dst.y_stride;
    int mode = x->mode_info_context->mbmi.mode;
    int Up = x->up_available;
    int Left = x->left_available;

    vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}

#endif
vp8/common/rtcd_defs.pl
@@ -149,11 +149,10 @@ $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
 $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
 
 add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
-specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/;
-#TODO: fix assembly for neon
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon/;
 
 add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
-specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/;
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon/;
 
 add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
 specialize qw/vp8_intra4x4_predict media/;
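Adding neon to the specialize lines wires the new functions into run-time CPU dispatch. Conceptually, a simplified sketch of what that dispatch amounts to, assuming the usual _c fallback naming convention (this is not the generated code itself):

struct macroblockd;

/* Prototypes follow the add_proto line above. */
extern void vp8_build_intra_predictors_mby_s_c(
    struct macroblockd *x, unsigned char *yabove_row, unsigned char *yleft,
    int left_stride, unsigned char *ypred_ptr, int y_stride);
extern void vp8_build_intra_predictors_mby_s_neon(
    struct macroblockd *x, unsigned char *yabove_row, unsigned char *yleft,
    int left_stride, unsigned char *ypred_ptr, int y_stride);

/* The RTCD pointer starts at the C version and is repointed once the CPU
 * has been probed; callers always go through the pointer. */
void (*vp8_build_intra_predictors_mby_s)(
    struct macroblockd *, unsigned char *, unsigned char *, int,
    unsigned char *, int) = vp8_build_intra_predictors_mby_s_c;

static void rtcd_setup(int have_neon) {
  if (have_neon)
    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_neon;
}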
vp8/vp8_common.mk
@@ -156,9 +156,7 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_
 VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 
 # common (neon)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/reconintra_arm.c
 VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_blk_neon.c
 
 # common (neon intrinsics)
@@ -172,6 +170,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c