refine arm code for sum of frame
This commit is contained in:
parent
2b26a28d15
commit
e14186b535
@ -72,18 +72,17 @@ WELS_ASM_FUNC_END
|
||||
|
||||
WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
|
||||
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
|
||||
stmdb sp!, {r4-r8}
|
||||
ldr r5, [sp, #24] //pTimesOfFeatureValue
|
||||
ldr r4, [sp, #20] //pFeatureOfBlock
|
||||
stmdb sp!, {r4-r12}
|
||||
ldr r5, [sp, #40] //pTimesOfFeatureValue
|
||||
ldr r4, [sp, #36] //pFeatureOfBlock
|
||||
|
||||
mov r8, r0
|
||||
mov r6, r1
|
||||
add r8, r6
|
||||
add r4, r4, r6, lsl #1
|
||||
|
||||
_height_loop8x8:
|
||||
mov r7, r6
|
||||
_width_loop8x8:
|
||||
_width_loop8x8_1:
|
||||
subs r0, r8, r7
|
||||
vld1.64 {d0}, [r0], r3
|
||||
vld1.64 {d1}, [r0], r3
|
||||
@ -98,7 +97,6 @@ _width_loop8x8:
|
||||
vpadal.u8 q0, q1
|
||||
vpadal.u8 q0, q2
|
||||
vpadal.u8 q0, q3
|
||||
|
||||
vpaddl.u16 q0, q0
|
||||
vpadd.i32 d0, d1
|
||||
vpadd.i32 d0, d0
|
||||
@ -112,30 +110,65 @@ _width_loop8x8:
|
||||
str r0, [r1]
|
||||
|
||||
subs r7, #1
|
||||
bne _width_loop8x8
|
||||
bne _width_loop8x8_1
|
||||
|
||||
add r8, r3
|
||||
add r4, r4, r6, lsl #1
|
||||
subs r2, #1
|
||||
beq _SumOf8x8BlockOfFrame_end
|
||||
|
||||
|
||||
_height_loop8x8:
|
||||
mov r7, r6
|
||||
_width_loop8x8_2:
|
||||
subs r0, r8, r7
|
||||
subs r1, r4, r7, lsl #1
|
||||
|
||||
subs r9, r1, r6, lsl #1 // last line of pFeatureOfBlock[i]
|
||||
ldrh r10, [r9] // sum of last line of pFeatureOfBlock[i]
|
||||
|
||||
subs r11, r0, r3
|
||||
vld1.64 {d1}, [r11]
|
||||
add r0, r11, r3, lsl #3
|
||||
vld1.64 {d0}, [r0] //
|
||||
|
||||
vpaddl.u8 q0, q0
|
||||
vpadd.u16 d0, d0, d1
|
||||
vpaddl.u16 d0, d0
|
||||
vmov r11, r12, d0
|
||||
subs r10, r12
|
||||
add r0, r10, r11
|
||||
|
||||
strh r0, [r1] // sum -> pFeatureOfBlock[i]
|
||||
|
||||
add r1, r5, r0, lsl #2
|
||||
ldr r0, [r1]
|
||||
add r0, #1
|
||||
str r0, [r1]
|
||||
subs r7, #1
|
||||
bne _width_loop8x8_2
|
||||
|
||||
add r8, r3
|
||||
add r4, r4, r6, lsl #1
|
||||
subs r2, #1
|
||||
bne _height_loop8x8
|
||||
|
||||
ldmia sp!, {r4-r8}
|
||||
_SumOf8x8BlockOfFrame_end:
|
||||
ldmia sp!, {r4-r12}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
|
||||
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
|
||||
stmdb sp!, {r4-r8}
|
||||
ldr r5, [sp, #24] //pTimesOfFeatureValue
|
||||
ldr r4, [sp, #20] //pFeatureOfBlock
|
||||
stmdb sp!, {r4-r12}
|
||||
ldr r5, [sp, #40] //pTimesOfFeatureValue
|
||||
ldr r4, [sp, #36] //pFeatureOfBlock
|
||||
|
||||
mov r8, r0
|
||||
mov r6, r1
|
||||
add r8, r6
|
||||
add r4, r4, r6, lsl #1
|
||||
|
||||
_height_loop16x16:
|
||||
mov r7, r6
|
||||
_width_loop16x16:
|
||||
_width_loop16x16_1:
|
||||
subs r0, r8, r7
|
||||
vld1.64 {q0}, [r0], r3
|
||||
vpaddl.u8 q0, q0
|
||||
@ -156,13 +189,50 @@ _width_loop16x16:
|
||||
str r0, [r1]
|
||||
|
||||
subs r7, #1
|
||||
bne _width_loop16x16
|
||||
bne _width_loop16x16_1
|
||||
add r8, r3
|
||||
add r4, r4, r6, lsl #1
|
||||
subs r2, #1
|
||||
beq _SumOf16x16BlockOfFrame_neon_end
|
||||
|
||||
_height_loop16x16:
|
||||
mov r7, r6
|
||||
_width_loop16x16_2:
|
||||
subs r0, r8, r7
|
||||
subs r1, r4, r7, lsl #1
|
||||
subs r9, r1, r6, lsl #1 // last line of pFeatureOfBlock[i]
|
||||
ldrh r10, [r9] // sum of last line of pFeatureOfBlock[i]
|
||||
|
||||
subs r11, r0, r3
|
||||
vld1.64 {q1}, [r11]
|
||||
add r0, r11, r3, lsl #4
|
||||
vld1.64 {q0}, [r0] //
|
||||
|
||||
vpaddl.u8 q0, q0
|
||||
vpaddl.u8 q1, q1
|
||||
vpadd.u16 d0, d0, d1
|
||||
vpadd.u16 d1, d2, d3
|
||||
vpadd.u16 d0, d0, d1
|
||||
vpaddl.u16 d0, d0
|
||||
|
||||
vmov r11, r12, d0
|
||||
subs r10, r12
|
||||
add r0, r10, r11
|
||||
|
||||
strh r0, [r1] // sum -> pFeatureOfBlock[i]
|
||||
add r1, r5, r0, lsl #2
|
||||
ldr r0, [r1]
|
||||
add r0, #1
|
||||
str r0, [r1]
|
||||
|
||||
subs r7, #1
|
||||
bne _width_loop16x16_2
|
||||
|
||||
add r8, r3
|
||||
add r4, r4, r6, lsl #1
|
||||
subs r2, #1
|
||||
bne _height_loop16x16
|
||||
|
||||
ldmia sp!, {r4-r8}
|
||||
_SumOf16x16BlockOfFrame_neon_end:
|
||||
ldmia sp!, {r4-r12}
|
||||
WELS_ASM_FUNC_END
|
||||
#endif
|
||||
|
@ -72,9 +72,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
|
||||
add x8, x8, x6
|
||||
add x4, x4, x6, lsl #1
|
||||
|
||||
_height_loop8x8:
|
||||
mov x7, x6
|
||||
_width_loop8x8:
|
||||
_width_loop8x8_1:
|
||||
subs x0, x8, x7
|
||||
ld1 {v0.d}[0], [x0], x3
|
||||
ld1 {v0.d}[1], [x0], x3
|
||||
@ -100,13 +99,48 @@ _width_loop8x8:
|
||||
add w0, w0, #1
|
||||
str w0, [x1]
|
||||
subs x7, x7, #1
|
||||
cbnz x7, _width_loop8x8
|
||||
cbnz x7, _width_loop8x8_1
|
||||
|
||||
add x8, x8, x3
|
||||
add x4, x4, x6, lsl #1
|
||||
subs x2, x2, #1
|
||||
cbz x2, _SumOf8x8BlockOfFrame_AArch64_neon_end
|
||||
|
||||
_height_loop8x8:
|
||||
mov x7, x6
|
||||
_width_loop8x8_2:
|
||||
subs x0, x8, x7
|
||||
subs x1, x4, x7, lsl #1
|
||||
subs x9, x1, x6, lsl #1 // last line of pFeatureOfBlock[i]
|
||||
ldrh w10, [x9] // sum of last line of pFeatureOfBlock[i]
|
||||
|
||||
subs x11, x0, x3
|
||||
ld1 {v0.d}[1], [x11]
|
||||
add x0, x11, x3, lsl #3
|
||||
ld1 {v0.d}[0], [x0] //
|
||||
|
||||
uaddlp v0.8h, v0.16b
|
||||
addp v0.8h, v0.8h, v1.8h
|
||||
uaddlp v0.4s, v0.8h
|
||||
umov w11, v0.s[0]
|
||||
umov w12, v0.s[1]
|
||||
|
||||
subs w10, w10, w12
|
||||
mov x0, #0
|
||||
add w0, w10, w11
|
||||
strh w0, [x1] // sum -> pFeatureOfBlock[i]
|
||||
add x1, x5, x0, lsl #2
|
||||
ldr w0, [x1]
|
||||
add w0, w0, #1
|
||||
str w0, [x1]
|
||||
subs x7, x7, #1
|
||||
cbnz x7, _width_loop8x8_2
|
||||
|
||||
add x8, x8, x3
|
||||
add x4, x4, x6, lsl #1
|
||||
subs x2, x2, #1
|
||||
cbnz x2, _height_loop8x8
|
||||
|
||||
_SumOf8x8BlockOfFrame_AArch64_neon_end:
|
||||
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
|
||||
@ -119,9 +153,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
|
||||
add x8, x8, x6
|
||||
add x4, x4, x6, lsl #1
|
||||
|
||||
_height_loop16x16:
|
||||
mov x7, x6
|
||||
_width_loop16x16:
|
||||
_width_loop16x16_1:
|
||||
subs x0, x8, x7
|
||||
ld1 {v0.16b}, [x0], x3
|
||||
uaddlp v0.8h, v0.16b
|
||||
@ -141,11 +174,47 @@ _width_loop16x16:
|
||||
add w0, w0, #1
|
||||
str w0, [x1]
|
||||
subs x7, x7, #1
|
||||
cbnz x7, _width_loop16x16
|
||||
cbnz x7, _width_loop16x16_1
|
||||
|
||||
add x8, x8, x3
|
||||
add x4, x4, x6, lsl #1
|
||||
subs x2, x2, #1
|
||||
cbz x2, _SumOf16x16BlockOfFrame_AArch64_neon_end
|
||||
|
||||
_height_loop16x16:
|
||||
mov x7, x6
|
||||
_width_loop16x16_2:
|
||||
subs x0, x8, x7
|
||||
|
||||
subs x1, x4, x7, lsl #1
|
||||
subs x9, x1, x6, lsl #1 // last line of pFeatureOfBlock[i]
|
||||
ldrh w10, [x9] // sum of last line of pFeatureOfBlock[i]
|
||||
|
||||
subs x11, x0, x3
|
||||
ld1 {v1.16b}, [x11]
|
||||
add x0, x11, x3, lsl #4
|
||||
ld1 {v0.16b}, [x0] //
|
||||
|
||||
uaddlv h0, v0.16b
|
||||
uaddlv h1, v1.16b
|
||||
umov w11, v0.h[0]
|
||||
umov w12, v1.h[0]
|
||||
|
||||
subs w10, w10, w12
|
||||
mov x0, #0
|
||||
add w0, w10, w11
|
||||
strh w0, [x1] // sum -> pFeatureOfBlock[i]
|
||||
add x1, x5, x0, lsl #2
|
||||
ldr w0, [x1]
|
||||
add w0, w0, #1
|
||||
str w0, [x1]
|
||||
subs x7, x7, #1
|
||||
cbnz x7, _width_loop16x16_2
|
||||
|
||||
add x8, x8, x3
|
||||
add x4, x4, x6, lsl #1
|
||||
subs x2, x2, #1
|
||||
cbnz x2, _height_loop16x16
|
||||
_SumOf16x16BlockOfFrame_AArch64_neon_end:
|
||||
WELS_ASM_AARCH64_FUNC_END
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user