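Refine ARM NEON code for SumOf8x8BlockOfFrame / SumOf16x16BlockOfFrame: after the first row, derive each block sum from the previous row's sum (subtract the top pixel row, add the new bottom pixel row)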
This commit is contained in:
@@ -72,18 +72,17 @@ WELS_ASM_FUNC_END
|
|||||||
|
|
||||||
WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
|
WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
|
||||||
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
|
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
|
||||||
stmdb sp!, {r4-r8}
|
stmdb sp!, {r4-r12}
|
||||||
ldr r5, [sp, #24] //pTimesOfFeatureValue
|
ldr r5, [sp, #40] //pTimesOfFeatureValue
|
||||||
ldr r4, [sp, #20] //pFeatureOfBlock
|
ldr r4, [sp, #36] //pFeatureOfBlock
|
||||||
|
|
||||||
mov r8, r0
|
mov r8, r0
|
||||||
mov r6, r1
|
mov r6, r1
|
||||||
add r8, r6
|
add r8, r6
|
||||||
add r4, r4, r6, lsl #1
|
add r4, r4, r6, lsl #1
|
||||||
|
|
||||||
_height_loop8x8:
|
|
||||||
mov r7, r6
|
mov r7, r6
|
||||||
_width_loop8x8:
|
_width_loop8x8_1:
|
||||||
subs r0, r8, r7
|
subs r0, r8, r7
|
||||||
vld1.64 {d0}, [r0], r3
|
vld1.64 {d0}, [r0], r3
|
||||||
vld1.64 {d1}, [r0], r3
|
vld1.64 {d1}, [r0], r3
|
||||||
@@ -98,7 +97,6 @@ _width_loop8x8:
|
|||||||
vpadal.u8 q0, q1
|
vpadal.u8 q0, q1
|
||||||
vpadal.u8 q0, q2
|
vpadal.u8 q0, q2
|
||||||
vpadal.u8 q0, q3
|
vpadal.u8 q0, q3
|
||||||
|
|
||||||
vpaddl.u16 q0, q0
|
vpaddl.u16 q0, q0
|
||||||
vpadd.i32 d0, d1
|
vpadd.i32 d0, d1
|
||||||
vpadd.i32 d0, d0
|
vpadd.i32 d0, d0
|
||||||
@@ -112,30 +110,65 @@ _width_loop8x8:
|
|||||||
str r0, [r1]
|
str r0, [r1]
|
||||||
|
|
||||||
subs r7, #1
|
subs r7, #1
|
||||||
bne _width_loop8x8
|
bne _width_loop8x8_1
|
||||||
|
|
||||||
|
add r8, r3
|
||||||
|
add r4, r4, r6, lsl #1
|
||||||
|
subs r2, #1
|
||||||
|
beq _SumOf8x8BlockOfFrame_end
|
||||||
|
|
||||||
|
|
||||||
|
_height_loop8x8:
|
||||||
|
mov r7, r6
|
||||||
|
_width_loop8x8_2:
|
||||||
|
subs r0, r8, r7
|
||||||
|
subs r1, r4, r7, lsl #1
|
||||||
|
|
||||||
|
subs r9, r1, r6, lsl #1 // last line of pFeatureOfBlock[i]
|
||||||
|
ldrh r10, [r9] // sum of last line of pFeatureOfBlock[i]
|
||||||
|
|
||||||
|
subs r11, r0, r3
|
||||||
|
vld1.64 {d1}, [r11]
|
||||||
|
add r0, r11, r3, lsl #3
|
||||||
|
vld1.64 {d0}, [r0] //
|
||||||
|
|
||||||
|
vpaddl.u8 q0, q0
|
||||||
|
vpadd.u16 d0, d0, d1
|
||||||
|
vpaddl.u16 d0, d0
|
||||||
|
vmov r11, r12, d0
|
||||||
|
subs r10, r12
|
||||||
|
add r0, r10, r11
|
||||||
|
|
||||||
|
strh r0, [r1] // sum -> pFeatureOfBlock[i]
|
||||||
|
|
||||||
|
add r1, r5, r0, lsl #2
|
||||||
|
ldr r0, [r1]
|
||||||
|
add r0, #1
|
||||||
|
str r0, [r1]
|
||||||
|
subs r7, #1
|
||||||
|
bne _width_loop8x8_2
|
||||||
|
|
||||||
add r8, r3
|
add r8, r3
|
||||||
add r4, r4, r6, lsl #1
|
add r4, r4, r6, lsl #1
|
||||||
subs r2, #1
|
subs r2, #1
|
||||||
bne _height_loop8x8
|
bne _height_loop8x8
|
||||||
|
_SumOf8x8BlockOfFrame_end:
|
||||||
ldmia sp!, {r4-r8}
|
ldmia sp!, {r4-r12}
|
||||||
WELS_ASM_FUNC_END
|
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
|
WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
|
||||||
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
|
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
|
||||||
stmdb sp!, {r4-r8}
|
stmdb sp!, {r4-r12}
|
||||||
ldr r5, [sp, #24] //pTimesOfFeatureValue
|
ldr r5, [sp, #40] //pTimesOfFeatureValue
|
||||||
ldr r4, [sp, #20] //pFeatureOfBlock
|
ldr r4, [sp, #36] //pFeatureOfBlock
|
||||||
|
|
||||||
mov r8, r0
|
mov r8, r0
|
||||||
mov r6, r1
|
mov r6, r1
|
||||||
add r8, r6
|
add r8, r6
|
||||||
add r4, r4, r6, lsl #1
|
add r4, r4, r6, lsl #1
|
||||||
|
|
||||||
_height_loop16x16:
|
|
||||||
mov r7, r6
|
mov r7, r6
|
||||||
_width_loop16x16:
|
_width_loop16x16_1:
|
||||||
subs r0, r8, r7
|
subs r0, r8, r7
|
||||||
vld1.64 {q0}, [r0], r3
|
vld1.64 {q0}, [r0], r3
|
||||||
vpaddl.u8 q0, q0
|
vpaddl.u8 q0, q0
|
||||||
@@ -156,13 +189,50 @@ _width_loop16x16:
|
|||||||
str r0, [r1]
|
str r0, [r1]
|
||||||
|
|
||||||
subs r7, #1
|
subs r7, #1
|
||||||
bne _width_loop16x16
|
bne _width_loop16x16_1
|
||||||
|
add r8, r3
|
||||||
|
add r4, r4, r6, lsl #1
|
||||||
|
subs r2, #1
|
||||||
|
beq _SumOf16x16BlockOfFrame_neon_end
|
||||||
|
|
||||||
|
_height_loop16x16:
|
||||||
|
mov r7, r6
|
||||||
|
_width_loop16x16_2:
|
||||||
|
subs r0, r8, r7
|
||||||
|
subs r1, r4, r7, lsl #1
|
||||||
|
subs r9, r1, r6, lsl #1 // last line of pFeatureOfBlock[i]
|
||||||
|
ldrh r10, [r9] // sum of last line of pFeatureOfBlock[i]
|
||||||
|
|
||||||
|
subs r11, r0, r3
|
||||||
|
vld1.64 {q1}, [r11]
|
||||||
|
add r0, r11, r3, lsl #4
|
||||||
|
vld1.64 {q0}, [r0] //
|
||||||
|
|
||||||
|
vpaddl.u8 q0, q0
|
||||||
|
vpaddl.u8 q1, q1
|
||||||
|
vpadd.u16 d0, d0, d1
|
||||||
|
vpadd.u16 d1, d2, d3
|
||||||
|
vpadd.u16 d0, d0, d1
|
||||||
|
vpaddl.u16 d0, d0
|
||||||
|
|
||||||
|
vmov r11, r12, d0
|
||||||
|
subs r10, r12
|
||||||
|
add r0, r10, r11
|
||||||
|
|
||||||
|
strh r0, [r1] // sum -> pFeatureOfBlock[i]
|
||||||
|
add r1, r5, r0, lsl #2
|
||||||
|
ldr r0, [r1]
|
||||||
|
add r0, #1
|
||||||
|
str r0, [r1]
|
||||||
|
|
||||||
|
subs r7, #1
|
||||||
|
bne _width_loop16x16_2
|
||||||
|
|
||||||
add r8, r3
|
add r8, r3
|
||||||
add r4, r4, r6, lsl #1
|
add r4, r4, r6, lsl #1
|
||||||
subs r2, #1
|
subs r2, #1
|
||||||
bne _height_loop16x16
|
bne _height_loop16x16
|
||||||
|
_SumOf16x16BlockOfFrame_neon_end:
|
||||||
ldmia sp!, {r4-r8}
|
ldmia sp!, {r4-r12}
|
||||||
WELS_ASM_FUNC_END
|
WELS_ASM_FUNC_END
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -72,9 +72,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
|
|||||||
add x8, x8, x6
|
add x8, x8, x6
|
||||||
add x4, x4, x6, lsl #1
|
add x4, x4, x6, lsl #1
|
||||||
|
|
||||||
_height_loop8x8:
|
|
||||||
mov x7, x6
|
mov x7, x6
|
||||||
_width_loop8x8:
|
_width_loop8x8_1:
|
||||||
subs x0, x8, x7
|
subs x0, x8, x7
|
||||||
ld1 {v0.d}[0], [x0], x3
|
ld1 {v0.d}[0], [x0], x3
|
||||||
ld1 {v0.d}[1], [x0], x3
|
ld1 {v0.d}[1], [x0], x3
|
||||||
@@ -100,13 +99,48 @@ _width_loop8x8:
|
|||||||
add w0, w0, #1
|
add w0, w0, #1
|
||||||
str w0, [x1]
|
str w0, [x1]
|
||||||
subs x7, x7, #1
|
subs x7, x7, #1
|
||||||
cbnz x7, _width_loop8x8
|
cbnz x7, _width_loop8x8_1
|
||||||
|
|
||||||
|
add x8, x8, x3
|
||||||
|
add x4, x4, x6, lsl #1
|
||||||
|
subs x2, x2, #1
|
||||||
|
cbz x2, _SumOf8x8BlockOfFrame_AArch64_neon_end
|
||||||
|
|
||||||
|
_height_loop8x8:
|
||||||
|
mov x7, x6
|
||||||
|
_width_loop8x8_2:
|
||||||
|
subs x0, x8, x7
|
||||||
|
subs x1, x4, x7, lsl #1
|
||||||
|
subs x9, x1, x6, lsl #1 // last line of pFeatureOfBlock[i]
|
||||||
|
ldrh w10, [x9] // sum of last line of pFeatureOfBlock[i]
|
||||||
|
|
||||||
|
subs x11, x0, x3
|
||||||
|
ld1 {v0.d}[1], [x11]
|
||||||
|
add x0, x11, x3, lsl #3
|
||||||
|
ld1 {v0.d}[0], [x0] //
|
||||||
|
|
||||||
|
uaddlp v0.8h, v0.16b
|
||||||
|
addp v0.8h, v0.8h, v1.8h
|
||||||
|
uaddlp v0.4s, v0.8h
|
||||||
|
umov w11, v0.s[0]
|
||||||
|
umov w12, v0.s[1]
|
||||||
|
|
||||||
|
subs w10, w10, w12
|
||||||
|
mov x0, #0
|
||||||
|
add w0, w10, w11
|
||||||
|
strh w0, [x1] // sum -> pFeatureOfBlock[i]
|
||||||
|
add x1, x5, x0, lsl #2
|
||||||
|
ldr w0, [x1]
|
||||||
|
add w0, w0, #1
|
||||||
|
str w0, [x1]
|
||||||
|
subs x7, x7, #1
|
||||||
|
cbnz x7, _width_loop8x8_2
|
||||||
|
|
||||||
add x8, x8, x3
|
add x8, x8, x3
|
||||||
add x4, x4, x6, lsl #1
|
add x4, x4, x6, lsl #1
|
||||||
subs x2, x2, #1
|
subs x2, x2, #1
|
||||||
cbnz x2, _height_loop8x8
|
cbnz x2, _height_loop8x8
|
||||||
|
_SumOf8x8BlockOfFrame_AArch64_neon_end:
|
||||||
WELS_ASM_AARCH64_FUNC_END
|
WELS_ASM_AARCH64_FUNC_END
|
||||||
|
|
||||||
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
|
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
|
||||||
@@ -119,9 +153,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
|
|||||||
add x8, x8, x6
|
add x8, x8, x6
|
||||||
add x4, x4, x6, lsl #1
|
add x4, x4, x6, lsl #1
|
||||||
|
|
||||||
_height_loop16x16:
|
|
||||||
mov x7, x6
|
mov x7, x6
|
||||||
_width_loop16x16:
|
_width_loop16x16_1:
|
||||||
subs x0, x8, x7
|
subs x0, x8, x7
|
||||||
ld1 {v0.16b}, [x0], x3
|
ld1 {v0.16b}, [x0], x3
|
||||||
uaddlp v0.8h, v0.16b
|
uaddlp v0.8h, v0.16b
|
||||||
@@ -141,11 +174,47 @@ _width_loop16x16:
|
|||||||
add w0, w0, #1
|
add w0, w0, #1
|
||||||
str w0, [x1]
|
str w0, [x1]
|
||||||
subs x7, x7, #1
|
subs x7, x7, #1
|
||||||
cbnz x7, _width_loop16x16
|
cbnz x7, _width_loop16x16_1
|
||||||
|
|
||||||
|
add x8, x8, x3
|
||||||
|
add x4, x4, x6, lsl #1
|
||||||
|
subs x2, x2, #1
|
||||||
|
cbz x2, _SumOf16x16BlockOfFrame_AArch64_neon_end
|
||||||
|
|
||||||
|
_height_loop16x16:
|
||||||
|
mov x7, x6
|
||||||
|
_width_loop16x16_2:
|
||||||
|
subs x0, x8, x7
|
||||||
|
|
||||||
|
subs x1, x4, x7, lsl #1
|
||||||
|
subs x9, x1, x6, lsl #1 // last line of pFeatureOfBlock[i]
|
||||||
|
ldrh w10, [x9] // sum of last line of pFeatureOfBlock[i]
|
||||||
|
|
||||||
|
subs x11, x0, x3
|
||||||
|
ld1 {v1.16b}, [x11]
|
||||||
|
add x0, x11, x3, lsl #4
|
||||||
|
ld1 {v0.16b}, [x0] //
|
||||||
|
|
||||||
|
uaddlv h0, v0.16b
|
||||||
|
uaddlv h1, v1.16b
|
||||||
|
umov w11, v0.h[0]
|
||||||
|
umov w12, v1.h[0]
|
||||||
|
|
||||||
|
subs w10, w10, w12
|
||||||
|
mov x0, #0
|
||||||
|
add w0, w10, w11
|
||||||
|
strh w0, [x1] // sum -> pFeatureOfBlock[i]
|
||||||
|
add x1, x5, x0, lsl #2
|
||||||
|
ldr w0, [x1]
|
||||||
|
add w0, w0, #1
|
||||||
|
str w0, [x1]
|
||||||
|
subs x7, x7, #1
|
||||||
|
cbnz x7, _width_loop16x16_2
|
||||||
|
|
||||||
add x8, x8, x3
|
add x8, x8, x3
|
||||||
add x4, x4, x6, lsl #1
|
add x4, x4, x6, lsl #1
|
||||||
subs x2, x2, #1
|
subs x2, x2, #1
|
||||||
cbnz x2, _height_loop16x16
|
cbnz x2, _height_loop16x16
|
||||||
|
_SumOf16x16BlockOfFrame_AArch64_neon_end:
|
||||||
WELS_ASM_AARCH64_FUNC_END
|
WELS_ASM_AARCH64_FUNC_END
|
||||||
#endif
|
#endif
|
||||||
Reference in New Issue
Block a user