Merge pull request #1279 from dongzha/NewAddARMHash
add arm32/64 code for InitHash
This commit is contained in:
commit
e25a82b3d6
@ -235,4 +235,133 @@ _width_loop16x16_2:
|
||||
_SumOf16x16BlockOfFrame_neon_end:
|
||||
ldmia sp!, {r4-r12}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
// void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
//                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
//
// For every i in [0, kiListSize): stores the running buffer pointer into both
// pLocationOfFeature[i] and pFeatureValuePointerList[i], then advances that
// pointer by 4 * pTimesOfFeatureValue[i] bytes.
//
// Register roles:
//   r0 = pTimesOfFeatureValue (read cursor)
//   r1 = running pBuf pointer
//   r2 = kiListSize
//   r3 = pLocationOfFeature (write cursor)
//   r4 = pFeatureValuePointerList (write cursor; 5th argument, read from the stack)
//   r5 = entries left in the current loop, r6/r7 = scratch
// NEON scratch is limited to q0-q3 and q8, which are all caller-saved, so no
// vpush/vpop is required (q4-q7 are callee-saved and must not be clobbered).
    stmdb       sp!, {r4-r7}
    ldr         r4, [sp, #16]               // 5th argument sits just above the 4 saved registers
    bic         r5, r2, #3                  // r5 = kiListSize rounded down to a multiple of 4
    cmp         r5, #0
    beq         _hash_assign_rem_check      // fewer than 4 entries: the vector loop must not run at all

_hash_assign_loop_x4:
    vld1.64     {q0}, [r0]!                 // q0 = 4 consecutive counts
    vshl.u32    q0, q0, #2                  // counts -> byte sizes (count * 4)
    vceq.u32    q1, q0, #0                  // per-lane mask: all ones where the size is 0
    vand.i32    d2, d2, d3
    vmov        r6, r7, d2
    and         r6, r6, r7                  // r6 = AND of the four lane masks
    cmp         r6, #0xffffffff
    beq         _hash_assign_with_copy_x4   // all four sizes are zero: pointer does not move

    // Inclusive prefix sum of the four sizes:
    // q0 = {a0, a0+a1, a0+a1+a2, a0+a1+a2+a3}
    veor        q1, q1
    vext.32     q2, q1, q0, #3              // {0,  a0, a1, a2}
    vext.32     q3, q1, q0, #2              // {0,  0,  a0, a1}
    vext.32     q8, q1, q0, #1              // {0,  0,  0,  a0}
    vadd.u32    q0, q0, q2
    vadd.u32    q0, q0, q3
    vadd.u32    q0, q0, q8
    vext.32     q2, q1, q0, #3              // exclusive prefix sum = offset of each entry
    vdup.32     q3, r1
    vadd.u32    q2, q2, q3                  // four absolute pointers
    vst1.64     {q2}, [r3]!
    vst1.64     {q2}, [r4]!
    vmov.32     r6, d1[1]                   // total size of these four entries
    add         r1, r1, r6
    b           _assign_next

_hash_assign_with_copy_x4:
    vdup.32     q2, r1                      // all four entries share the current pointer
    vst1.64     {q2}, [r3]!
    vst1.64     {q2}, [r4]!

_assign_next:
    subs        r5, r5, #4
    bne         _hash_assign_loop_x4

_hash_assign_rem_check:
    and         r5, r2, #3                  // 0..3 trailing entries handled one at a time
    cmp         r5, #0
    beq         _hash_assign_end
_hash_assign_loop_x4_rem:
    str         r1, [r3], #4
    str         r1, [r4], #4
    ldr         r7, [r0], #4
    lsl         r7, r7, #2                  // count -> byte size
    add         r1, r1, r7
    subs        r5, r5, #1
    bne         _hash_assign_loop_x4_rem
_hash_assign_end:

    ldmia       sp!, {r4-r7}
WELS_ASM_FUNC_END
|
||||
|
||||
// Constant vectors loaded by FillQpelLocationByFeatureValue_neon (8 x uint16 each;
// only the low four lanes are non-zero).
// On ARM, .align takes a power-of-two exponent: ".align 4" gives 16-byte
// alignment, whereas ".align 16" would request 64 KiB alignment.
.align 4
mv_x_inc_x4:    .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00  // added to the x vector after every group of 4 entries
mv_y_inc_x4:    .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00  // added to the y vector after every row
mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00  // x vector at the start of every row
|
||||
|
||||
WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
// void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
//                                           uint16_t** pFeatureValuePointerList);
//
// Walks pFeatureOfBlock four entries at a time, kiWidth entries per row for
// kiHeight rows. For each feature value f it appends one 32-bit word
// (x in the low halfword, y in the high halfword) at the write cursor stored
// in pFeatureValuePointerList[f] and advances that cursor by 4 bytes.
//
// Register roles:
//   r0 = pFeatureOfBlock (read cursor)   r1 = kiWidth   r2 = rows left
//   r3 = pFeatureValuePointerList        r7 = entries left in the current row
//   r4 = address of the list slot, r5 = write cursor, r6 = packed (x, y) word
//   q2 = x vector, q3 = y vector, q5/q6/q7 = constants, q8 = list base (x4)
// Both loops stop only when their counter reaches exactly zero, so kiWidth is
// expected to be a positive multiple of 4 and kiHeight to be at least 1.
    stmdb       sp!, {r4-r8}
    vpush       {q4-q7}                     // q4-q7 are callee-saved

    adr         r7, mx_x_offset_x4
    vld1.64     {q5}, [r7]                  // x vector at the start of a row
    adr         r7, mv_y_inc_x4
    vld1.64     {q6}, [r7]                  // y step per row
    adr         r7, mv_x_inc_x4
    vld1.64     {q7}, [r7]                  // x step per group of 4 entries

    vdup.32     q8, r3                      // list base address in every lane
    veor        q3, q3, q3                  // y = 0
    veor        q4, q4, q4

_fill_qpel_row_loop:
    vmov        q2, q5                      // restart x for the new row
    mov         r7, r1

_fill_qpel_col_loop:
    vld1.64     {d0}, [r0]!                 // 4 feature values (uint16)
    vshll.u16   q0, d0, #2                  // widen and scale by the 4-byte pointer size
    vadd.u32    q0, q0, q8                  // q0 = &pFeatureValuePointerList[f] for each lane
    vmov        q1, q2
    vmov        q4, q3
    vzip.16     q1, q4                      // q1 = {x0,y0, x1,y1, x2,y2, x3,y3}

    // lane 0
    vmov.32     r4, d0[0]
    vmov.32     r6, d2[0]
    ldr         r5, [r4]
    str         r6, [r5], #4                // append the word, bump the cursor
    pld         [r5]
    str         r5, [r4]

    // lane 1
    vmov.32     r4, d0[1]
    vmov.32     r6, d2[1]
    ldr         r5, [r4]
    str         r6, [r5], #4
    pld         [r5]
    str         r5, [r4]

    // lane 2
    vmov.32     r4, d1[0]
    vmov.32     r6, d3[0]
    ldr         r5, [r4]
    str         r6, [r5], #4
    pld         [r5]
    str         r5, [r4]

    // lane 3
    vmov.32     r4, d1[1]
    vmov.32     r6, d3[1]
    ldr         r5, [r4]
    str         r6, [r5], #4
    pld         [r5]
    str         r5, [r4]

    vadd.u16    q2, q2, q7                  // advance x for the next 4 entries
    subs        r7, r7, #4
    bne         _fill_qpel_col_loop

    vadd.u16    q3, q3, q6                  // advance y for the next row
    subs        r2, r2, #1
    bne         _fill_qpel_row_loop

    vpop        {q4-q7}
    ldmia       sp!, {r4-r8}
WELS_ASM_FUNC_END
|
||||
#endif
|
||||
|
@ -217,4 +217,121 @@ _width_loop16x16_2:
|
||||
cbnz x2, _height_loop16x16
|
||||
_SumOf16x16BlockOfFrame_AArch64_neon_end:
|
||||
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
// void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
//                                             uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
//
// For every i in [0, kiListSize): stores the running buffer pointer into both
// pLocationOfFeature[i] and pFeatureValuePointerList[i] (8-byte slots), then
// advances that pointer by 4 * pTimesOfFeatureValue[i] bytes.
//
// Register roles:
//   x0 = pTimesOfFeatureValue (read cursor)   x1 = running pBuf pointer
//   x2 = kiListSize                           x3 = pLocationOfFeature (write cursor)
//   x4 = pFeatureValuePointerList (write cursor)
//   x5 = entries left in the current loop     x7, x8, x9 = scratch
//   v0-v3 = NEON scratch (caller-saved)
    sxtw    x2, w2                          // kiListSize is int32_t: the upper half of x2 is unspecified on entry
    mov     x9, #3
    bic     x5, x2, x9                      // x5 = kiListSize rounded down to a multiple of 4
    mov     x8, #0
    cbz     x5, _hash_assign_rem_check      // fewer than 4 entries: the vector loop must not run at all

_hash_assign_loop_x4:
    ld1     {v0.16b}, [x0], #16             // 4 consecutive counts
    shl     v0.4s, v0.4s, #2                // counts -> byte sizes (count * 4)
    addv    s1, v0.4s
    umov    w7, v1.s[0]                     // w7 = 32-bit sum of the four sizes
    cbz     w7, _hash_assign_with_copy_x4   // sum is zero: pointer does not move

    // Emit four pointers, advancing x1 by one entry's size after each.
    ins     v2.d[0], x1
    umov    w8, v0.s[0]
    add     x1, x1, x8
    ins     v2.d[1], x1
    umov    w8, v0.s[1]
    add     x1, x1, x8
    ins     v3.d[0], x1
    umov    w8, v0.s[2]
    add     x1, x1, x8
    ins     v3.d[1], x1
    umov    w8, v0.s[3]
    add     x1, x1, x8
    st1     {v2.16b, v3.16b}, [x3], #32
    st1     {v2.16b, v3.16b}, [x4], #32
    b       _assign_next

_hash_assign_with_copy_x4:
    dup     v2.2d, x1                       // all four entries share the current pointer
    dup     v3.2d, x1
    st1     {v2.16b, v3.16b}, [x3], #32
    st1     {v2.16b, v3.16b}, [x4], #32

_assign_next:
    subs    x5, x5, #4
    cbnz    x5, _hash_assign_loop_x4

_hash_assign_rem_check:
    and     x5, x2, x9                      // 0..3 trailing entries handled one at a time
    cbz     x5, _hash_assign_end

_hash_assign_loop_x4_rem:
    str     x1, [x3], #8
    str     x1, [x4], #8
    ldr     w8, [x0], #4
    lsl     w8, w8, #2                      // count -> byte size (the w-register write zero-extends into x8)
    add     x1, x1, x8
    subs    x5, x5, #1
    cbnz    x5, _hash_assign_loop_x4_rem

_hash_assign_end:
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// Constant vectors loaded by FillQpelLocationByFeatureValue_AArch64_neon (8 x uint16
// each; only the low four lanes are non-zero).
// .align takes a power-of-two exponent here: ".align 4" gives the 16-byte
// alignment a q-register load wants, whereas ".align 16" would request 64 KiB.
.align 4
mv_x_inc_x4:    .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00  // added to the x vector after every group of 4 entries
mv_y_inc_x4:    .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00  // added to the y vector after every row
mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00  // x vector at the start of every row
|
||||
|
||||
WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
// void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
//                                                   uint16_t** pFeatureValuePointerList);
//
// Walks pFeatureOfBlock four entries at a time, kiWidth entries per row for
// kiHeight rows. For each feature value f it appends one 32-bit word
// (x in the low halfword, y in the high halfword) at the write cursor stored
// in pFeatureValuePointerList[f] and advances that cursor by 4 bytes.
//
// Register roles:
//   x0 = pFeatureOfBlock (read cursor)   x1 = kiWidth   x2 = rows left
//   x3 = pFeatureValuePointerList        x7 = entries left in the current row
//   x4 = address of the list slot, x5 = write cursor, w6 = packed (x, y) word
//   v2 = x vector, v3 = y vector, v5/v6/v7 = constants, v16 = list base (x2)
// Both loops stop only when their counter reaches exactly zero, so kiWidth is
// expected to be a positive multiple of 4 and kiHeight to be at least 1.
    sxtw    x1, w1                          // kiWidth/kiHeight are int32_t: the upper halves of
    sxtw    x2, w2                          // x1/x2 are unspecified on entry
    ldr     q7, mv_x_inc_x4                 // x step per group of 4 entries
    ldr     q6, mv_y_inc_x4                 // y step per row
    ldr     q5, mx_x_offset_x4              // x vector at the start of a row

    eor     v3.16b, v3.16b, v3.16b          // y = 0
    dup     v16.2d, x3                      // list base address in both 64-bit lanes

_hash_height_loop:
    mov     x7, x1
    mov     v2.16b, v5.16b                  // restart x for the new row

_hash_width_loop:
    ld1     {v0.d}[0], [x0], #8             // 4 feature values (uint16)

    ushll   v0.4s, v0.4h, #3                // widen and scale by the 8-byte pointer size
    uaddw   v17.2d, v16.2d, v0.2s           // slot addresses for entries 0 and 1
    uaddw2  v18.2d, v16.2d, v0.4s           // slot addresses for entries 2 and 3
    zip1    v1.8h, v2.8h, v3.8h             // v1 = {x0,y0, x1,y1, x2,y2, x3,y3}

    // entry 0
    umov    x4, v17.d[0]
    ldr     x5, [x4]
    umov    w6, v1.s[0]
    str     w6, [x5]
    add     x5, x5, #4                      // bump the cursor past the appended word
    str     x5, [x4]

    // entry 1
    umov    x4, v17.d[1]
    ldr     x5, [x4]
    umov    w6, v1.s[1]
    str     w6, [x5]
    add     x5, x5, #4
    str     x5, [x4]

    // entry 2
    umov    x4, v18.d[0]
    ldr     x5, [x4]
    umov    w6, v1.s[2]
    str     w6, [x5]
    add     x5, x5, #4
    str     x5, [x4]

    // entry 3
    umov    x4, v18.d[1]
    ldr     x5, [x4]
    umov    w6, v1.s[3]
    str     w6, [x5]
    add     x5, x5, #4
    str     x5, [x4]

    add     v2.8h, v2.8h, v7.8h             // advance x for the next 4 entries
    subs    x7, x7, #4
    cbnz    x7, _hash_width_loop

    add     v3.8h, v3.8h, v6.8h             // advance y for the next row
    subs    x2, x2, #1
    cbnz    x2, _hash_height_loop
WELS_ASM_AARCH64_FUNC_END
|
||||
#endif
|
@ -271,6 +271,10 @@ void SumOf16x16BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, c
|
||||
#ifdef HAVE_NEON
|
||||
extern "C"
|
||||
{
|
||||
void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
|
||||
uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
|
||||
void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
|
||||
uint16_t** pFeatureValuePointerList);
|
||||
int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
|
||||
int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
|
||||
void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
|
||||
@ -285,6 +289,10 @@ void SumOf16x16BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, c
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
extern "C"
|
||||
{
|
||||
void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
|
||||
uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
|
||||
void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
|
||||
uint16_t** pFeatureValuePointerList);
|
||||
int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
|
||||
int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
|
||||
void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
|
||||
|
@ -125,6 +125,8 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
|
||||
#if defined (HAVE_NEON)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
//for feature search
|
||||
pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon;
|
||||
pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_neon;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
|
||||
//TODO: is it possible to special-case widths that are a multiple of 8, in order to speed up that case?
|
||||
@ -136,6 +138,8 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
|
||||
#if defined (HAVE_NEON_AARCH64)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
//for feature search
|
||||
pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_AArch64_neon;
|
||||
pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_AArch64_neon;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
|
||||
//TODO: is it possible to special-case widths that are a multiple of 8, in order to speed up that case?
|
||||
|
@ -281,6 +281,10 @@ GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
|
||||
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 10, 10)
|
||||
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 16, 16)
|
||||
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 640, 320)
|
||||
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 640, 320)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
@ -290,4 +294,8 @@ GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)
|
||||
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 10, 10)
|
||||
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 16, 16)
|
||||
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 640, 320)
|
||||
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 640, 320)
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user