Merge pull request #1279 from dongzha/NewAddARMHash

add arm32/64 code for InitHash
This commit is contained in:
zhilwang 2014-08-15 14:14:51 +08:00
commit e25a82b3d6
5 changed files with 266 additions and 0 deletions

View File

@ -235,4 +235,133 @@ _width_loop16x16_2:
_SumOf16x16BlockOfFrame_neon_end:
ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
// InitializeHashforFeature_neon (ARM32 / NEON)
//
// C prototype:
//   void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
//                                       uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
//
// For every i in [0, kiListSize) the current buffer address is stored to both
// pLocationOfFeature[i] and pFeatureValuePointerList[i]; the address is then
// advanced by pTimesOfFeatureValue[i] * 4 bytes.
//
// Register roles:
//   r0 = pTimesOfFeatureValue (read cursor)
//   r1 = running buffer address, starts at pBuf
//   r2 = kiListSize
//   r3 = pLocationOfFeature (write cursor)
//   r4 = pFeatureValuePointerList (write cursor; 5th argument, read from the stack)
//   r5 = entries left in the loop currently running
//   r6, r7 = scratch
// NEON scratch: q0-q3 and q8 only. d8-d15 (q4-q7) are callee-saved under AAPCS
// and are deliberately not touched, so no vpush/vpop is needed.
WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
    stmdb   sp!, {r4-r7}
    ldr     r4, [sp, #16]               // 5th argument sits just above the 4 saved registers
    cmp     r2, #0
    ble     _hash_assign_end            // empty (or negative) list: nothing to write
    bics    r5, r2, #3                  // r5 = kiListSize rounded down to a multiple of 4
    beq     _hash_assign_rem_check      // fewer than 4 entries: skip the 4-wide loop entirely

_hash_assign_loop_x4:
    vld1.64 {q0}, [r0]!                 // four 32-bit counts
    vshl.u32 q0, q0, #2                 // s[i] = count[i] * 4, the byte step added to r1
    vceq.u32 q1, q0, #0
    vand.i32 d2, d2, d3
    vmov    r6, r7, d2
    and     r6, r6, r7
    cmp     r6, #0xffffffff             // all four lanes compared equal to zero?
    beq     _hash_assign_with_copy_x4

    veor    q1, q1
    vext.32 q2, q1, q0, #3              // (0,  s0, s1, s2)
    vext.32 q3, q1, q0, #2              // (0,  0,  s0, s1)
    vext.32 q8, q1, q0, #1              // (0,  0,  0,  s0)
    vadd.u32 q0, q0, q2
    vadd.u32 q0, q0, q3
    vadd.u32 q0, q0, q8                 // q0 = inclusive prefix sums of s0..s3
    vext.32 q2, q1, q0, #3              // shift in a zero: exclusive prefix sums
    vdup.32 q3, r1
    vadd.u32 q2, q2, q3                 // four consecutive start addresses
    vst1.64 {q2}, [r3]!
    vst1.64 {q2}, [r4]!
    vmov.32 r6, d1[1]                   // last lane of q0 = s0+s1+s2+s3
    add     r1, r1, r6
    b       _assign_next

_hash_assign_with_copy_x4:
    // All four steps are zero: the four entries share the current address.
    vdup.32 q2, r1
    vst1.64 {q2}, [r3]!
    vst1.64 {q2}, [r4]!

_assign_next:
    subs    r5, r5, #4
    bne     _hash_assign_loop_x4

_hash_assign_rem_check:
    and     r5, r2, #3                  // 0..3 trailing entries handled one at a time
    cmp     r5, #0
    beq     _hash_assign_end

_hash_assign_loop_x4_rem:
    str     r1, [r3], #4
    str     r1, [r4], #4
    ldr     r7, [r0], #4
    lsl     r7, r7, #2
    add     r1, r1, r7
    subs    r5, r5, #1
    bne     _hash_assign_loop_x4_rem

_hash_assign_end:
    ldmia   sp!, {r4-r7}
WELS_ASM_FUNC_END
// Constant tables for FillQpelLocationByFeatureValue_neon.
// NOTE: on ARM the argument of .align is a power of two, so ".align 4" gives
// 16-byte alignment (".align 16" would request 64 KiB).
.align 4
mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00

// FillQpelLocationByFeatureValue_neon (ARM32 / NEON)
//
// C prototype:
//   void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth,
//                                             const int32_t kiHeight, uint16_t** pFeatureValuePointerList);
//
// For each 16-bit feature value f read from pFeatureOfBlock, a 32-bit word
// (x offset in the low half, y offset in the high half) is stored through the
// pointer held in pFeatureValuePointerList[f], and that pointer is advanced by
// 4 bytes. The x offset grows by 4 per element, the y offset by 4 per row.
//
// Register roles:
//   r0 = pFeatureOfBlock (read cursor)      r1 = kiWidth
//   r2 = kiHeight (row counter)             r3 = pFeatureValuePointerList
//   r4 = address of the list slot           r5 = pointer loaded from that slot
//   r6 = packed (y << 16) | x word          r7 = elements left in the row
//   q2 = x offsets of the current 4 elements, q3 = y offset of the current row
//   q5/q6/q7 = constant tables, q8 = list base replicated in every lane
//
// NOTE(review): both loops are do-while and the inner one steps by 4, so this
// appears to require kiHeight > 0 and kiWidth a positive multiple of 4 - confirm
// against the callers.
WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
    stmdb   sp!, {r4-r7}
    vpush   {q4-q7}                     // d8-d15 are callee-saved
    adr     r7, mv_x_inc_x4
    vld1.64 {q7}, [r7]
    adr     r7, mv_y_inc_x4
    vld1.64 {q6}, [r7]
    adr     r7, mx_x_offset_x4
    vld1.64 {q5}, [r7]
    veor    q4, q4
    veor    q3, q3                      // y offset starts at 0
    vdup.32 q8, r3

_hash_height_loop:
    mov     r7, r1
    vmov    q2, q5                      // restart x offsets at (0, 4, 8, 12)

_hash_width_loop:
    vld1.64 {d0}, [r0]!                 // four 16-bit feature values
    vshll.u16 q0, d0, #2                // feature * 4 = byte offset of a 32-bit pointer slot
    vadd.u32 q0, q8                     // q0 = four slot addresses
    vmov    q1, q2
    vmov    q4, q3
    vzip.16 q1, q4                      // q1 lanes become (x | y << 16) for the 4 elements

    // element 0
    vmov.32 r4, d0[0]
    ldr     r5, [r4]
    vmov.32 r6, d2[0]
    str     r6, [r5]
    add     r5, r5, #4
    pld     [r5]                        // cache miss?
    str     r5, [r4]

    // element 1
    vmov.32 r4, d0[1]
    ldr     r5, [r4]
    vmov.32 r6, d2[1]
    str     r6, [r5]
    add     r5, r5, #4
    pld     [r5]                        // cache miss?
    str     r5, [r4]

    // element 2
    vmov.32 r4, d1[0]
    ldr     r5, [r4]
    vmov.32 r6, d3[0]
    str     r6, [r5]
    add     r5, r5, #4
    pld     [r5]                        // cache miss?
    str     r5, [r4]

    // element 3
    vmov.32 r4, d1[1]
    ldr     r5, [r4]
    vmov.32 r6, d3[1]
    str     r6, [r5]
    add     r5, r5, #4
    pld     [r5]                        // cache miss?
    str     r5, [r4]

    vadd.u16 q2, q2, q7                 // x offsets += 0x10 for the next 4 elements
    subs    r7, #4
    bne     _hash_width_loop

    vadd.u16 q3, q3, q6                 // y offset += 4 for the next row
    subs    r2, #1
    bne     _hash_height_loop

    vpop    {q4-q7}
    ldmia   sp!, {r4-r7}
WELS_ASM_FUNC_END
#endif

View File

@ -217,4 +217,121 @@ _width_loop16x16_2:
cbnz x2, _height_loop16x16
_SumOf16x16BlockOfFrame_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END
// InitializeHashforFeature_AArch64_neon
//
// C prototype:
//   void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
//                                               uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
//
// For every i in [0, kiListSize) the current buffer address is stored (as a
// 64-bit pointer) to both pLocationOfFeature[i] and pFeatureValuePointerList[i];
// the address is then advanced by pTimesOfFeatureValue[i] * 4 bytes.
//
// Register roles:
//   x0 = pTimesOfFeatureValue (read cursor)
//   x1 = running buffer address, starts at pBuf
//   x2 = kiListSize (sign-extended below)
//   x3 = pLocationOfFeature (write cursor)
//   x4 = pFeatureValuePointerList (write cursor)
//   x5 = entries left in the loop currently running
//   w7/x8 = scratch, x9 = constant 3
//   v0-v3 = scratch (caller-saved)
WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
    cmp     w2, #0
    b.le    _hash_assign_end            // empty (or negative) list: nothing to write
    sxtw    x2, w2                      // int32_t argument: upper half of x2 is not guaranteed
    mov     x9, #3
    bic     x5, x2, x9                  // x5 = kiListSize rounded down to a multiple of 4
    cbz     x5, _hash_assign_rem_check  // fewer than 4 entries: skip the 4-wide loop entirely

_hash_assign_loop_x4:
    ld1     {v0.16b}, [x0], #16         // four 32-bit counts
    shl     v0.4s, v0.4s, #2            // s[i] = count[i] * 4, the byte step added to x1
    addv    s1, v0.4s                   // 32-bit sum of the four steps
    umov    w7, v1.s[0]
    cbz     w7, _hash_assign_with_copy_x4   // sum == 0 is taken to mean all four steps are 0

    // Four consecutive start addresses; umov into w8 zero-extends into x8.
    ins     v2.d[0], x1
    umov    w8, v0.s[0]
    add     x1, x1, x8
    ins     v2.d[1], x1
    umov    w8, v0.s[1]
    add     x1, x1, x8
    ins     v3.d[0], x1
    umov    w8, v0.s[2]
    add     x1, x1, x8
    ins     v3.d[1], x1
    umov    w8, v0.s[3]
    add     x1, x1, x8
    st1     {v2.16b, v3.16b}, [x3], #32
    st1     {v2.16b, v3.16b}, [x4], #32
    b       _assign_next

_hash_assign_with_copy_x4:
    // The four entries share the current address.
    dup     v2.2d, x1
    dup     v3.2d, x1
    st1     {v2.16b, v3.16b}, [x3], #32
    st1     {v2.16b, v3.16b}, [x4], #32

_assign_next:
    sub     x5, x5, #4
    cbnz    x5, _hash_assign_loop_x4

_hash_assign_rem_check:
    and     x5, x2, x9                  // 0..3 trailing entries handled one at a time
    cbz     x5, _hash_assign_end

_hash_assign_loop_x4_rem:
    str     x1, [x3], #8
    str     x1, [x4], #8
    ldr     w8, [x0], #4                // writing w8 clears the upper half of x8
    lsl     w8, w8, #2
    add     x1, x1, x8
    sub     x5, x5, #1
    cbnz    x5, _hash_assign_loop_x4_rem

_hash_assign_end:
WELS_ASM_AARCH64_FUNC_END
// Constant tables for FillQpelLocationByFeatureValue_AArch64_neon.
// NOTE: the argument of .align is a power of two here, so ".align 4" gives
// 16-byte alignment (".align 16" would request 64 KiB).
.align 4
mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00

// FillQpelLocationByFeatureValue_AArch64_neon
//
// C prototype:
//   void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth,
//                                                     const int32_t kiHeight, uint16_t** pFeatureValuePointerList);
//
// For each 16-bit feature value f read from pFeatureOfBlock, a 32-bit word
// (x offset in the low half, y offset in the high half) is stored through the
// pointer held in pFeatureValuePointerList[f], and that pointer is advanced by
// 4 bytes. The x offset grows by 4 per element, the y offset by 4 per row.
//
// Register roles:
//   x0 = pFeatureOfBlock (read cursor)      x1 = kiWidth (sign-extended below)
//   x2 = kiHeight (row counter)             x3 = pFeatureValuePointerList
//   x4 = address of the list slot           x5 = pointer loaded from that slot
//   w6 = packed (y << 16) | x word          x7 = elements left in the row
//   v2 = x offsets of the current 4 elements, v3 = y offset of the current row
//   v5/v6/v7 = constant tables, v16 = list base in both 64-bit lanes
//   v17/v18 = the four slot addresses (v8-v15 are callee-saved and avoided)
//
// NOTE(review): both loops are do-while and the inner one steps by 4, so this
// appears to require kiHeight > 0 and kiWidth a positive multiple of 4 - confirm
// against the callers.
WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
    sxtw    x1, w1                      // int32_t arguments: upper halves of x1/x2
    sxtw    x2, w2                      // are not guaranteed by the caller
    ldr     q7, mv_x_inc_x4
    ldr     q6, mv_y_inc_x4
    ldr     q5, mx_x_offset_x4
    eor     v3.16b, v3.16b, v3.16b      // y offset starts at 0
    dup     v16.2d, x3                  // v8->v16

_hash_height_loop:
    mov     x7, x1
    mov     v2.16b, v5.16b              // restart x offsets at (0, 4, 8, 12)

_hash_width_loop:
    ld1     {v0.d}[0], [x0], #8         // four 16-bit feature values
    ushll   v0.4s, v0.4h, #3            // feature * 8 = byte offset of a 64-bit pointer slot
    uaddw   v17.2d, v16.2d, v0.2s       // slot addresses of elements 0 and 1
    uaddw2  v18.2d, v16.2d, v0.4s       // slot addresses of elements 2 and 3
    zip1    v1.8h, v2.8h, v3.8h         // v1 lanes become (x | y << 16) for the 4 elements

    // element 0
    umov    x4, v17.d[0]
    ldr     x5, [x4]
    umov    w6, v1.s[0]
    str     w6, [x5]
    add     x5, x5, #4
    str     x5, [x4]

    // element 1
    umov    x4, v17.d[1]
    ldr     x5, [x4]
    umov    w6, v1.s[1]
    str     w6, [x5]
    add     x5, x5, #4
    str     x5, [x4]

    // element 2
    umov    x4, v18.d[0]
    ldr     x5, [x4]
    umov    w6, v1.s[2]
    str     w6, [x5]
    add     x5, x5, #4
    str     x5, [x4]

    // element 3
    umov    x4, v18.d[1]
    ldr     x5, [x4]
    umov    w6, v1.s[3]
    str     w6, [x5]
    add     x5, x5, #4
    str     x5, [x4]

    add     v2.8h, v2.8h, v7.8h         // x offsets += 0x10 for the next 4 elements
    sub     x7, x7, #4
    cbnz    x7, _hash_width_loop

    add     v3.8h, v3.8h, v6.8h         // y offset += 4 for the next row
    sub     x2, x2, #1
    cbnz    x2, _hash_height_loop
WELS_ASM_AARCH64_FUNC_END
#endif

View File

@ -271,6 +271,10 @@ void SumOf16x16BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, c
#ifdef HAVE_NEON
extern "C"
{
void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
uint16_t** pFeatureValuePointerList);
int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
@ -285,6 +289,10 @@ void SumOf16x16BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, c
#ifdef HAVE_NEON_AARCH64
extern "C"
{
void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
uint16_t** pFeatureValuePointerList);
int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,

View File

@ -125,6 +125,8 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
#if defined (HAVE_NEON)
if (uiCpuFlag & WELS_CPU_NEON) {
//for feature search
pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon;
pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
//TODO: is it possible to add a dedicated path for widths that are a multiple of 8, to speed up that case?
@ -136,6 +138,8 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
#if defined (HAVE_NEON_AARCH64)
if (uiCpuFlag & WELS_CPU_NEON) {
//for feature search
pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_AArch64_neon;
pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_AArch64_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
//TODO: is it possible to add a dedicated path for widths that are a multiple of 8, to speed up that case?

View File

@ -281,6 +281,10 @@ GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 10, 10)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 16, 16)
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 640, 320)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 640, 320)
#endif
#ifdef HAVE_NEON_AARCH64
@ -290,4 +294,8 @@ GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 10, 10)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 16, 16)
GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 640, 320)
GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 640, 320)
#endif