diff --git a/codec/encoder/core/arm/svc_motion_estimation.S b/codec/encoder/core/arm/svc_motion_estimation.S
index 706bbfa0..65410b34 100644
--- a/codec/encoder/core/arm/svc_motion_estimation.S
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -235,4 +235,147 @@ _width_loop16x16_2:
 _SumOf16x16BlockOfFrame_neon_end:
     ldmia sp!, {r4-r12}
 WELS_ASM_FUNC_END
+
+// void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+//                                     uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+// In: r0 = pTimesOfFeatureValue, r1 = pBuf, r2 = kiListSize, r3 = pLocationOfFeature,
+//     [sp] = pFeatureValuePointerList.
+// For i in [0, kiListSize): stores the running pBuf address to pLocationOfFeature[i] and
+// pFeatureValuePointerList[i], then advances it by pTimesOfFeatureValue[i] * 4 bytes.
+// Uses only q0-q3 and q8, none of which overlap the callee-saved d8-d15.
+WELS_ASM_FUNC_BEGIN InitializeHashforFeature_neon
+    stmdb sp!, {r4-r7}
+    ldr r4, [sp, #16]               // pFeatureValuePointerList: 5th argument, above the 4 saved registers
+    bics r5, r2, #3                 // r5 = kiListSize rounded down to a multiple of 4
+    beq _hash_assign_tail           // fewer than 4 entries: the bottom-tested vector loop must not run
+_hash_assign_loop_x4:
+    vld1.64 {q0}, [r0]!
+    vshl.u32 q0, q0, #2             // counts -> byte sizes (4 bytes per stored location)
+    vceq.u32 q1, q0, #0
+    vand.i32 d2, d2, d3
+    vmov r6, r7, d2
+    and r6, r6, r7
+    cmp r6, #0xffffffff             // all four sizes zero?
+    beq _hash_assign_with_copy_x4
+
+    veor q1, q1
+    vext.32 q2, q1, q0, #3          // {0, a0, a1, a2}
+    vext.32 q3, q1, q0, #2          // {0, 0, a0, a1}
+    vext.32 q8, q1, q0, #1          // {0, 0, 0, a0}
+    vadd.u32 q0, q0, q2
+    vadd.u32 q0, q0, q3
+    vadd.u32 q0, q0, q8             // q0 = inclusive prefix sum of the four sizes
+    vext.32 q2, q1, q0, #3          // exclusive prefix sum
+    vdup.32 q3, r1
+    vadd.u32 q2, q2, q3             // four consecutive start addresses
+    vst1.64 {q2}, [r3]!
+    vst1.64 {q2}, [r4]!
+    vmov.32 r6, d1[1]               // total size of this group of four
+    add r1, r1, r6
+    b _assign_next
+
+_hash_assign_with_copy_x4:
+    vdup.32 q2, r1                  // nothing to advance: all four entries share the current address
+    vst1.64 {q2}, [r3]!
+    vst1.64 {q2}, [r4]!
+
+_assign_next:
+    subs r5, r5, #4
+    bne _hash_assign_loop_x4
+
+_hash_assign_tail:
+    and r5, r2, #3                  // remaining 0..3 entries, handled one at a time
+    cmp r5, #0
+    beq _hash_assign_end
+_hash_assign_loop_x4_rem:
+    str r1, [r3], #4
+    str r1, [r4], #4
+    ldr r7, [r0], #4
+    lsl r7, r7, #2
+    add r1, r1, r7
+    subs r5, r5, #1
+    bne _hash_assign_loop_x4_rem
+_hash_assign_end:
+
+    ldmia sp!, {r4-r7}
+WELS_ASM_FUNC_END
+
+// .align takes a power-of-two exponent on ARM: 4 gives the intended 16-byte alignment.
+.align 4
+mv_x_inc_x4:     .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
+mv_y_inc_x4:     .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
+mx_x_offset_x4:  .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00
+
+// void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+//                                           uint16_t** pFeatureValuePointerList);
+// In: r0 = pFeatureOfBlock, r1 = kiWidth, r2 = kiHeight, r3 = pFeatureValuePointerList.
+// For each feature value f read at block position (x, y): stores the 16-bit pair {4 * x, 4 * y}
+// through the pointer held in pFeatureValuePointerList[f], then advances that pointer by 4 bytes.
+// Both loops test at the bottom: kiWidth must be a non-zero multiple of 4, kiHeight non-zero.
+WELS_ASM_FUNC_BEGIN FillQpelLocationByFeatureValue_neon
+    stmdb sp!, {r4-r8}
+    vpush {q4-q7}                   // d8-d15 are callee-saved
+    adr r7, mv_x_inc_x4
+    vld1.64 {q7}, [r7]
+    adr r7, mv_y_inc_x4
+    vld1.64 {q6}, [r7]
+    adr r7, mx_x_offset_x4
+    vld1.64 {q5}, [r7]
+    veor q4, q4
+    veor q3, q3                     // q3 = y coordinate of the current row (starts at 0)
+    vdup.32 q8, r3                  // q8 = pFeatureValuePointerList in every lane
+_hash_height_loop:
+    mov r7, r1
+    vmov q2, q5 //mx_x_offset_x4
+_hash_width_loop:
+    vld1.64 {d0}, [r0]!             // four 16-bit feature values
+    vshll.u16 q0, d0, #2            // feature -> byte offset into the 4-byte pointer list
+    vadd.u32 q0, q8                 // q0 = addresses of the four list slots
+    vmov q1, q2
+    vmov q4, q3
+    vzip.16 q1, q4                  // q1 = {x0, y0, x1, y1, x2, y2, x3, y3}
+
+    vmov.32 r4, d0[0]               // r4 = address of the list slot for this feature
+    ldr r5, [r4]                    // r5 = current write pointer for this feature
+    vmov.32 r6, d2[0]               // packed {x, y} location
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]                    // write back the advanced pointer
+
+    vmov.32 r4, d0[1]
+    ldr r5, [r4]
+    vmov.32 r6, d2[1]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+
+    vmov.32 r4, d1[0]
+    ldr r5, [r4]
+    vmov.32 r6, d3[0]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+
+    vmov.32 r4, d1[1]
+    ldr r5, [r4]
+    vmov.32 r6, d3[1]
+    str r6, [r5]
+    add r5, r5, #4
+    pld [r5] // cache miss?
+    str r5, [r4]
+
+    vadd.u16 q2, q2, q7             // x += 16 for the next four blocks
+    subs r7, #4
+    bne _hash_width_loop
+
+    vadd.u16 q3, q3, q6             // y += 4 for the next row
+    subs r2, #1
+    bne _hash_height_loop
+
+    vpop {q4-q7}
+    ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
 #endif
diff --git a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
index 15ea861c..3c67cb32 100644
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -217,4 +217,136 @@ _width_loop16x16_2:
     cbnz x2, _height_loop16x16
 _SumOf16x16BlockOfFrame_AArch64_neon_end:
 WELS_ASM_AARCH64_FUNC_END
+
+// void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+//                                             uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+// In: x0 = pTimesOfFeatureValue, x1 = pBuf, w2 = kiListSize, x3 = pLocationOfFeature,
+//     x4 = pFeatureValuePointerList.
+// For i in [0, kiListSize): stores the running pBuf address to pLocationOfFeature[i] and
+// pFeatureValuePointerList[i], then advances it by pTimesOfFeatureValue[i] * 4 bytes.
+WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
+    sxtw x2, w2                     // kiListSize is int32_t: bits 63:32 of x2 are unspecified on entry
+    mov x9, #3
+    bic x5, x2, x9                  // x5 = kiListSize rounded down to a multiple of 4
+    mov x8, #0
+    cbz x5, _hash_assign_tail       // fewer than 4 entries: the bottom-tested vector loop must not run
+_hash_assign_loop_x4:
+    ld1 {v0.16b}, [x0], #16
+    shl v0.4s, v0.4s, #2            // counts -> byte sizes (4 bytes per stored location)
+    addv s1, v0.4s
+    umov w7, v1.s[0]                // sum of the four sizes; zero is taken to mean nothing to advance
+    cbz w7, _hash_assign_with_copy_x4
+
+    ins v2.d[0], x1
+    umov w8, v0.s[0]
+    add x1, x1, x8
+    ins v2.d[1], x1
+    umov w8, v0.s[1]
+    add x1, x1, x8
+    ins v3.d[0], x1
+    umov w8, v0.s[2]
+    add x1, x1, x8
+    ins v3.d[1], x1
+    umov w8, v0.s[3]
+    add x1, x1, x8
+    st1 {v2.16b, v3.16b}, [x3], #32
+    st1 {v2.16b, v3.16b}, [x4], #32
+    b _assign_next
+_hash_assign_with_copy_x4:
+    dup v2.2d, x1                   // all four entries share the current address
+    dup v3.2d, x1
+    st1 {v2.16b, v3.16b}, [x3], #32
+    st1 {v2.16b, v3.16b}, [x4], #32
+
+_assign_next:
+    subs x5, x5, #4
+    cbnz x5, _hash_assign_loop_x4
+
+_hash_assign_tail:
+    and x5, x2, x9                  // remaining 0..3 entries, handled one at a time
+    cbz x5, _hash_assign_end
+
+_hash_assign_loop_x4_rem:
+    str x1, [x3], #8
+    str x1, [x4], #8
+    ldr w8, [x0], #4
+    lsl w8, w8, #2
+    add x1, x1, x8
+    subs x5, x5, #1
+    cbnz x5, _hash_assign_loop_x4_rem
+
+_hash_assign_end:
+WELS_ASM_AARCH64_FUNC_END
+
+// .align takes a power-of-two exponent on AArch64: 4 gives the intended 16-byte alignment.
+.align 4
+mv_x_inc_x4:     .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
+mv_y_inc_x4:     .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
+mx_x_offset_x4:  .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00
+
+// void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth,
+//                                                   const int32_t kiHeight, uint16_t** pFeatureValuePointerList);
+// In: x0 = pFeatureOfBlock, w1 = kiWidth, w2 = kiHeight, x3 = pFeatureValuePointerList.
+// For each feature value f read at block position (x, y): stores the 16-bit pair {4 * x, 4 * y}
+// through the pointer held in pFeatureValuePointerList[f], then advances that pointer by 4 bytes.
+// Both loops test at the bottom: kiWidth must be a non-zero multiple of 4, kiHeight non-zero.
+WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
+    sxtw x1, w1                     // int32_t arguments: bits 63:32 are unspecified on entry
+    sxtw x2, w2
+    ldr q7, mv_x_inc_x4
+    ldr q6, mv_y_inc_x4
+    ldr q5, mx_x_offset_x4
+
+    eor v4.16b, v4.16b, v4.16b
+    eor v3.16b, v3.16b, v3.16b      // v3 = y coordinate of the current row (starts at 0)
+    dup v16.2d, x3                  // v16 = pFeatureValuePointerList in both lanes
+
+_hash_height_loop:
+    mov x7, x1
+    mov v2.16b, v5.16b //mx_x_offset_x4
+
+_hash_width_loop:
+    ld1 {v0.d}[0], [x0], #8         // four 16-bit feature values
+
+    ushll v0.4s, v0.4h, #3          // feature -> byte offset into the 8-byte pointer list
+    uaddw v17.2d, v16.2d, v0.2s     // addresses of list slots 0 and 1
+    uaddw2 v18.2d, v16.2d, v0.4s    // addresses of list slots 2 and 3
+    zip1 v1.8h, v2.8h, v3.8h        // v1 = {x0, y0, x1, y1, x2, y2, x3, y3}
+
+    umov x4, v17.d[0]               // x4 = address of the list slot for this feature
+    ldr x5, [x4]                    // x5 = current write pointer for this feature
+    umov w6, v1.s[0]                // packed {x, y} location
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]                    // write back the advanced pointer
+
+    umov x4, v17.d[1]
+    ldr x5, [x4]
+    umov w6, v1.s[1]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+
+    umov x4, v18.d[0]
+    ldr x5, [x4]
+    umov w6, v1.s[2]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+
+    umov x4, v18.d[1]
+    ldr x5, [x4]
+    umov w6, v1.s[3]
+    str w6, [x5]
+    add x5, x5, #4
+    str x5, [x4]
+
+    add v2.8h, v2.8h, v7.8h         // x += 16 for the next four blocks
+    subs x7, x7, #4
+    cbnz x7, _hash_width_loop
+
+    add v3.8h, v3.8h, v6.8h         // y += 4 for the next row
+    subs x2, x2, #1
+    cbnz x2, _hash_height_loop
+WELS_ASM_AARCH64_FUNC_END
 #endif
\ No newline at end of file
diff --git a/codec/encoder/core/inc/svc_motion_estimate.h b/codec/encoder/core/inc/svc_motion_estimate.h
index 96eb7b0d..1b98d1ab 100644
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -271,6 +271,10 @@ void SumOf16x16BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, c
 #ifdef HAVE_NEON
 extern "C"
 {
+void InitializeHashforFeature_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                    uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+void FillQpelLocationByFeatureValue_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+    uint16_t** pFeatureValuePointerList);
 int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
 int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
 void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
@@ -285,6 +289,10 @@ void SumOf16x16BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, c
 #ifdef HAVE_NEON_AARCH64
 extern "C"
 {
+void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+    uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
+void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+    uint16_t** pFeatureValuePointerList);
 int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
 int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
 void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
diff --git a/codec/encoder/core/src/svc_motion_estimate.cpp b/codec/encoder/core/src/svc_motion_estimate.cpp
index 33390659..828fbf4d 100644
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -125,6 +125,8 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
 #if defined (HAVE_NEON)
   if (uiCpuFlag & WELS_CPU_NEON) {
     //for feature search
+    pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon;
+    pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_neon;
     pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
     pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
     //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
@@ -136,6 +138,8 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
 #if defined (HAVE_NEON_AARCH64)
   if (uiCpuFlag & WELS_CPU_NEON) {
     //for feature search
+    pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_AArch64_neon;
+    pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_AArch64_neon;
     pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
     pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
     //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
diff --git a/test/encoder/EncUT_SVC_me.cpp b/test/encoder/EncUT_SVC_me.cpp
index bf4c7edb..304c44fc 100644
--- a/test/encoder/EncUT_SVC_me.cpp
+++ b/test/encoder/EncUT_SVC_me.cpp
@@ -281,6 +281,10 @@ GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 16, 16)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_neon, 640, 320)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_neon, 640, 320)
 #endif
 
 #ifdef HAVE_NEON_AARCH64
@@ -290,4 +294,8 @@ GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)
 GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)
 GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 10, 10)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 16, 16)
+GENERATE_InitializeHashforFeature (InitializeHashforFeature_ref, InitializeHashforFeature_AArch64_neon, 640, 320)
+GENERATE_FillQpelLocationByFeatureValue (FillQpelLocationByFeatureValue_ref, FillQpelLocationByFeatureValue_AArch64_neon, 640, 320)
 #endif