Add x86 32/64bit asm code for SumOfBlocks.
This commit is contained in:
parent
681b1da698
commit
f2314151e8
@ -244,6 +244,22 @@ void SumOf8x8BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const
|
||||
void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
|
||||
const int32_t kiRefStride,
|
||||
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
|
||||
|
||||
#ifdef X86_ASM
|
||||
extern "C"
|
||||
{
|
||||
int32_t SumOf8x8SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
|
||||
int32_t SumOf16x16SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
|
||||
void SumOf8x8BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
|
||||
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
|
||||
void SumOf16x16BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
|
||||
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
|
||||
void SumOf8x8BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
|
||||
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
|
||||
void SumOf16x16BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
|
||||
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_NEON
|
||||
extern "C"
|
||||
{
|
||||
|
@ -102,22 +102,42 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
|
||||
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
|
||||
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
|
||||
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
|
||||
#if defined (X86_ASM)
|
||||
if (uiCpuFlag & WELS_CPU_SSE2) {
|
||||
//for feature search
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
|
||||
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
|
||||
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_sse2;
|
||||
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_sse2;
|
||||
}
|
||||
if (uiCpuFlag & WELS_CPU_SSE41) {
|
||||
//for feature search
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse4;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse4;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined (HAVE_NEON)
|
||||
//for feature search
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
|
||||
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
|
||||
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
|
||||
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
//for feature search
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
|
||||
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
|
||||
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
|
||||
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined (HAVE_NEON_AARCH64)
|
||||
//for feature search
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
|
||||
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
|
||||
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
|
||||
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
//for feature search
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
|
||||
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
|
||||
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
|
||||
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
|
||||
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -92,6 +92,11 @@ TEST (SVC_ME_FunTest, method) {\
|
||||
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_c)
|
||||
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_c)
|
||||
|
||||
#ifdef X86_ASM
|
||||
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_sse2)
|
||||
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_sse2)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_neon)
|
||||
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_neon)
|
||||
@ -138,6 +143,31 @@ GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 32
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 640, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 640, 320)
|
||||
|
||||
#ifdef X86_ASM
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 6, 6)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 6, 6)
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 6, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 6, 320)
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 640, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 640, 320)
|
||||
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 8, 2)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 16, 2)
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 8, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 16, 320)
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 640, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 640, 320)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON
|
||||
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)
|
||||
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)
|
||||
|
Loading…
Reference in New Issue
Block a user