Add x86 32/64bit asm code for SumOfBlocks.

This commit is contained in:
zhiliang wang 2014-08-13 11:18:39 +08:00
parent 681b1da698
commit f2314151e8
4 changed files with 1384 additions and 13 deletions

View File

@ -244,6 +244,22 @@ void SumOf8x8BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const
void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
#ifdef X86_ASM
extern "C"
{
int32_t SumOf8x8SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_sse2 (uint8_t* pRef, const int32_t kiRefStride);
void SumOf8x8BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
void SumOf16x16BlockOfFrame_sse2 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
void SumOf8x8BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
void SumOf16x16BlockOfFrame_sse4 (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
}
#endif
#ifdef HAVE_NEON
extern "C"
{

View File

@ -102,22 +102,42 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
#if defined (X86_ASM)
if (uiCpuFlag & WELS_CPU_SSE2) {
//for feature search
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_sse2;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_sse2;
}
if (uiCpuFlag & WELS_CPU_SSE41) {
//for feature search
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse4;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse4;
}
#endif
#if defined (HAVE_NEON)
//for feature search
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
if (uiCpuFlag & WELS_CPU_NEON) {
//for feature search
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
}
#endif
#if defined (HAVE_NEON_AARCH64)
//for feature search
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
if (uiCpuFlag & WELS_CPU_NEON) {
//for feature search
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
}
#endif
}
}

File diff suppressed because it is too large Load Diff

View File

@ -92,6 +92,11 @@ TEST (SVC_ME_FunTest, method) {\
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_c)
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_c)
#ifdef X86_ASM
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_sse2)
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_sse2)
#endif
#ifdef HAVE_NEON
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_neon)
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_neon)
@ -138,6 +143,31 @@ GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 32
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 640, 320)
#ifdef X86_ASM
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 6, 6)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 6, 6)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 6, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 6, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse2, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse2, 640, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 8, 2)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 16, 2)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 8, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 16, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_sse4, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_sse4, 640, 320)
#endif
#ifdef HAVE_NEON
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
#endif
#ifdef HAVE_NEON
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)