Add Motion Compehension ARM64 Neon Code
This commit is contained in:
parent
b47606a4ff
commit
ad9e2dab4f
@ -27,6 +27,7 @@
|
||||
F0B204F918FD23BF005DA23F /* copy_mb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F0B204F818FD23BF005DA23F /* copy_mb.cpp */; };
|
||||
F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8221906673900E156A8 /* arm_arch64_common_macro.S */; };
|
||||
F556A8251906673900E156A8 /* expand_picture_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */; };
|
||||
F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; };
|
||||
FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
@ -87,6 +88,7 @@
|
||||
F0B204F818FD23BF005DA23F /* copy_mb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = copy_mb.cpp; sourceTree = "<group>"; };
|
||||
F556A8221906673900E156A8 /* arm_arch64_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = arm_arch64_common_macro.S; path = arm64/arm_arch64_common_macro.S; sourceTree = "<group>"; };
|
||||
F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = expand_picture_aarch64_neon.S; path = arm64/expand_picture_aarch64_neon.S; sourceTree = "<group>"; };
|
||||
F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = "<group>"; };
|
||||
FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = "<group>"; };
|
||||
FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
@ -223,6 +225,7 @@
|
||||
F556A81D1906669F00E156A8 /* arm64 */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
F5B8D82C190757290037849A /* mc_aarch64_neon.S */,
|
||||
F556A8221906673900E156A8 /* arm_arch64_common_macro.S */,
|
||||
F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */,
|
||||
);
|
||||
@ -310,6 +313,7 @@
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */,
|
||||
4C3406C918D96EA600DFA14A /* arm_arch_common_macro.S in Sources */,
|
||||
F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */,
|
||||
4C3406CE18D96EA600DFA14A /* crt_util_safe_x.cpp in Sources */,
|
||||
|
2274
codec/common/arm64/mc_aarch64_neon.S
Executable file
2274
codec/common/arm64/mc_aarch64_neon.S
Executable file
File diff suppressed because it is too large
Load Diff
@ -40,60 +40,171 @@ extern "C" {
|
||||
#endif//__cplusplus
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McCopyWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McCopyWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McCopyWidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
|
||||
void McChromaWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t* pWeights, int32_t iHeight);
|
||||
|
||||
void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
|
||||
void McChromaWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t* pWeights, int32_t iHeight);
|
||||
|
||||
void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
void PixelAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
void PixelAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
void PixelAvgWidthEq4_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
|
||||
void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer01WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer01WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer03WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer03WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer03WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
|
||||
void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer10WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer10WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer10WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer30WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer30WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer30WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
|
||||
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
|
||||
void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
|
||||
void McHorVer20WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer20WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer20WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
|
||||
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
|
||||
void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
|
||||
void McHorVer02WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer02WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer02WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
|
||||
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
|
||||
void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
|
||||
void McHorVer22WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer22WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer22WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
|
||||
void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void PixStrideAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
|
||||
const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void PixStrideAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
|
||||
const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
|
||||
void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
|
||||
void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
|
||||
void McHorVer20Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);// width+1
|
||||
void McHorVer20Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);// width+1
|
||||
|
||||
void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
|
||||
void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
|
||||
void McHorVer02Height17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);// height+1
|
||||
void McHorVer02Height9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);// height+1
|
||||
|
||||
void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
|
||||
void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
|
||||
void McHorVer22Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);//width+1&&height+1
|
||||
void McHorVer22Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);//width+1&&height+1
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
void McCopyWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McCopyWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McCopyWidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McChromaWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t* pWeights, int32_t iHeight);
|
||||
void McChromaWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t* pWeights, int32_t iHeight);
|
||||
void PixelAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
void PixelAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
void PixelAvgWidthEq4_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer01WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer01WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer03WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer03WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer03WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer10WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer10WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer10WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer30WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer30WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer30WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
|
||||
void McHorVer20WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer20WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer20WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
|
||||
void McHorVer02WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer02WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer02WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
|
||||
void McHorVer22WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer22WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer22WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void PixStrideAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
|
||||
const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void PixStrideAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
|
||||
const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void McHorVer20Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);// width+1
|
||||
void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);// width+1
|
||||
void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);// height+1
|
||||
void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);// height+1
|
||||
void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);//width+1&&height+1
|
||||
void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);//width+1&&height+1
|
||||
#endif
|
||||
|
||||
#if defined(X86_ASM)
|
||||
@ -131,18 +242,21 @@ void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin
|
||||
void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight);
|
||||
void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight);
|
||||
int32_t iWidth, int32_t iHeight);
|
||||
|
||||
void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
|
||||
void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth,
|
||||
int32_t iHeight);
|
||||
|
||||
void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth,
|
||||
int32_t iHeight);
|
||||
|
||||
void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
|
||||
void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride,
|
||||
int32_t iWidth,
|
||||
int32_t iHeight);
|
||||
|
||||
//***************************************************************************//
|
||||
|
204
codec/common/inc/mc_common.h.orig
Normal file
204
codec/common/inc/mc_common.h.orig
Normal file
@ -0,0 +1,204 @@
|
||||
/*!
|
||||
* \copy
|
||||
* Copyright (c) 2013, Cisco Systems
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MC_COMMON_H
|
||||
#define MC_COMMON_H
|
||||
|
||||
#include "typedefs.h"
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif//__cplusplus
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
|
||||
|
||||
void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
|
||||
|
||||
void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
|
||||
|
||||
void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
|
||||
void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
|
||||
void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
|
||||
void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
|
||||
void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
|
||||
void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
|
||||
|
||||
void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
|
||||
void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
|
||||
|
||||
void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
|
||||
void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
void McCopyWidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McCopyWidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McCopyWidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McChromaWidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
|
||||
void McChromaWidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
|
||||
void PixelAvgWidthEq16_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
void PixelAvgWidthEq8_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
void PixelAvgWidthEq4_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer01WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer03WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer10WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer10WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer10WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer30WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
|
||||
void McHorVer20WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer20WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer20WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
|
||||
void McHorVer02WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer02WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer02WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
|
||||
void McHorVer22WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer22WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer22WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void PixStrideAvgWidthEq16_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void PixStrideAvgWidthEq8_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void McHorVer20Width17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
|
||||
void McHorVer20Width9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
|
||||
void McHorVer02Height17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
|
||||
void McHorVer02Height9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
|
||||
void McHorVer22Width17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
|
||||
void McHorVer22Width9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
|
||||
#endif
|
||||
|
||||
#if defined(X86_ASM)
|
||||
//***************************************************************************//
|
||||
// MMXEXT definition //
|
||||
//***************************************************************************//
|
||||
void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
const uint8_t* kpABCD, int32_t iHeight);
|
||||
void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
|
||||
//***************************************************************************//
|
||||
// SSE2 definition //
|
||||
//***************************************************************************//
|
||||
void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
const uint8_t* kpABCD, int32_t iHeight);
|
||||
void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight);
|
||||
void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight);
|
||||
|
||||
void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
|
||||
|
||||
void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
int32_t iHeight);
|
||||
|
||||
void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
int32_t iHeight);
|
||||
|
||||
void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
|
||||
int32_t iHeight);
|
||||
|
||||
//***************************************************************************//
|
||||
// SSSE3 definition //
|
||||
//***************************************************************************//
|
||||
|
||||
void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
const uint8_t* kpABCD, int32_t iHeight);
|
||||
|
||||
#endif //X86_ASM
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif//__cplusplus
|
||||
|
||||
#endif//MC_COMMON_H
|
File diff suppressed because it is too large
Load Diff
1305
codec/decoder/core/src/mc.cpp.orig
Normal file
1305
codec/decoder/core/src/mc.cpp.orig
Normal file
File diff suppressed because it is too large
Load Diff
@ -89,8 +89,10 @@ VerFilterFunc fpVerFilter = NULL;
|
||||
HorFilterFunc fpHorFilter = NULL;
|
||||
HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL;
|
||||
|
||||
typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride, const uint8_t* pSrcB,
|
||||
typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight);
|
||||
typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB,
|
||||
int32_t iSrcBStride, int32_t iHeight);
|
||||
WelsMcFunc0 McCopyWidthEq16 = NULL;
|
||||
WelsMcFunc0 McCopyWidthEq8 = NULL;
|
||||
@ -323,7 +325,8 @@ static inline void McHorVer33WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride,
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
|
||||
static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth,
|
||||
int32_t iHeight) {
|
||||
int32_t i, j;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
@ -335,7 +338,8 @@ static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_
|
||||
}
|
||||
}
|
||||
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
|
||||
static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth,
|
||||
int32_t iHeight) {
|
||||
int32_t i, j;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
@ -347,7 +351,8 @@ static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_
|
||||
}
|
||||
}
|
||||
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
|
||||
static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth,
|
||||
int32_t iHeight) {
|
||||
int16_t pTmp[17 + 5] = {0}; //w+1
|
||||
int32_t i, j, k;
|
||||
@ -481,94 +486,190 @@ void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int
|
||||
|
||||
#endif //X86_ASM
|
||||
|
||||
//***************************************************************************//
|
||||
// NEON implementation //
|
||||
//***************************************************************************//
|
||||
//***************************************************************************//
|
||||
// NEON implementation //
|
||||
//***************************************************************************//
|
||||
#if defined(HAVE_NEON)
|
||||
void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
void McHorVer20Width9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 17)
|
||||
McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight){
|
||||
void McHorVer02Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 16)
|
||||
McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 8)
|
||||
McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight){
|
||||
void McHorVer22Width9Or17Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 17)
|
||||
McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
void EncMcHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
void EncMcHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
void EncMcHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
void EncMcHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
void EncMcHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
void EncMcHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
void EncMcHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
void EncMcHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
|
||||
SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
|
||||
const int32_t kiD8x = sMv.iMvX&0x07;
|
||||
const int32_t kiD8y = sMv.iMvY&0x07;
|
||||
void EncMcChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
|
||||
const int32_t kiD8x = sMv.iMvX & 0x07;
|
||||
const int32_t kiD8y = sMv.iMvY & 0x07;
|
||||
if (0 == kiD8x && 0 == kiD8y) {
|
||||
if(8 == iWidth)
|
||||
McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
if (8 == iWidth)
|
||||
McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else // iWidth == 4
|
||||
McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
else {
|
||||
if(8 == iWidth)
|
||||
McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
} else {
|
||||
if (8 == iWidth)
|
||||
McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
else //if(4 == iWidth)
|
||||
McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
void McHorVer20Width9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 17)
|
||||
McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer02Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 16)
|
||||
McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 8)
|
||||
McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer22Width9Or17Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
|
||||
int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 17)
|
||||
McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void EncMcHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
|
||||
const int32_t kiD8x = sMv.iMvX & 0x07;
|
||||
const int32_t kiD8y = sMv.iMvY & 0x07;
|
||||
if (0 == kiD8x && 0 == kiD8y) {
|
||||
if (8 == iWidth)
|
||||
McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else // iWidth == 4
|
||||
McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
} else {
|
||||
if (8 == iWidth)
|
||||
McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
else //if(4 == iWidth)
|
||||
McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -599,7 +700,14 @@ void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
McHorVer03WidthEq16_neon, EncMcHorVer13_neon, EncMcHorVer23_neon, EncMcHorVer33_neon
|
||||
};
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_AArch64_neon[16] = { //[x][y]
|
||||
McCopyWidthEq16_AArch64_neon, McHorVer10WidthEq16_AArch64_neon, McHorVer20WidthEq16_AArch64_neon, McHorVer30WidthEq16_AArch64_neon,
|
||||
McHorVer01WidthEq16_AArch64_neon, EncMcHorVer11_AArch64_neon, EncMcHorVer21_AArch64_neon, EncMcHorVer31_AArch64_neon,
|
||||
McHorVer02WidthEq16_AArch64_neon, EncMcHorVer12_AArch64_neon, McHorVer22WidthEq16_AArch64_neon, EncMcHorVer32_AArch64_neon,
|
||||
McHorVer03WidthEq16_AArch64_neon, EncMcHorVer13_AArch64_neon, EncMcHorVer23_AArch64_neon, EncMcHorVer33_AArch64_neon
|
||||
};
|
||||
#endif
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
|
||||
@ -651,5 +759,16 @@ void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_AArch64_neon;
|
||||
pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_AArch64_neon;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_AArch64_neon;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_AArch64_neon;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//heigh+1:8/16
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/heigh+1
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
762
codec/encoder/core/src/mc.cpp.orig
Normal file
762
codec/encoder/core/src/mc.cpp.orig
Normal file
@ -0,0 +1,762 @@
|
||||
/*!
|
||||
* \copy
|
||||
* Copyright (c) 2009-2013, Cisco Systems
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*
|
||||
* \file mc.c
|
||||
*
|
||||
* \brief Interfaces implementation for motion compensation
|
||||
*
|
||||
* \date 03/17/2009 Created
|
||||
*
|
||||
*************************************************************************************
|
||||
*/
|
||||
|
||||
#include "mc.h"
|
||||
#include "cpu_core.h"
|
||||
|
||||
namespace WelsSVCEnc {
|
||||
/*------------------weight for chroma fraction pixel interpolation------------------*/
|
||||
//kuiA = (8 - dx) * (8 - dy);
|
||||
//kuiB = dx * (8 - dy);
|
||||
//kuiC = (8 - dx) * dy;
|
||||
//kuiD = dx * dy
|
||||
static const uint8_t g_kuiABCD[8][8][4] = { ////g_kuiA[dy][dx], g_kuiB[dy][dx], g_kuiC[dy][dx], g_kuiD[dy][dx]
|
||||
{
|
||||
{64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
|
||||
{32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
|
||||
},
|
||||
{
|
||||
{56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
|
||||
{28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
|
||||
},
|
||||
{
|
||||
{48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
|
||||
{24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
|
||||
},
|
||||
{
|
||||
{40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
|
||||
{20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
|
||||
},
|
||||
{
|
||||
{32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
|
||||
{16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
|
||||
},
|
||||
{
|
||||
{24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
|
||||
{12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
|
||||
},
|
||||
{
|
||||
{16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
|
||||
{8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
|
||||
},
|
||||
{
|
||||
{8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
|
||||
{4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
|
||||
}
|
||||
};
|
||||
typedef int32_t (*VerFilterFunc) (const uint8_t* pSrc, const int32_t kiSrcStride);
|
||||
typedef int32_t (*HorFilterFunc) (const uint8_t* pSrc);
|
||||
typedef int32_t (*HorFilterFuncInput16Bits) (int16_t* pSrc);
|
||||
|
||||
VerFilterFunc fpVerFilter = NULL;
|
||||
HorFilterFunc fpHorFilter = NULL;
|
||||
HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL;
|
||||
|
||||
typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride, const uint8_t* pSrcB,
|
||||
int32_t iSrcBStride, int32_t iHeight);
|
||||
WelsMcFunc0 McCopyWidthEq16 = NULL;
|
||||
WelsMcFunc0 McCopyWidthEq8 = NULL;
|
||||
WelsMcFunc0 McCopyWidthEq4 = NULL;
|
||||
WelsMcFunc0 pfMcHorVer02WidthEq16 = NULL;
|
||||
WelsMcFunc1 pfPixelAvgWidthEq16 = NULL;
|
||||
WelsMcFunc0 pfMcHorVer20WidthEq16 = NULL;
|
||||
WelsMcFunc0 pfMcHorVer22WidthEq16 = NULL;
|
||||
|
||||
//***************************************************************************//
|
||||
// C code implementation //
|
||||
//***************************************************************************//
|
||||
static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
int32_t i;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
memcpy (pDst, pSrc, 4); // confirmed_safe_unsafe_usage
|
||||
pDst += iDstStride;
|
||||
pSrc += iSrcStride;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight)
|
||||
|
||||
{
|
||||
int32_t i;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
memcpy (pDst, pSrc, 8); // confirmed_safe_unsafe_usage
|
||||
pDst += iDstStride;
|
||||
pSrc += iSrcStride;
|
||||
}
|
||||
}
|
||||
static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
int32_t i;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
memcpy (pDst, pSrc, 16); // confirmed_safe_unsafe_usage
|
||||
pDst += iDstStride;
|
||||
pSrc += iSrcStride;
|
||||
}
|
||||
}
|
||||
|
||||
//--------------------Luma sample MC------------------//
|
||||
static inline int32_t HorFilter_c (const uint8_t* pSrc) {
|
||||
int32_t iPix05 = pSrc[-2] + pSrc[3];
|
||||
int32_t iPix14 = pSrc[-1] + pSrc[2];
|
||||
int32_t iPix23 = pSrc[ 0] + pSrc[1];
|
||||
|
||||
return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2));
|
||||
}
|
||||
|
||||
static inline int32_t HorFilterInput16bit1_c (int16_t* pSrc) {
|
||||
int32_t iPix05 = pSrc[-2] + pSrc[3];
|
||||
int32_t iPix14 = pSrc[-1] + pSrc[2];
|
||||
int32_t iPix23 = pSrc[ 0] + pSrc[1];
|
||||
|
||||
return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2));
|
||||
}
|
||||
static inline int32_t VerFilter_c (const uint8_t* pSrc, const int32_t kiSrcStride) {
|
||||
const int32_t kiLine1 = kiSrcStride;
|
||||
const int32_t kiLine2 = (kiSrcStride << 1);
|
||||
const int32_t kiLine3 = kiLine1 + kiLine2;
|
||||
const uint32_t kuiPix05 = * (pSrc - kiLine2) + * (pSrc + kiLine3);
|
||||
const uint32_t kuiPix14 = * (pSrc - kiLine1) + * (pSrc + kiLine2);
|
||||
const uint32_t kuiPix23 = * (pSrc) + * (pSrc + kiLine1);
|
||||
|
||||
return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2));
|
||||
}
|
||||
|
||||
static inline void PixelAvgWidthEq8_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) {
|
||||
int32_t i, j;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < 8; j++) {
|
||||
pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
|
||||
}
|
||||
pDst += iDstStride;
|
||||
pSrcA += iSrcAStride;
|
||||
pSrcB += iSrcBStride;
|
||||
}
|
||||
}
|
||||
static inline void PixelAvgWidthEq16_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
|
||||
const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) {
|
||||
int32_t i, j;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < 16; j++) {
|
||||
pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
|
||||
}
|
||||
pDst += iDstStride;
|
||||
pSrcA += iSrcAStride;
|
||||
pSrcB += iSrcBStride;
|
||||
}
|
||||
}
|
||||
|
||||
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
|
||||
static inline void McHorVer20WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
int32_t i, j;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < 16; j++) {
|
||||
pDst[j] = WelsClip1 ((fpHorFilter (pSrc + j) + 16) >> 5);
|
||||
}
|
||||
pDst += iDstStride;
|
||||
pSrc += iSrcStride;
|
||||
}
|
||||
}
|
||||
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
|
||||
static inline void McHorVer02WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
int32_t i, j;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < 16; j++) {
|
||||
pDst[j] = WelsClip1 ((fpVerFilter (pSrc + j, iSrcStride) + 16) >> 5);
|
||||
}
|
||||
pDst += iDstStride;
|
||||
pSrc += iSrcStride;
|
||||
}
|
||||
}
|
||||
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
|
||||
static inline void McHorVer22WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
int16_t pTmp[16 + 5] = {0}; //16
|
||||
int32_t i, j, k;
|
||||
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < 16 + 5; j++) {
|
||||
pTmp[j] = fpVerFilter (pSrc - 2 + j, iSrcStride);
|
||||
}
|
||||
for (k = 0; k < 16; k++) {
|
||||
pDst[k] = WelsClip1 ((fpHorFilterInput16Bits (&pTmp[2 + k]) + 512) >> 10);
|
||||
}
|
||||
pSrc += iSrcStride;
|
||||
pDst += iDstStride;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////luma MC//////////////////////////
|
||||
|
||||
static inline void McHorVer01WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
|
||||
|
||||
pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer03WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
|
||||
|
||||
pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer10WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
|
||||
|
||||
pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer11WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
|
||||
pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer12WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
|
||||
pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer13WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
|
||||
pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer21WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
|
||||
pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer23WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
|
||||
pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer30WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
|
||||
|
||||
pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer31WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
|
||||
pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer32WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
|
||||
pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
|
||||
pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
static inline void McHorVer33WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
|
||||
|
||||
pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
|
||||
static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
int32_t iHeight) {
|
||||
int32_t i, j;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < iWidth; j++) {
|
||||
pDst[j] = WelsClip1 ((fpHorFilter (pSrc + j) + 16) >> 5);
|
||||
}
|
||||
pDst += iDstStride;
|
||||
pSrc += iSrcStride;
|
||||
}
|
||||
}
|
||||
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
|
||||
static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
int32_t iHeight) {
|
||||
int32_t i, j;
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < iWidth; j++) {
|
||||
pDst[j] = WelsClip1 ((fpVerFilter (pSrc + j, iSrcStride) + 16) >> 5);
|
||||
}
|
||||
pDst += iDstStride;
|
||||
pSrc += iSrcStride;
|
||||
}
|
||||
}
|
||||
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
|
||||
static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
int32_t iHeight) {
|
||||
int16_t pTmp[17 + 5] = {0}; //w+1
|
||||
int32_t i, j, k;
|
||||
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < iWidth + 5; j++) {
|
||||
pTmp[j] = fpVerFilter (pSrc - 2 + j, iSrcStride);
|
||||
}
|
||||
for (k = 0; k < iWidth; k++) {
|
||||
pDst[k] = WelsClip1 ((fpHorFilterInput16Bits (&pTmp[2 + k]) + 512) >> 10);
|
||||
}
|
||||
pSrc += iSrcStride;
|
||||
pDst += iDstStride;
|
||||
}
|
||||
}
|
||||
static inline void McCopy (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
|
||||
int32_t iHeight) {
|
||||
int32_t i;
|
||||
if (iWidth == 16 && McCopyWidthEq16 != NULL)
|
||||
McCopyWidthEq16 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else if (iWidth == 8 && McCopyWidthEq8 != NULL)
|
||||
McCopyWidthEq8 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else if (iWidth == 4 && McCopyWidthEq4 != NULL)
|
||||
McCopyWidthEq4 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else {
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
memcpy (pDst, pSrc, iWidth); // confirmed_safe_unsafe_usage
|
||||
pDst += iDstStride;
|
||||
pSrc += iSrcStride;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
SMVUnitXY mv, int32_t iWidth, int32_t iHeight)
|
||||
//pSrc has been added the offset of mv
|
||||
{
|
||||
const int32_t kiDx = mv.iMvX & 0x07;
|
||||
const int32_t kiDy = mv.iMvY & 0x07;
|
||||
|
||||
if (0 == kiDx && 0 == kiDy) {
|
||||
McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
|
||||
} else {
|
||||
const int32_t kiDA = g_kuiABCD[kiDy][kiDx][0];
|
||||
const int32_t kiDB = g_kuiABCD[kiDy][kiDx][1];
|
||||
const int32_t kiDC = g_kuiABCD[kiDy][kiDx][2];
|
||||
const int32_t kiDD = g_kuiABCD[kiDy][kiDx][3];
|
||||
|
||||
int32_t i, j;
|
||||
|
||||
const uint8_t* pSrcNext = pSrc + iSrcStride;
|
||||
|
||||
for (i = 0; i < iHeight; i++) {
|
||||
for (j = 0; j < iWidth; j++) {
|
||||
pDst[j] = (kiDA * pSrc[j] + kiDB * pSrc[j + 1] + kiDC * pSrcNext[j] + kiDD * pSrcNext[j + 1] + 32) >> 6;
|
||||
}
|
||||
pDst += iDstStride;
|
||||
pSrc = pSrcNext;
|
||||
pSrcNext += iSrcStride;
|
||||
}
|
||||
}
|
||||
}
|
||||
//***************************************************************************//
|
||||
// MMXEXT and SSE2 implementation //
|
||||
//***************************************************************************//
|
||||
#if defined(X86_ASM)
|
||||
|
||||
static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 21, 8, 16)
|
||||
McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 16, iHeight + 5);
|
||||
McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight);
|
||||
}
|
||||
|
||||
//2010.2.5
|
||||
|
||||
static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* PDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, PDst, iDstStride, iHeight);
|
||||
McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &PDst[8], iDstStride, iHeight);
|
||||
}
|
||||
static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iHeight) {
|
||||
McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer22Width9Or17Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth,
|
||||
int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
|
||||
int32_t tmp1 = 2 * (iWidth - 8);
|
||||
McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
|
||||
McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight);
|
||||
McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8, iDstStride, 8, iHeight);
|
||||
}
|
||||
|
||||
typedef void (*McChromaWidthEqx) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
const uint8_t* pABCD, int32_t iHeigh);
|
||||
void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
|
||||
const int32_t kiD8x = sMv.iMvX & 0x07;
|
||||
const int32_t kiD8y = sMv.iMvY & 0x07;
|
||||
static const McChromaWidthEqx kpfFuncs[2] = {
|
||||
McChromaWidthEq4_mmx,
|
||||
McChromaWidthEq8_sse2
|
||||
};
|
||||
|
||||
if (0 == kiD8x && 0 == kiD8y) {
|
||||
McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
|
||||
} else {
|
||||
kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
|
||||
}
|
||||
}
|
||||
|
||||
void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
|
||||
const int32_t kiD8x = sMv.iMvX & 0x07;
|
||||
const int32_t kiD8y = sMv.iMvY & 0x07;
|
||||
|
||||
static const McChromaWidthEqx kpfFuncs[2] = {
|
||||
McChromaWidthEq4_mmx,
|
||||
McChromaWidthEq8_ssse3
|
||||
};
|
||||
if (0 == kiD8x && 0 == kiD8y) {
|
||||
McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
|
||||
} else {
|
||||
kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif //X86_ASM
|
||||
|
||||
//***************************************************************************//
|
||||
// NEON implementation //
|
||||
//***************************************************************************//
|
||||
#if defined(HAVE_NEON)
|
||||
void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 17)
|
||||
McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight){
|
||||
if (iWidth == 16)
|
||||
McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 8)
|
||||
McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight){
|
||||
if (iWidth == 17)
|
||||
McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
|
||||
SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
|
||||
const int32_t kiD8x = sMv.iMvX&0x07;
|
||||
const int32_t kiD8y = sMv.iMvY&0x07;
|
||||
if (0 == kiD8x && 0 == kiD8y) {
|
||||
if(8 == iWidth)
|
||||
McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else // iWidth == 4
|
||||
McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
else {
|
||||
if(8 == iWidth)
|
||||
McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
else //if(4 == iWidth)
|
||||
McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
void McHorVer20Width9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 17)
|
||||
McHorVer20Width17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer20Width9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer02Height9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight){
|
||||
if (iWidth == 16)
|
||||
McHorVer02Height17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 8)
|
||||
McHorVer02Height9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer22Width9Or17Height9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight){
|
||||
if (iWidth == 17)
|
||||
McHorVer22Width17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer22Width9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void EncMcHorVer11_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight);
|
||||
}
|
||||
void EncMcHorVer12_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer13_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer21_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer23_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer31_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer32_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcHorVer33_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
|
||||
}
|
||||
void EncMcChroma_AArch64_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
|
||||
SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
|
||||
const int32_t kiD8x = sMv.iMvX&0x07;
|
||||
const int32_t kiD8y = sMv.iMvY&0x07;
|
||||
if (0 == kiD8x && 0 == kiD8y) {
|
||||
if(8 == iWidth)
|
||||
McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else // iWidth == 4
|
||||
McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
else {
|
||||
if(8 == iWidth)
|
||||
McChromaWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
else //if(4 == iWidth)
|
||||
McChromaWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef void (*PixelAvgFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t, int32_t);
|
||||
void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
static PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c};
|
||||
|
||||
static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] = { //[y*4+x]
|
||||
McCopyWidthEq16_c, McHorVer10WidthEq16, McHorVer20WidthEq16_c, McHorVer30WidthEq16,
|
||||
McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16,
|
||||
McHorVer02WidthEq16_c, McHorVer12WidthEq16, McHorVer22WidthEq16_c, McHorVer32WidthEq16,
|
||||
McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
|
||||
};
|
||||
#if defined (X86_ASM)
|
||||
static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = {
|
||||
McCopyWidthEq16_sse2, McHorVer10WidthEq16, McHorVer20WidthEq16_sse2, McHorVer30WidthEq16,
|
||||
McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16,
|
||||
McHorVer02WidthEq16_sse2, McHorVer12WidthEq16, McHorVer22WidthEq16_sse2, McHorVer32WidthEq16,
|
||||
McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
|
||||
};
|
||||
#endif
|
||||
#if defined(HAVE_NEON)
|
||||
static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_neon[16] = { //[x][y]
|
||||
McCopyWidthEq16_neon, McHorVer10WidthEq16_neon, McHorVer20WidthEq16_neon, McHorVer30WidthEq16_neon,
|
||||
McHorVer01WidthEq16_neon, EncMcHorVer11_neon, EncMcHorVer21_neon, EncMcHorVer31_neon,
|
||||
McHorVer02WidthEq16_neon, EncMcHorVer12_neon, McHorVer22WidthEq16_neon, EncMcHorVer32_neon,
|
||||
McHorVer03WidthEq16_neon, EncMcHorVer13_neon, EncMcHorVer23_neon, EncMcHorVer33_neon
|
||||
};
|
||||
#endif
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_AArch64_neon[16] = { //[x][y]
|
||||
McCopyWidthEq16_AArch64_neon, McHorVer10WidthEq16_AArch64_neon, McHorVer20WidthEq16_AArch64_neon, McHorVer30WidthEq16_AArch64_neon,
|
||||
McHorVer01WidthEq16_AArch64_neon, EncMcHorVer11_AArch64_neon, EncMcHorVer21_AArch64_neon, EncMcHorVer31_AArch64_neon,
|
||||
McHorVer02WidthEq16_AArch64_neon, EncMcHorVer12_AArch64_neon, McHorVer22WidthEq16_AArch64_neon, EncMcHorVer32_AArch64_neon,
|
||||
McHorVer03WidthEq16_AArch64_neon, EncMcHorVer13_AArch64_neon, EncMcHorVer23_AArch64_neon, EncMcHorVer33_AArch64_neon
|
||||
};
|
||||
#endif
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging = pfPixAvgFunc;
|
||||
pFuncList->sMcFuncs.pfChromaMc = McChroma_c;
|
||||
fpVerFilter = VerFilter_c;
|
||||
fpHorFilter = HorFilter_c;
|
||||
fpHorFilterInput16Bits = HorFilterInput16bit1_c;
|
||||
McCopyWidthEq4 = McCopyWidthEq4_c;
|
||||
McCopyWidthEq8 = McCopyWidthEq8_c;
|
||||
McCopyWidthEq16 = McCopyWidthEq16_c;
|
||||
pfPixelAvgWidthEq16 = PixelAvgWidthEq16_c;
|
||||
pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_c;
|
||||
pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_c;
|
||||
pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_c;
|
||||
pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16;
|
||||
#if defined (X86_ASM)
|
||||
if (uiCpuFlag & WELS_CPU_SSE2) {
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_sse2;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_sse2;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_sse2;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
|
||||
pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
|
||||
McCopyWidthEq4 = McCopyWidthEq4_mmx;
|
||||
McCopyWidthEq8 = McCopyWidthEq8_mmx;
|
||||
McCopyWidthEq16 = McCopyWidthEq16_sse2;
|
||||
pfPixelAvgWidthEq16 = PixelAvgWidthEq16_sse2;
|
||||
pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_sse2;
|
||||
pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_sse2;
|
||||
pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_sse2;
|
||||
pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_sse2;
|
||||
}
|
||||
|
||||
if (uiCpuFlag & WELS_CPU_SSSE3) {
|
||||
pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3;
|
||||
}
|
||||
|
||||
#endif //(X86_ASM)
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_neon;
|
||||
pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_neon;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_neon;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_neon;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//heigh+1:8/16
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_AArch64_neon;
|
||||
pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_AArch64_neon;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_AArch64_neon;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_AArch64_neon;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//heigh+1:8/16
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/heigh+1
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user