Add ARM 32-bit asm code for the decoder.

Licai Guo 2014-02-28 13:36:34 +08:00
parent fc056c7ef0
commit 0fd9db2878
15 changed files with 4903 additions and 11 deletions


@@ -36,6 +36,11 @@
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467A18BC5EAA0017DF25 /* utils.cpp */; };
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */; };
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */; };
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */; };
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
@@ -83,7 +88,7 @@
4CE4465018BC5EAA0017DF25 /* error_code.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = error_code.h; sourceTree = "<group>"; };
4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; };
4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; };
4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; };
4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; };
4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = manage_dec_ref.h; sourceTree = "<group>"; };
4CE4465518BC5EAA0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; };
4CE4465618BC5EAA0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
@@ -102,19 +107,19 @@
4CE4466318BC5EAA0017DF25 /* vlc_decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vlc_decoder.h; sourceTree = "<group>"; };
4CE4466418BC5EAA0017DF25 /* wels_common_basis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_common_basis.h; sourceTree = "<group>"; };
4CE4466518BC5EAA0017DF25 /* wels_const.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_const.h; sourceTree = "<group>"; };
4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; };
4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; };
4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 1; };
4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 1; };
4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; };
4CE4466A18BC5EAA0017DF25 /* decode_mb_aux.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_mb_aux.cpp; sourceTree = "<group>"; };
4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; };
4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; };
4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 1; };
4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; };
4CE4466D18BC5EAA0017DF25 /* decoder_core.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_core.cpp; sourceTree = "<group>"; };
4CE4466E18BC5EAA0017DF25 /* decoder_data_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_data_tables.cpp; sourceTree = "<group>"; };
4CE4466F18BC5EAA0017DF25 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_pic.cpp; sourceTree = "<group>"; };
4CE4467018BC5EAA0017DF25 /* fmo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fmo.cpp; sourceTree = "<group>"; };
4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; };
4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = manage_dec_ref.cpp; sourceTree = "<group>"; };
4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; };
4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 1; wrapsLines = 1; };
4CE4467418BC5EAA0017DF25 /* mem_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mem_align.cpp; sourceTree = "<group>"; };
4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memmgr_nal_unit.cpp; sourceTree = "<group>"; };
4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; };
@@ -127,6 +132,11 @@
4CE4468318BC5EAB0017DF25 /* wels_dec_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_dec_export.def; sourceTree = "<group>"; };
4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsCodecTrace.cpp; sourceTree = "<group>"; };
4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
4CE447A818BC6BE90017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -212,6 +222,7 @@
4CE4463F18BC5EAA0017DF25 /* core */ = {
isa = PBXGroup;
children = (
4CE447A518BC6BE90017DF25 /* arm */,
4CE4464418BC5EAA0017DF25 /* inc */,
4CE4466618BC5EAA0017DF25 /* src */,
);
@@ -313,6 +324,18 @@
path = src;
sourceTree = "<group>";
};
4CE447A518BC6BE90017DF25 /* arm */ = {
isa = PBXGroup;
children = (
4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */,
4CE447A718BC6BE90017DF25 /* block_add_neon.S */,
4CE447A818BC6BE90017DF25 /* deblocking_neon.S */,
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */,
4CE447AA18BC6BE90017DF25 /* mc_neon.S */,
);
path = arm;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
@@ -394,20 +417,25 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */,
4CE4469B18BC5EAB0017DF25 /* pic_queue.cpp in Sources */,
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */,
4CE4469318BC5EAB0017DF25 /* fmo.cpp in Sources */,
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */,
4CE4469118BC5EAB0017DF25 /* decoder_data_tables.cpp in Sources */,
4CE4469718BC5EAB0017DF25 /* mem_align.cpp in Sources */,
4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */,
4CE4469518BC5EAB0017DF25 /* manage_dec_ref.cpp in Sources */,
4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */,
4CE4469218BC5EAB0017DF25 /* expand_pic.cpp in Sources */,
4CE4469918BC5EAB0017DF25 /* mv_pred.cpp in Sources */,
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */,
4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */,
4CE4469018BC5EAB0017DF25 /* decoder_core.cpp in Sources */,
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */,
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */,
4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */,
4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */,
4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */,
4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */,
4CE4468D18BC5EAB0017DF25 /* decode_mb_aux.cpp in Sources */,


@@ -33,7 +33,23 @@ void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
#endif
#if defined(HAVE_NEON)
void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus
#endif //WELS_DEBLOCKING_COMMON_H__


@@ -39,6 +39,79 @@
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
/*
void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McLuma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
void McChroma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
*/
void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
#endif
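For reference alongside the half-sample comments above, here is a minimal scalar sketch (the _ref helper name is hypothetical, not part of this commit) of the (2, 0) horizontal case: the standard 6-tap (1, -5, 20, 20, -5, 1) filter with +16 rounding and a 5-bit shift. The (0, 2) vertical case applies the same taps down a column, and (2, 2) applies both passes with higher intermediate precision.

#include <stdint.h>

static uint8_t Clip255 (int32_t iValue) {       // clamp to [0, 255]
  return (uint8_t) (iValue < 0 ? 0 : (iValue > 255 ? 255 : iValue));
}

// Hypothetical scalar reference for the (2, 0) half-sample interpolation.
static void McHorVer20_ref (const uint8_t* pSrc, int32_t iSrcStride,
                            uint8_t* pDst, int32_t iDstStride,
                            int32_t iWidth, int32_t iHeight) {
  for (int32_t y = 0; y < iHeight; y++) {
    for (int32_t x = 0; x < iWidth; x++) {
      int32_t iSum = pSrc[x - 2] - 5 * pSrc[x - 1] + 20 * pSrc[x]
                   + 20 * pSrc[x + 1] - 5 * pSrc[x + 2] + pSrc[x + 3];
      pDst[x] = Clip255 ((iSum + 16) >> 5);     // round and shift back to 8 bits
    }
    pSrc += iSrcStride;
    pDst += iDstStride;
  }
}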
#if defined(X86_ASM)
//***************************************************************************//
// MMXEXT definition //


@@ -0,0 +1,55 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef APPLE_IOS
.macro WELS_ASM_FUNC_BEGIN
.align 2
.arm
.globl _$0
_$0:
.endm
#else
.macro WELS_ASM_FUNC_BEGIN funcName
.align 2
.arm
.global \funcName
\funcName:
.endm
#endif
.macro WELS_ASM_FUNC_END
mov pc, lr
.endm
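For orientation only (the routine name below is hypothetical): a function written between this macro pair becomes an ordinary global ARM symbol that returns via mov pc, lr, and on APPLE_IOS the underscore emitted by _$0 is just the Mach-O spelling of the same C-visible name, so callers declare and invoke it like any other C function.

#include <stdint.h>

// Hypothetical prototype for a routine defined in assembly as
//   WELS_ASM_FUNC_BEGIN SomeRoutine_neon
//     ...NEON body, arguments arriving in r0..r3 per the AAPCS...
//   WELS_ASM_FUNC_END        // expands to "mov pc, lr"
void SomeRoutine_neon (uint8_t* pDst, int32_t iStride);

void CallSite (uint8_t* pDst, int32_t iStride) {
  SomeRoutine_neon (pDst, iStride);   // plain C call into the NEON routine
}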


@@ -0,0 +1,620 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
.macro ORR_32BYTES_TO_8BYTES
// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
vorr.s16 $0, $1
vorr.s16 $2, $3
vorr.s16 $8, $4, $5
vorr.s16 $9, $6, $7
// }
.endm
.macro ADD_PRED_1BYTE_TO_RESID_2BYTES
// { // input: q0~q3, d0~d3, output: d0~d3;
vaddw.u8 $0, $4
vaddw.u8 $1, $5
vaddw.u8 $2, $6
vaddw.u8 $3, $7
vqmovun.s16 $4, $0 //saturation
vqmovun.s16 $6, $2
vqmovun.s16 $5, $1
vqmovun.s16 $7, $3
// }
.endm
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro ADD_AND_CLIP_RS
// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;
vrshrn.s32 $5, $0, #6
vrshrn.s32 $6, $1, #6
vqadd.s16 $7, $4
vmin.s16 $7, $7, $2
vmax.s16 $7, $7, $3
// }
.endm
#else
.macro ORR_32BYTES_TO_8BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
vorr.s16 \arg0, \arg1
vorr.s16 \arg2, \arg3
vorr.s16 \arg8, \arg4, \arg5
vorr.s16 \arg9, \arg6, \arg7
// }
.endm
.macro ADD_PRED_1BYTE_TO_RESID_2BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: q0~q3, d0~d3, output: d0~d3;
vaddw.u8 \arg0, \arg4
vaddw.u8 \arg1, \arg5
vaddw.u8 \arg2, \arg6
vaddw.u8 \arg3, \arg7
vqmovun.s16 \arg4, \arg0 //saturation
vqmovun.s16 \arg6, \arg2
vqmovun.s16 \arg5, \arg1
vqmovun.s16 \arg7, \arg3
// }
.endm
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 \arg8, \arg1, #1
vshr.s16 \arg9, \arg3, #1
vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 \arg6, \arg1, #1
vshr.s32 \arg7, \arg3, #1
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro ADD_AND_CLIP_RS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;
vrshrn.s32 \arg5, \arg0, #6
vrshrn.s32 \arg6, \arg1, #6
vqadd.s16 \arg7, \arg4
vmin.s16 \arg7, \arg7, \arg2
vmax.s16 \arg7, \arg7, \arg3
// }
.endm
#endif
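The transform macros above carry the scalar formulas in their comments; as a rough C equivalent (helper name hypothetical, stride handling omitted for brevity), one row pass, one column pass, and the final rounded 6-bit shift performed by vrshrn #6 look like this:

#include <stdint.h>

// Hypothetical scalar sketch of ROW_TRANSFORM_1_STEP + TRANSFORM_4BYTES,
// COL_TRANSFORM_1_STEP + TRANSFORM_4BYTES, and the (x + 32) >> 6 rounding.
static void SimpleIdct4x4_ref (int16_t* pDst, const int16_t* pSrc) {
  int32_t f[4][4];
  for (int32_t i = 0; i < 4; i++) {             // row transform
    const int16_t* s = pSrc + 4 * i;
    int32_t e0 = s[0] + s[2];
    int32_t e1 = s[0] - s[2];
    int32_t e2 = (s[1] >> 1) - s[3];
    int32_t e3 = s[1] + (s[3] >> 1);
    f[i][0] = e0 + e3;
    f[i][1] = e1 + e2;
    f[i][2] = e1 - e2;
    f[i][3] = e0 - e3;
  }
  for (int32_t j = 0; j < 4; j++) {             // column transform
    int32_t e0 = f[0][j] + f[2][j];
    int32_t e1 = f[0][j] - f[2][j];
    int32_t e2 = (f[1][j] >> 1) - f[3][j];
    int32_t e3 = f[1][j] + (f[3][j] >> 1);
    pDst[0 * 4 + j] = (int16_t) ((e0 + e3 + 32) >> 6);   // rounded, as vrshrn #6
    pDst[1 * 4 + j] = (int16_t) ((e1 + e2 + 32) >> 6);
    pDst[2 * 4 + j] = (int16_t) ((e1 - e2 + 32) >> 6);
    pDst[3 * 4 + j] = (int16_t) ((e0 - e3 + 32) >> 6);
  }
}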
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
vld1.64 {d0-d2}, [r1]
vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0
vmvn q0, q0
vmvn d2, d2
vabs.s8 q0, q0
vabs.s8 d2, d2
vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END
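Read in scalar terms (sketch only, helper name hypothetical), the vceq/vmvn/vabs sequence above collapses each of the 24 bytes it loads into a 0/1 flag:

#include <stdint.h>

// Hypothetical scalar equivalent of SetNonZeroCount_neon.
static void SetNonZeroCount_ref (int8_t* pNonZeroCount) {
  for (int32_t i = 0; i < 24; i++)                          // d0-d2 cover 24 bytes
    pNonZeroCount[i] = (pNonZeroCount[i] != 0) ? 1 : 0;
}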
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN svc_non_zero_count_neon
push {r2-r4}
mov r4, #3
mov r3, #64
add r2, r0, #32
pld [r0, #512]
non_zero_count_two_8x8_loop:
vld1.64 {q0, q1}, [r0,:128], r3
vld1.64 {q2, q3}, [r2,:128], r3
vld1.64 {q4, q5}, [r0,:128], r3
vld1.64 {q6, q7}, [r2,:128], r3
vld1.64 {q8, q9}, [r0,:128], r3
vld1.64 {q10, q11}, [r2,:128], r3//load #0 8x8 block resi data,
vld1.64 {q12, q13}, [r0,:128], r3
vld1.64 {q14, q15}, [r2,:128], r3//load #1 8x8 block resi data,
pld [r0, #512]
ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, d0, d1, d4, d5, d2, d3 // output q1
// vceq.i16 q1, q1, #0
ORR_32BYTES_TO_8BYTES q8, q9,q10,q11,d16,d17,d20,d21,d4,d5 // output q2
// vceq.i16 q2, q2, #0
ORR_32BYTES_TO_8BYTES q4, q5, q6, q7, d8, d9, d12, d13, d10, d11 // output q5
// vceq.i16 q5, q5, #0
ORR_32BYTES_TO_8BYTES q12,q13,q14,q15,d24,d25, d28, d29, d12, d13 // output q6
// vceq.i16 q6, q6, #0
vqmovn.u64 d0, q1 // 8bytes-->4bytes
vqmovn.u64 d8, q5
vqmovn.u64 d1, q2
vqmovn.u64 d9, q6
vqmovn.u32 d2, q0 // 4bytes-->2bytes
vqmovn.u32 d3, q4
vceq.i16 q0, q1, #0
vmvn q0, q0
vabs.s16 q2, q0
vmovn.u16 d6, q2 // 2bytes-->1bytes
vst1.u8 {d6}, [r1]!
// pld [r0]
subs r4, r4, #1
bne non_zero_count_two_8x8_loop
pop {r2-r4}
WELS_ASM_FUNC_END
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN svc_rs_non_zero_count_neon
vld1.i16 {q0, q1}, [r0]! // block is unaligned!!!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]!
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]!
ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
vorr.s16 q0, q4
vorr.s16 q1, q5 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
vorr.s16 q6, q8, q12
vorr.s16 q7, q9, q13 // output d12~d15
vqmovn.u64 d4, q0 // 8bytes-->4bytes
vqmovn.u64 d6, q6
vqmovn.u64 d5, q1
vqmovn.u64 d7, q7
vqmovn.u32 d8, q2 // 4bytes-->2bytes
vqmovn.u32 d9, q3
vceq.i16 q5, q4, #0
vmvn q5, q5
vabs.s16 q5, q5
vmovn.u16 d10, q5 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
vld1.i16 {q0, q1}, [r0]!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]!
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]!
ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
vorr.s16 q0, q4
vorr.s16 q1, q5 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
vorr.s16 q6, q8, q12
vorr.s16 q7, q9, q13 // output d12~d15
vqmovn.u64 d4, q0 // 8bytes-->4bytes
vqmovn.u64 d6, q6
vqmovn.u64 d5, q1
vqmovn.u64 d7, q7
vqmovn.u32 d8, q2 // 4bytes-->2bytes
vqmovn.u32 d9, q3
vceq.i16 q5, q4, #0
vmvn q5, q5
vabs.s16 q5, q5
vmovn.u16 d10, q5 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
// Chroma
vld1.i16 {q0, q1}, [r0]!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]! //load Cb block,
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]! //load Cr block,
ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, q4, q5, q6, q7, q4, q6
vorr.s16 q0, q2
vorr.s16 q1, q4, q6 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q9, q10, q11, q12, q13, q14, q15, q12, q14
vorr.s16 q2, q8, q10
vorr.s16 q3, q12, q14 // output d4~d7
vqmovn.u64 d8, q0 // 8bytes-->4bytes
vqmovn.u64 d10, q2
vqmovn.u64 d9, q1
vqmovn.u64 d11, q3
vqmovn.u32 d12, q4 // 4bytes-->2bytes
vqmovn.u32 d13, q5
vceq.i16 q7, q6, #0
vmvn q7, q7
vabs.s16 q7, q7
vmovn.u16 d10, q7 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
WELS_ASM_FUNC_END
// r0 int16_t * block,
// r1 int32_t stride
WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t)
push {r2}
mov r2, #16
// each row 16 elements, 16*sizeof(int16_t)
// memset(ptr_dest, 0, 16*sizeof(int16_t));
// ptr_dest += stride;
lsl r1, r1, #1 // r1 = 2*r1
veor.i16 q0, q0, q0
veor.i16 q1, q1, q1
block_zero_16x16_luma_loop:
vst1.i16 {q0, q1}, [r0], r1
subs r2, r2, #2
vst1.i16 {q0, q1}, [r0], r1
bne block_zero_16x16_luma_loop
pop {r2}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon// can use for 64*sizeof(int16_t)
push {r2}
mov r2, #8
// each row 8 elements, 8*sizeof(int16_t)
// memset(ptr_dest, 0, 8*sizeof(int16_t));
// ptr_dest += stride;
lsl r1, r1, #1
veor.i16 q0, q0, q0
block_zero_8x8_chma_loop:
vst1.i16 {q0}, [r0], r1
subs r2, r2, #2
vst1.i16 {q0}, [r0], r1
bne block_zero_8x8_chma_loop
pop {r2}
WELS_ASM_FUNC_END
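The commented-out memset lines above state the intent; a scalar sketch of the 16x16 case (helper name hypothetical) makes the stride convention explicit: iStride counts int16_t elements, which is why the assembly doubles it with lsl #1 before using it as a byte offset.

#include <stdint.h>
#include <string.h>

// Hypothetical scalar sketch of WelsResBlockZero16x16_neon.
static void ResBlockZero16x16_ref (int16_t* pBlock, int32_t iStride) {
  for (int32_t i = 0; i < 16; i++) {
    memset (pBlock, 0, 16 * sizeof (int16_t));   // clear one 16-coefficient row
    pBlock += iStride;                           // stride is in int16_t units
  }
}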
// r0 int8_t* dst_addr,
// r1 memset_value
// r2 int32_t bytes_nmb,
WELS_ASM_FUNC_BEGIN svc_block_memset_neon// dst should continue
vdup.u8 q0, r1
vdup.u8 q1, r1
block_memset_loop:
vst1.64 {q0, q1}, [r0,:64]!
subs r2, r2, #64
vst1.64 {q0, q1}, [r0,:64]!
bne block_memset_loop
WELS_ASM_FUNC_END
// int16_t* dst,
// int16_t* src,
// int32_t stride
WELS_ASM_FUNC_BEGIN svc_block_copy_16x16_neon
push {r3}
mov r3, #16
// each element is sizeof(int16_t)
lsl r2, r2, #1 // r2 = 2*r2
block_copy_16x16_luma_loop:
vld1.i16 {q0, q1}, [r1], r2
subs r3, r3, #1
vst1.i16 {q0, q1}, [r0]!
bne block_copy_16x16_luma_loop
pop {r3}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN svc_block_copy_8x8_neon
push {r3}
mov r3, #8
// each element is sizeof(int16_t)
lsl r2, r2, #1 // r2 = 2*r2
block_copy_8x8_chma_loop:
vld1.i16 {q0}, [r1], r2
subs r3, r3, #1
vst1.i16 {q0}, [r0]!
bne block_copy_8x8_chma_loop
pop {r3}
WELS_ASM_FUNC_END
// r0 uint8_t * dest,
// r1 uint8_t * pred,
// r2 int16_t * res,
// r3 int32_t stride,
WELS_ASM_FUNC_BEGIN svc_block_add_16x16_neon
push {r4}
mov r4, #16
pld [r1]
block_recon_16x16_luma_loop:
vld1.64 {d16,d17}, [r1,:64], r3 //load 16 pred data, update addr
vld1.s16 {q0, q1}, [r2]! //load 8+8 resi data, update addr
vld1.64 {d18,d19}, [r1,:64], r3
vld1.s16 {q2, q3}, [r2]!
ADD_PRED_1BYTE_TO_RESID_2BYTES q0, q1, q2, q3, d16, d17, d18, d19
pld [r1]
vst1.64 {q8}, [r0], r3 //store result
vst1.64 {q9}, [r0], r3
//#ifdef DEBUG_NEON
// vst1.u8 {q8}, [r0]!
// vst1.u8 {q9}, [r0]!
//#endif
vld1.64 {d20,d21}, [r1,:64], r3 //load 16 pred data, update addr
vld1.s16 {q4, q5}, [r2]! //load 8+8 resi data, update addr
vld1.64 {d22,d23}, [r1,:64], r3
vld1.s16 {q6, q7}, [r2]!
ADD_PRED_1BYTE_TO_RESID_2BYTES q4, q5, q6, q7, d20, d21, d22, d23
pld [r1]
vst1.64 {q10}, [r0], r3
vst1.64 {q11}, [r0], r3
//#ifdef DEBUG_NEON
// vst1.u8 {q10}, [r0]!
// vst1.u8 {q11}, [r0]!
//#endif
subs r4, r4, #4
bne block_recon_16x16_luma_loop
pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN svc_block_add_8x8_neon
vld1.u8 {d24}, [r1], r3 //load 8 pred data
vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12
vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d26}, [r1], r3 //load 8 pred data
vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13
ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27
pld [r1]
vst1.u8 {d24}, [r0], r3 //store result
vst1.u8 {d25}, [r0], r3 //store result
vst1.u8 {d26}, [r0], r3 //store result
vst1.u8 {d27}, [r0], r3 //store result
//#ifdef DEBUG_NEON
// vst1.u8 {d24}, [r0]!
//#endif
vld1.u8 {d24}, [r1], r3 //load 8 pred data
vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12
vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d26}, [r1], r3 //load 8 pred data
vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13
ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27
vst1.u8 {d24}, [r0], r3 //store result
vst1.u8 {d25}, [r0], r3 //store result
vst1.u8 {d26}, [r0], r3 //store result
vst1.u8 {d27}, [r0], r3 //store result
//#ifdef DEBUG_NEON
// vst1.u8 {d24}, [r0]!
//#endif
WELS_ASM_FUNC_END
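In scalar form (sketch, helper name hypothetical), ADD_PRED_1BYTE_TO_RESID_2BYTES widens the 8-bit prediction, adds the 16-bit residual, and saturates back to [0, 255]; the residual rows arrive contiguously because they are loaded with post-increment, while prediction and destination step by the stride.

#include <stdint.h>

// Hypothetical scalar sketch of svc_block_add_16x16_neon / svc_block_add_8x8_neon.
static void BlockAdd_ref (uint8_t* pDest, const uint8_t* pPred, const int16_t* pRes,
                          int32_t iStride, int32_t iWidth, int32_t iHeight) {
  for (int32_t y = 0; y < iHeight; y++) {
    for (int32_t x = 0; x < iWidth; x++) {
      int32_t iValue = pPred[x] + pRes[x];
      pDest[x] = (uint8_t) (iValue < 0 ? 0 : (iValue > 255 ? 255 : iValue));   // as vqmovun.s16
    }
    pRes  += iWidth;     // residual is packed row after row
    pPred += iStride;
    pDest += iStride;
  }
}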
// int16_t* dst,
// int16_t* src,
// int stride
WELS_ASM_FUNC_BEGIN svc_simple_idct4x4_neon
vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles!
lsl r2, r2, #1
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
vrshrn.s32 d0, q0, #6
vst1.s16 {d0}, [r0], r2 //store
vrshrn.s32 d1, q1, #6
vst1.s16 {d1}, [r0], r2 //store
vrshrn.s32 d2, q2, #6
vst1.s16 {d2}, [r0], r2 //store
vrshrn.s32 d3, q3, #6
vst1.s16 {d3}, [r0], r2 //store
WELS_ASM_FUNC_END
// int16_t* dst,
// int16_t* src,
// int stride
WELS_ASM_FUNC_BEGIN svc_idct4x4_add_neon
vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles!
lsl r2, r2, #1
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
//see draft G.8.5.3 , after clip_rs() into [-255, 255]
vmov.i16 q10,#0xFF
veor q11, q11
vsub.i16 q11, q11,q10
// vmvn.i16 q11,#0xFF
mov r1, r0
vld1.s16 {d16}, [r0], r2
vld1.s16 {d17}, [r0], r2
ADD_AND_CLIP_RS q0, q1, q10, q11, q8, d8, d9, q4
vst1.s16 {d8}, [r1], r2 //store
vst1.s16 {d9}, [r1], r2 //store
vld1.s16 {d18}, [r0], r2
vld1.s16 {d19}, [r0], r2
ADD_AND_CLIP_RS q2, q3, q10, q11, q9, d10, d11, q5
vst1.s16 {d10}, [r1], r2 //store
vst1.s16 {d11}, [r1], r2 //store
WELS_ASM_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
//after clip_table[MAX_NEG_CROP] into [0, 255]
mov r2, r0
vld1.32 {d12[0]},[r0],r1
vld1.32 {d12[1]},[r0],r1
vld1.32 {d14[0]},[r0],r1
vld1.32 {d14[1]},[r0]
vrshrn.s32 d8, q0, #6
vrshrn.s32 d9, q1, #6
vrshrn.s32 d10, q2, #6
vrshrn.s32 d11, q3, #6
vmovl.u8 q0,d12
vmovl.u8 q1,d14
vadd.s16 q0,q4
vadd.s16 q1,q5
vqmovun.s16 d12,q0
vqmovun.s16 d14,q1
vst1.32 {d12[0]},[r2],r1
vst1.32 {d12[1]},[r2],r1
vst1.32 {d14[0]},[r2],r1
vst1.32 {d14[1]},[r2]
WELS_ASM_FUNC_END
#endif

File diff suppressed because it is too large.


@@ -0,0 +1,746 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
//Global macro
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
//Global macro
.macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endmacro
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
/*
* void get_i16x16_luma_pred_v(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
//Get the top line data to 'q0'
sub r2, r0, r1
vldm r2, {d0, d1}
mov r2, r0
mov r3, #4
//Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
/*
* void get_i16x16_luma_pred_h(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
sub r2, r0, #1
mov r3, #4
loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r2], r1
vld1.8 {d2[],d3[]}, [r2], r1
vld1.8 {d4[],d5[]}, [r2], r1
vld1.8 {d6[],d7[]}, [r2], r1
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d2,d3}, [r0], r1
vst1.8 {d4,d5}, [r0], r1
vst1.8 {d6,d7}, [r0], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
/*
* void get_i16x16_luma_pred_dc_both(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
//stmdb sp!, { r2-r5, lr}
//Get the left vertical line data
sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1
GET_8BYTE_DATA d1, r2, r1
//Get the top horizontal line data
sub r2, r0, r1
vldm r2, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB
mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
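A scalar sketch of the DC path above (helper name hypothetical): sum the 16 top neighbours and 16 left neighbours, round with +16 and shift by 5 (the vrshr.u16 #5 step), then flood-fill the 16x16 block.

#include <stdint.h>

// Hypothetical scalar sketch of WelsDecoderI16x16LumaPredDc_neon.
static void I16x16LumaPredDc_ref (uint8_t* pPred, const int32_t kiStride) {
  int32_t iSum = 0;
  for (int32_t i = 0; i < 16; i++)
    iSum += pPred[i - kiStride] + pPred[i * kiStride - 1];   // top row + left column
  const uint8_t kuiDc = (uint8_t) ((iSum + 16) >> 5);
  for (int32_t y = 0; y < 16; y++)
    for (int32_t x = 0; x < 16; x++)
      pPred[y * kiStride + x] = kuiDc;
}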
/*
* void get_i16x16_luma_pred_plane(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
//The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
//stmdb sp!, { r2-r5, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r2]
//Pack the top[-1] ~ top[6] to d1
sub r2, r0, r1
sub r3, r2, #1
vld1.8 d1, [r3]
//Pack the top[8] ~ top[15] to d2
add r3, #9
vld1.8 d2, [r3]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4
sub r3, r2, #1
GET_8BYTE_DATA d4, r3, r1
//Get and pack left[8] ~ left[15] to d3
add r3, r1
GET_8BYTE_DATA d3, r3, r1
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//revert the sequence of d2,d3
vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r2]
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q5, q1, #3
vadd.s16 q5, q3
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
//Set the line of MB
vst1.u32 {d0,d1}, [r0], r1
//Do the same processing for setting other lines
mov r2, #15
loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2
vadd.s16 q5, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
vst1.u32 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
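The plane path above follows the usual H.264 Intra_16x16_Plane construction; a scalar sketch (helper names hypothetical) of what the *5 table, the #6 shifts and the final 5-bit rounding compute:

#include <stdint.h>

static uint8_t ClipPlane255 (int32_t iValue) {
  return (uint8_t) (iValue < 0 ? 0 : (iValue > 255 ? 255 : iValue));
}

// Hypothetical scalar sketch of WelsDecoderI16x16LumaPredPlane_neon.
static void I16x16LumaPredPlane_ref (uint8_t* pPred, const int32_t kiStride) {
  const uint8_t* pTop = pPred - kiStride;                    // top neighbours
  int32_t iH = 0, iV = 0;
  for (int32_t i = 1; i <= 8; i++) {
    iH += i * (pTop[7 + i] - pTop[7 - i]);
    iV += i * (pPred[(7 + i) * kiStride - 1] - pPred[(7 - i) * kiStride - 1]);
  }
  const int32_t a = 16 * (pTop[15] + pPred[15 * kiStride - 1]);
  const int32_t b = (5 * iH + 32) >> 6;                      // matches the *5 table and >>6
  const int32_t c = (5 * iV + 32) >> 6;
  for (int32_t y = 0; y < 16; y++)
    for (int32_t x = 0; x < 16; x++)
      pPred[y * kiStride + x] = ClipPlane255 ((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}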
/*
* void get_i4x4_luma_pred_v(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
ldr r2, [r2]
//Set the luma MB using top line
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_h(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2]
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1
vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_d_l(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0], r1
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0], r1
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0], r1
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
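In scalar terms (sketch, helper name hypothetical), the diagonal-down-left prediction above is a rounded 1-2-1 filter along the eight top neighbours, with the last neighbour duplicated once, which is what the "t7 + (t7<<1)" note refers to.

#include <stdint.h>

// Hypothetical scalar sketch of WelsDecoderI4x4LumaPredDDL_neon.
static void I4x4LumaPredDDL_ref (uint8_t* pPred, const int32_t kiStride) {
  const uint8_t* pTop = pPred - kiStride;      // t0..t7
  uint8_t t[9];
  for (int32_t i = 0; i < 8; i++)
    t[i] = pTop[i];
  t[8] = pTop[7];                              // duplicate t7 past the end
  for (int32_t y = 0; y < 4; y++)
    for (int32_t x = 0; x < 4; x++)
      pPred[y * kiStride + x] =
          (uint8_t) ((t[x + y] + 2 * t[x + y + 1] + t[x + y + 2] + 2) >> 2);
}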
/*
* void get_i4x4_luma_pred_d_r(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//Load the left column (5 bytes)
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2], r1
vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_v_l(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0], r1
vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_v_r(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//Load the left column (4 bytes)
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2]
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0], r1
vst1.32 d1[1], [r0], r1
add r2, r0, r1
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r2]!
vst1.16 d1[2], [r2]!
vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END
/*
* get_i4x4_luma_pred_h_u(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
//NO TEST
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data
sub r2, r0, #1
mov r3, #3
mul r3, r1
add r3, r2
vld1.8 {d0[]}, [r3]
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0], r1
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0], r1
vst1.32 d1[1], [r0], r1
vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_h_d(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
//stmdb sp!, { r2-r5, lr}
//Load the data
sub r2, r0, r1
sub r2, #1
vld1.32 {d0[1]}, [r2], r1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0], r1
vst2.16 {d0[2], d1[2]}, [r0], r1
vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0], r1
vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
/*
* void get_i_chroma_pred_v(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Get the top row (8 byte)
sub r2, r0, r1
vldr d0, [r2]
//Set the chroma MB using top row data
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
/*
* void get_i_chroma_pred_h(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
//stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte)
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2], r1
vld1.8 {d4[]}, [r2], r1
vld1.8 {d5[]}, [r2], r1
vld1.8 {d6[]}, [r2], r1
vld1.8 {d7[]}, [r2]
//Set the chroma MB using left column data
vst1.8 {d0}, [r0], r1
vst1.8 {d1}, [r0], r1
vst1.8 {d2}, [r0], r1
vst1.8 {d3}, [r0], r1
vst1.8 {d4}, [r0], r1
vst1.8 {d5}, [r0], r1
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
/*
* void get_i_chroma_pred_dc_both(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data (8 bytes)
sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1
//Load the top row data (8 bytes)
sub r2, r0, r1
vldr d1, [r2]
//Calculate the sum of left column and top row
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vdup.8 d4, d2[0]
vdup.8 d5, d1[4]
vdup.8 d6, d0[4]
vdup.8 d7, d2[4]
//Set the chroma MB
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0]
WELS_ASM_FUNC_END
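A scalar sketch of the chroma DC path above (helper name hypothetical): the four 4x4 quadrants take separate DC values, the m1..m4 of the comments, built from the top row and/or left column with rounding shifts of 3 or 2 bits.

#include <stdint.h>

// Hypothetical scalar sketch of WelsDecoderIChromaPredDC_neon.
static void IChromaPredDc_ref (uint8_t* pPred, const int32_t kiStride) {
  const uint8_t* pTop = pPred - kiStride;
  int32_t iTopL = 0, iTopR = 0, iLeftT = 0, iLeftB = 0;
  for (int32_t i = 0; i < 4; i++) {
    iTopL  += pTop[i];
    iTopR  += pTop[4 + i];
    iLeftT += pPred[i * kiStride - 1];
    iLeftB += pPred[(4 + i) * kiStride - 1];
  }
  const uint8_t m1 = (uint8_t) ((iTopL + iLeftT + 4) >> 3);   // top-left quadrant
  const uint8_t m2 = (uint8_t) ((iTopR + 2) >> 2);            // top-right quadrant
  const uint8_t m3 = (uint8_t) ((iLeftB + 2) >> 2);           // bottom-left quadrant
  const uint8_t m4 = (uint8_t) ((iTopR + iLeftB + 4) >> 3);   // bottom-right quadrant
  for (int32_t y = 0; y < 8; y++)
    for (int32_t x = 0; x < 8; x++)
      pPred[y * kiStride + x] = (y < 4) ? ((x < 4) ? m1 : m2) : ((x < 4) ? m3 : m4);
}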
/*
* void get_i_chroma_pred_plane(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
//Table {{1,2,3,4,1,2,3,4}*17}
CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x28231e19
//Table {-3,-2,-1,0,1,2,3,4}
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data
sub r2, r0, #1
sub r2, r1
vld1.32 {d1[0]}, [r2]
add r2, #5
vld1.32 {d0[0]}, [r2]
//Load the left column data
sub r2, #5
vld1.8 {d1[4]}, [r2], r1
vld1.8 {d1[5]}, [r2], r1
vld1.8 {d1[6]}, [r2], r1
vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r2, r1
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1
vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r2]
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2
vmul.s16 q0, q1
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r2]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB
vst1.u32 {d0}, [r0], r1
//Do the same processing for each line.
mov r2, #7
loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0], r1
subs r2, #1
bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
#endif

codec/decoder/core/arm/mc_neon.S (new executable file, 1621 lines added)

File diff suppressed because it is too large.


@@ -50,6 +50,10 @@ extern "C" {
void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
#endif//X86_ASM
#if defined(HAVE_NEON)
void IdctResAddPred_neon(uint8_t *pred, const int32_t stride, int16_t *rs);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus


@@ -68,6 +68,12 @@ void WelsChromaDcIdct (int16_t* pBlock);
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
void WelsResBlockZero16x16_neon(int16_t* pBlock, int32_t iStride);
void WelsResBlockZero8x8_neon(int16_t* pBlock, int32_t iStride);
void SetNonZeroCount_neon(int16_t* pBlock, int8_t* pNonZeroCount);
#endif
#ifdef X86_ASM
void WelsResBlockZero16x16_sse2 (int16_t* pBlock, int32_t iStride);
void WelsResBlockZero8x8_sse2 (int16_t* pBlock, int32_t iStride);


@@ -107,6 +107,27 @@ void WelsDecoderI4x4LumaPredDDL_mmx (uint8_t* pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVL_mmx (uint8_t* pPred, const int32_t kiStride);
#endif//X86_ASM
#if defined(HAVE_NEON)
void WelsDecoderI16x16LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredDc_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredDDL_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredDDR_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVL_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVR_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredHU_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredHD_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredDC_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
#endif//HAVE_NEON
#if defined(__cplusplus)
}
#endif//__cplusplus


@@ -39,6 +39,7 @@
*/
#include "deblocking.h"
#include "deblocking_common.h"
#include "cpu_core.h"
namespace WelsDec {
@@ -718,6 +719,19 @@ void DeblockingInit (SDeblockingFunc* pFunc, int32_t iCpu) {
}
#endif
#if defined(HAVE_NEON)
{
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;
pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_neon;
pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_neon;
pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_neon;
pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_neon;
pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_neon;
pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_neon;
}
#endif
}
} // namespace WelsDec


@@ -1148,6 +1148,12 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_sse2;
}
#endif
#ifdef HAVE_NEON
pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon;
pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon;
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
#endif
}
void WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) {
WelsBlockInit (pBlock, 16, 16, iStride, 0);

View File

@ -655,6 +655,30 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
InitDctClipTable();
pCtx->pIdctResAddPredFunc = IdctResAddPred_c;
#if defined(HAVE_NEON)
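// Route the IDCT-residual add and the intra-prediction function pointers to their NEON implementations when NEON is enabled at build time.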
pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon;
pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon;
pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon;
pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon;
pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon;
#endif//HAVE_NEON
#if defined(X86_ASM)
if (pCtx->uiCpuFlag & WELS_CPU_MMXEXT) {

View File

@ -636,7 +636,7 @@ void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_
void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
McChromaWidthEq4_mmx,
McChromaWidthEq8_sse2
};
const int32_t kiD8x = iMvX & 0x07;
@ -651,17 +651,334 @@ void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int3
McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
}
#endif //X86_ASM
//***************************************************************************//
// NEON implementation //
//***************************************************************************//
#if defined(HAVE_NEON)
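// Thin C wrappers that dispatch on block width (16/8/4) to the width-specialized NEON assembly kernels.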
void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (16 == iWidth)
McCopyWidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if(8 == iWidth)
McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else
McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer01WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer01WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer01WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer03WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer03WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer03WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer10WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer10WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer10WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
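// Quarter-pel (1,1): interpolate the horizontal and vertical half-pel planes into 16-byte-aligned
// temporaries (stride 16, up to 16x16), then average them into the destination.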
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
}
void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
}
void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
}
void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer30WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer30WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer30WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16) {
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8){
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
}
void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
{
static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = //[x][y]
{
{McCopy_neon, McHorVer01_neon, McHorVer02_neon, McHorVer03_neon},
{McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
{McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon},
{McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
};
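// Dispatch on the quarter-pel fractional parts of the MV; the integer-pel offset is applied by the caller
// (hence the commented-out adjustment below).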
// pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
}
void McChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
{
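// Chroma MVs use 1/8-pel precision: (0,0) degenerates to a plain copy, otherwise pick the bilinear weights
// from g_kuiABCD by the 3-bit fractional parts; width-2 blocks fall back to the C paths.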
if (0 == iMvX && 0 == iMvY)
{
if(8 == iWidth)
McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if(iWidth == 4)
McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else //here iWidth == 2
McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);
}
else
{
const int32_t kiD8x = iMvX & 0x07;
const int32_t kiD8y = iMvY & 0x07;
if(8 == iWidth)
McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
else if(4 == iWidth)
McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
else //here iWidth == 2
McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
}
}
#endif
void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) {
pMcFunc->pMcLumaFunc = McLuma_c;
pMcFunc->pMcChromaFunc = McChroma_c;
#ifdef HAVE_NEON
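// NEON builds always install the NEON luma/chroma MC wrappers; the X86 path below still checks the runtime CPU flags.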
pMcFunc->pMcLumaFunc = McLuma_neon;
pMcFunc->pMcChromaFunc = McChroma_neon;
#endif
#if defined (X86_ASM)
if (iCpu & WELS_CPU_SSE2) {
pMcFunc->pMcLumaFunc = McLuma_sse2;
pMcFunc->pMcChromaFunc = McChroma_sse2;
}
#endif //(X86_ASM)
}