Modify code style, remove trailing space.

This commit is contained in:
Licai Guo
2014-03-03 15:42:01 +08:00
parent b7a25df13f
commit 7768cd0a98
15 changed files with 1636 additions and 1972 deletions

View File

@@ -71,7 +71,7 @@
4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; }; 4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; };
4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; }; 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; };
4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; }; 4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; };
4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; }; 4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; }; 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */

View File

@@ -84,9 +84,9 @@
4CE4464E18BC5EAA0017DF25 /* decoder_context.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_context.h; sourceTree = "<group>"; }; 4CE4464E18BC5EAA0017DF25 /* decoder_context.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_context.h; sourceTree = "<group>"; };
4CE4464F18BC5EAA0017DF25 /* decoder_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_core.h; sourceTree = "<group>"; }; 4CE4464F18BC5EAA0017DF25 /* decoder_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_core.h; sourceTree = "<group>"; };
4CE4465018BC5EAA0017DF25 /* error_code.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = error_code.h; sourceTree = "<group>"; }; 4CE4465018BC5EAA0017DF25 /* error_code.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = error_code.h; sourceTree = "<group>"; };
4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; }; 4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; usesTabs = 1; };
4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; }; 4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; usesTabs = 1; };
4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; }; 4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = manage_dec_ref.h; sourceTree = "<group>"; }; 4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = manage_dec_ref.h; sourceTree = "<group>"; };
4CE4465518BC5EAA0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; }; 4CE4465518BC5EAA0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; };
4CE4465618BC5EAA0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; }; 4CE4465618BC5EAA0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
@@ -105,19 +105,19 @@
4CE4466318BC5EAA0017DF25 /* vlc_decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vlc_decoder.h; sourceTree = "<group>"; }; 4CE4466318BC5EAA0017DF25 /* vlc_decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vlc_decoder.h; sourceTree = "<group>"; };
4CE4466418BC5EAA0017DF25 /* wels_common_basis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_common_basis.h; sourceTree = "<group>"; }; 4CE4466418BC5EAA0017DF25 /* wels_common_basis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_common_basis.h; sourceTree = "<group>"; };
4CE4466518BC5EAA0017DF25 /* wels_const.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_const.h; sourceTree = "<group>"; }; 4CE4466518BC5EAA0017DF25 /* wels_const.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_const.h; sourceTree = "<group>"; };
4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 1; }; 4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 0; };
4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 1; }; 4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 0; };
4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; }; 4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; tabWidth = 2; };
4CE4466A18BC5EAA0017DF25 /* decode_mb_aux.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_mb_aux.cpp; sourceTree = "<group>"; }; 4CE4466A18BC5EAA0017DF25 /* decode_mb_aux.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_mb_aux.cpp; sourceTree = "<group>"; };
4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 1; }; 4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 0; };
4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; }; 4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
4CE4466D18BC5EAA0017DF25 /* decoder_core.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_core.cpp; sourceTree = "<group>"; }; 4CE4466D18BC5EAA0017DF25 /* decoder_core.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_core.cpp; sourceTree = "<group>"; };
4CE4466E18BC5EAA0017DF25 /* decoder_data_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_data_tables.cpp; sourceTree = "<group>"; }; 4CE4466E18BC5EAA0017DF25 /* decoder_data_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_data_tables.cpp; sourceTree = "<group>"; };
4CE4466F18BC5EAA0017DF25 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_pic.cpp; sourceTree = "<group>"; }; 4CE4466F18BC5EAA0017DF25 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_pic.cpp; sourceTree = "<group>"; };
4CE4467018BC5EAA0017DF25 /* fmo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fmo.cpp; sourceTree = "<group>"; }; 4CE4467018BC5EAA0017DF25 /* fmo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fmo.cpp; sourceTree = "<group>"; };
4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; }; 4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; };
4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = manage_dec_ref.cpp; sourceTree = "<group>"; }; 4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = manage_dec_ref.cpp; sourceTree = "<group>"; };
4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 1; wrapsLines = 1; }; 4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 0; wrapsLines = 1; };
4CE4467418BC5EAA0017DF25 /* mem_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mem_align.cpp; sourceTree = "<group>"; }; 4CE4467418BC5EAA0017DF25 /* mem_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mem_align.cpp; sourceTree = "<group>"; };
4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memmgr_nal_unit.cpp; sourceTree = "<group>"; }; 4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memmgr_nal_unit.cpp; sourceTree = "<group>"; };
4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; }; 4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; };

View File

@@ -218,25 +218,23 @@ void WelsXmmRegEmptyOp(void * pSrc) {
#if defined(ANDROID_NDK) #if defined(ANDROID_NDK)
uint32_t WelsCPUFeatureDetectAndroid() uint32_t WelsCPUFeatureDetectAndroid()
{ {
uint32_t uiCPU = 0; uint32_t uiCPU = 0;
AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN; AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
uint64_t uiFeatures = 0; uint64_t uiFeatures = 0;
cpuFamily = android_getCpuFamily();
cpuFamily = android_getCpuFamily(); if (cpuFamily == ANDROID_CPU_FAMILY_ARM) {
if (cpuFamily == ANDROID_CPU_FAMILY_ARM) uiFeatures = android_getCpuFeatures();
{ if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
uiFeatures = android_getCpuFeatures(); uiCPU |= WELS_CPU_ARMv7;
if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){ }
uiCPU |= WELS_CPU_ARMv7; if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
} uiCPU |= WELS_CPU_VFPv3;
if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){ }
uiCPU |= WELS_CPU_VFPv3; if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
} uiCPU |= WELS_CPU_NEON;
if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){ }
uiCPU |= WELS_CPU_NEON; }
} return uiCPU;
}
return uiCPU;
} }
#endif #endif
@@ -246,9 +244,8 @@ uint32_t WelsCPUFeatureDetectIOS() //Need to be updated for the new device of AP
{ {
uint32_t uiCPU = 0; uint32_t uiCPU = 0;
struct utsname sSystemInfo; struct utsname sSystemInfo;
uname (&sSystemInfo); uname (&sSystemInfo);
if ((0 != strcmp(sSystemInfo.machine, "iPhone1,1")) && //iPhone 2G if ((0 != strcmp(sSystemInfo.machine, "iPhone1,1")) && //iPhone 2G
(0 != strcmp(sSystemInfo.machine, "iPhone1,2")) && //iPhone 3G (0 != strcmp(sSystemInfo.machine, "iPhone1,2")) && //iPhone 3G
(0 != strcmp(sSystemInfo.machine, "iPod1,1")) && //iPod 1G (0 != strcmp(sSystemInfo.machine, "iPod1,1")) && //iPod 1G

View File

@@ -82,12 +82,12 @@ void WelsXmmRegEmptyOp(void * pSrc);
#if defined(ANDROID_NDK) #if defined(ANDROID_NDK)
uint32_t WelsCPUFeatureDetectAndroid(); uint32_t WelsCPUFeatureDetectAndroid();
#endif #endif
#if defined(APPLE_IOS) #if defined(APPLE_IOS)
uint32_t WelsCPUFeatureDetectIOS(); uint32_t WelsCPUFeatureDetectIOS();
#endif #endif
#endif #endif
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif//__cplusplus #endif//__cplusplus

View File

@@ -37,13 +37,13 @@ void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
#if defined(HAVE_NEON) #if defined(HAVE_NEON)
void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc); void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta); void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc); void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta); void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC); void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta); void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC); void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta); void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@@ -40,72 +40,44 @@ extern "C" {
#endif//__cplusplus #endif//__cplusplus
#if defined(HAVE_NEON) #if defined(HAVE_NEON)
/*
void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McLuma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
void McChroma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
*/
void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight); void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight); void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//vertical filter to gain half sample, that is (0, 2) location in quarter sample //vertical filter to gain half sample, that is (0, 2) location in quarter sample
void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);

View File

@@ -29,7 +29,7 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
* *
*/ */
#ifdef HAVE_NEON #ifdef HAVE_NEON
.text .text
#include "arm_arch_common_macro.S" #include "arm_arch_common_macro.S"
@@ -37,32 +37,32 @@
.macro ROW_TRANSFORM_1_STEP .macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2]; vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2]; vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1 vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1 vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3]; vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1); vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// } // }
.endm .endm
.macro TRANSFORM_4BYTES // both row & col transform used .macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3]; // { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3]; vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2]; vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2]; vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3]; vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// } // }
.endm .endm
.macro COL_TRANSFORM_1_STEP .macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3]; // { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j]; vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j]; vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1 vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1 vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// } // }
.endm .endm
@@ -70,101 +70,101 @@
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9 // { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2]; vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2]; vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 \arg8, \arg1, #1 vshr.s16 \arg8, \arg1, #1
vshr.s16 \arg9, \arg3, #1 vshr.s16 \arg9, \arg3, #1
vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3]; vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1); vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
// } // }
.endm .endm
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used .macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3]; // { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3]; vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2]; vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2]; vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3]; vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// } // }
.endm .endm
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 .macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3]; // { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j]; vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j]; vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 \arg6, \arg1, #1 vshr.s32 \arg6, \arg1, #1
vshr.s32 \arg7, \arg3, #1 vshr.s32 \arg7, \arg3, #1
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j]; vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1); vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// } // }
.endm .endm
#endif #endif
// r0 int16_t* block, // r0 int16_t* block,
// r1 int8_t* non_zero_count, // r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
vld1.64 {d0-d2}, [r1] vld1.64 {d0-d2}, [r1]
vceq.s8 q0, q0, #0 vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0 vceq.s8 d2, d2, #0
vmvn q0, q0 vmvn q0, q0
vmvn d2, d2 vmvn d2, d2
vabs.s8 q0, q0 vabs.s8 q0, q0
vabs.s8 d2, d2 vabs.s8 d2, d2
vst1.64 {d0-d2}, [r1] vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
// r0 int16_t * block, // r0 int16_t * block,
// r1 int32_t stride // r1 int32_t stride
WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t) WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t)
push {r2} push {r2}
mov r2, #16 mov r2, #16
// each row 16 elements, 16*sizeof(int16_t) // each row 16 elements, 16*sizeof(int16_t)
// memset(ptr_dest, 0, 16*sizeof(int16_t)); // memset(ptr_dest, 0, 16*sizeof(int16_t));
// ptr_dest += stride; // ptr_dest += stride;
lsl r1, r1, #1 // r1 = 2*r1 lsl r1, r1, #1 // r1 = 2*r1
veor.i16 q0, q0, q0 veor.i16 q0, q0, q0
veor.i16 q1, q1, q1 veor.i16 q1, q1, q1
block_zero_16x16_luma_loop: block_zero_16x16_luma_loop:
vst1.i16 {q0, q1}, [r0], r1 vst1.i16 {q0, q1}, [r0], r1
subs r2, r2, #2 subs r2, r2, #2
vst1.i16 {q0, q1}, [r0], r1 vst1.i16 {q0, q1}, [r0], r1
bne block_zero_16x16_luma_loop bne block_zero_16x16_luma_loop
pop {r2} pop {r2}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon// can use for 64*sizeof(int16_t) WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon// can use for 64*sizeof(int16_t)
push {r2} push {r2}
mov r2, #8 mov r2, #8
// each row 8 elements, 8*sizeof(int16_t) // each row 8 elements, 8*sizeof(int16_t)
// memset(ptr_dest, 0, 8*sizeof(int16_t)); // memset(ptr_dest, 0, 8*sizeof(int16_t));
// ptr_dest += stride; // ptr_dest += stride;
lsl r1, r1, #1 lsl r1, r1, #1
veor.i16 q0, q0, q0 veor.i16 q0, q0, q0
block_zero_8x8_chma_loop: block_zero_8x8_chma_loop:
vst1.i16 {q0}, [r0], r1 vst1.i16 {q0}, [r0], r1
subs r2, r2, #2 subs r2, r2, #2
vst1.i16 {q0}, [r0], r1 vst1.i16 {q0}, [r0], r1
bne block_zero_8x8_chma_loop bne block_zero_8x8_chma_loop
pop {r2} pop {r2}
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs // uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5 ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7 TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
// transform element 32bits // transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7] vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15] vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
@@ -172,9 +172,9 @@ block_zero_8x8_chma_loop:
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15] vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7 COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7 TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
//after clip_table[MAX_NEG_CROP] into [0, 255] //after clip_table[MAX_NEG_CROP] into [0, 255]
mov r2, r0 mov r2, r0
vld1.32 {d12[0]},[r0],r1 vld1.32 {d12[0]},[r0],r1
@@ -186,7 +186,7 @@ block_zero_8x8_chma_loop:
vrshrn.s32 d9, q1, #6 vrshrn.s32 d9, q1, #6
vrshrn.s32 d10, q2, #6 vrshrn.s32 d10, q2, #6
vrshrn.s32 d11, q3, #6 vrshrn.s32 d11, q3, #6
vmovl.u8 q0,d12 vmovl.u8 q0,d12
vmovl.u8 q1,d14 vmovl.u8 q1,d14
vadd.s16 q0,q4 vadd.s16 q0,q4
@@ -199,5 +199,5 @@ block_zero_8x8_chma_loop:
vst1.32 {d12[1]},[r2],r1 vst1.32 {d12[1]},[r2],r1
vst1.32 {d14[0]},[r2],r1 vst1.32 {d14[0]},[r2],r1
vst1.32 {d14[1]},[r2] vst1.32 {d14[1]},[r2]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
#endif #endif

View File

@@ -29,7 +29,7 @@
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
* *
*/ */
#ifdef HAVE_NEON #ifdef HAVE_NEON
//Global macro //Global macro
.text .text
@@ -61,79 +61,60 @@
.endm .endm
#endif #endif
/*
* void get_i16x16_luma_pred_v(uint8_t *pred, const int32_t stride) WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
//Get the top line data to 'q0' //Get the top line data to 'q0'
sub r2, r0, r1 sub r2, r0, r1
vldm r2, {d0, d1} vldm r2, {d0, d1}
mov r2, r0 mov r2, r0
mov r3, #4 mov r3, #4
//Set the top line to the each line of MB(16*16) //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v: loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r2], r1 vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1 vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1 vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1 vst1.8 {d0,d1}, [r2], r1
subs r3, #1 subs r3, #1
bne loop_0_get_i16x16_luma_pred_v bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/*
* void get_i16x16_luma_pred_h(uint8_t *pred, const int32_t stride) WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
sub r2, r0, #1 sub r2, r0, #1
mov r3, #4 mov r3, #4
loop_0_get_i16x16_luma_pred_h: loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side //Get one byte data from left side
vld1.8 {d0[],d1[]}, [r2], r1 vld1.8 {d0[],d1[]}, [r2], r1
vld1.8 {d2[],d3[]}, [r2], r1 vld1.8 {d2[],d3[]}, [r2], r1
vld1.8 {d4[],d5[]}, [r2], r1 vld1.8 {d4[],d5[]}, [r2], r1
vld1.8 {d6[],d7[]}, [r2], r1 vld1.8 {d6[],d7[]}, [r2], r1
//Set the line of MB using the left side byte data //Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0], r1 vst1.8 {d0,d1}, [r0], r1
vst1.8 {d2,d3}, [r0], r1 vst1.8 {d2,d3}, [r0], r1
vst1.8 {d4,d5}, [r0], r1 vst1.8 {d4,d5}, [r0], r1
vst1.8 {d6,d7}, [r0], r1 vst1.8 {d6,d7}, [r0], r1
subs r3, #1 subs r3, #1
bne loop_0_get_i16x16_luma_pred_h bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/* WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
* void get_i16x16_luma_pred_dc_both(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Get the left vertical line data //Get the left vertical line data
sub r2, r0, #1 sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1 GET_8BYTE_DATA d0, r2, r1
GET_8BYTE_DATA d1, r2, r1 GET_8BYTE_DATA d1, r2, r1
//Get the top horizontal line data //Get the top horizontal line data
sub r2, r0, r1 sub r2, r0, r1
vldm r2, {d2, d3} vldm r2, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data //Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0 vpaddl.u8 q0, q0
vpaddl.u8 q1, q1 vpaddl.u8 q1, q1
@@ -141,11 +122,11 @@ loop_0_get_i16x16_luma_pred_h:
vadd.u16 d0, d0, d1 vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0 vpaddl.u16 d0, d0
vpaddl.u32 d0, d0 vpaddl.u32 d0, d0
//Calculate the mean value //Calculate the mean value
vrshr.u16 d0, d0, #5 vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0] vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB //Set the mean value to the all of member of MB
mov r2, #4 mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both: loop_0_get_i16x16_luma_pred_dc_both:
@@ -154,28 +135,22 @@ loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0], r1 vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1 vst1.8 {d0,d1}, [r0], r1
subs r2, #1 subs r2, #1
bne loop_0_get_i16x16_luma_pred_dc_both bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/*
* void get_i16x16_luma_pred_plane(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
//The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5} //The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14 CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0} //The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5} //Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r2] vldr d0, [r2]
@@ -184,51 +159,51 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
sub r2, r0, r1 sub r2, r0, r1
sub r3, r2, #1 sub r3, r2, #1
vld1.8 d1, [r3] vld1.8 d1, [r3]
//Pack the top[8] ~ top[15] to d2 //Pack the top[8] ~ top[15] to d2
add r3, #9 add r3, #9
vld1.8 d2, [r3] vld1.8 d2, [r3]
//Save the top[15] to d6 for next step //Save the top[15] to d6 for next step
vdup.u8 d6, d2[7] vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4 //Get and pack left[-1] ~ left[6] to d4
sub r3, r2, #1 sub r3, r2, #1
GET_8BYTE_DATA d4, r3, r1 GET_8BYTE_DATA d4, r3, r1
//Get and pack left[8] ~ left[15] to d3 //Get and pack left[8] ~ left[15] to d3
add r3, r1 add r3, r1
GET_8BYTE_DATA d3, r3, r1 GET_8BYTE_DATA d3, r3, r1
//Save the left[15] to d7 for next step //Save the left[15] to d7 for next step
vdup.u8 d7, d3[7] vdup.u8 d7, d3[7]
//revert the sequence of d2,d3 //revert the sequence of d2,d3
vrev64.8 q1, q1 vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...} vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...} vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0 vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5} vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5} vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2 //Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3 vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5 vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0 vpaddl.s16 q0, q0
vpaddl.s32 q0, q0 vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2. //Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6 vrshr.s64 q0, #6
vdup.s16 q1, d0[0] vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0] vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0 //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r2] vld1.32 {d0}, [r2]
//Get the value of 'a' and save to q3 //Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7 vaddl.u8 q3, d6, d7
vshl.u16 q3, #4 vshl.u16 q3, #4
@@ -237,156 +212,132 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
vmovl.s8 q0, d0 vmovl.s8 q0, d0
vmla.s16 q3, q0, q1 vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0] vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7} //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q5, q1, #3 vshl.s16 q5, q1, #3
vadd.s16 q5, q3 vadd.s16 q5, q3
//right shift 5 bits and rounding //right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5 vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5 vqrshrun.s16 d1, q5, #5
//Set the line of MB //Set the line of MB
vst1.u32 {d0,d1}, [r0], r1 vst1.u32 {d0,d1}, [r0], r1
//Do the same processing for setting other lines //Do the same processing for setting other lines
mov r2, #15 mov r2, #15
loop_0_get_i16x16_luma_pred_plane: loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2 vadd.s16 q3, q2
vadd.s16 q5, q2 vadd.s16 q5, q2
vqrshrun.s16 d0, q3, #5 vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5 vqrshrun.s16 d1, q5, #5
vst1.u32 {d0,d1}, [r0], r1 vst1.u32 {d0,d1}, [r0], r1
subs r2, #1 subs r2, #1
bne loop_0_get_i16x16_luma_pred_plane bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
WELS_ASM_FUNC_END
/* WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
* void get_i4x4_luma_pred_v(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r2, r0, r1 sub r2, r0, r1
ldr r2, [r2] ldr r2, [r2]
//Set the luma MB using top line //Set the luma MB using top line
str r2, [r0], r1 str r2, [r0], r1
str r2, [r0], r1 str r2, [r0], r1
str r2, [r0], r1 str r2, [r0], r1
str r2, [r0] str r2, [r0]
WELS_ASM_FUNC_END
/* WELS_ASM_FUNC_END
* void get_i4x4_luma_pred_h(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes) //Load the left column (4 bytes)
sub r2, r0, #1 sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1 vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1 vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1 vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2] vld1.8 {d3[]}, [r2]
//Set the luma MB using the left side byte //Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0], r1 vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1 vst1.32 {d2[0]}, [r0], r1
vst1.32 {d3[0]}, [r0] vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_d_l(uint8_t *pred, const int32_t stride); WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes) //Load the top row data(8 bytes)
sub r2, r0, r1 sub r2, r0, r1
vld1.32 {d0}, [r2] vld1.32 {d0}, [r2]
//For "t7 + (t7<<1)" //For "t7 + (t7<<1)"
vdup.8 d1, d0[7] vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7" //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1 vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0 vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7" //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14 vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2 vadd.u16 q0, q1, q2
//right shift 2 bits and rounding //right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2 vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3" //Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
//Save "ddl1, ddl2, ddl3, ddl4" //Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2 vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
//Save "ddl2, ddl3, ddl4, ddl5" //Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3 vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
//Save "ddl3, ddl4, ddl5, ddl6" //Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0] vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
/* WELS_ASM_FUNC_END
* void get_i4x4_luma_pred_d_r(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r2, r0, r1 sub r2, r0, r1
vld1.32 {d0[1]}, [r2] vld1.32 {d0[1]}, [r2]
//Load the left column (5 bytes) //Load the left column (5 bytes)
sub r2, #1 sub r2, #1
vld1.8 {d0[3]}, [r2], r1 vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1 vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1 vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2], r1 vld1.8 {d0[0]}, [r2], r1
vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3} vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2} //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3} //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0 vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23} //q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14 vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3 vadd.u16 q1, q2, q3
//right shift 2 bits and rounding //right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2 vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred' //Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0], r1 vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7 vext.8 d0, d0, d0, #7
@@ -396,34 +347,29 @@ loop_0_get_i16x16_luma_pred_plane:
vext.8 d0, d0, d0, #7 vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0] vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_v_l(uint8_t *pred, const int32_t stride); WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes) //Load the top row (8 bytes)
sub r2, r0, r1 sub r2, r0, r1
vld1.32 {d0}, [r2] vld1.32 {d0}, [r2]
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x} vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2 vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x} vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4" //calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1 vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9" //calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2 vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0], r1 vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
@@ -431,121 +377,104 @@ loop_0_get_i16x16_luma_pred_plane:
vext.8 d1, d1, d1, #1 vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0], r1 vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0] vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_v_r(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes) //Load the top row (4 bytes)
sub r2, r0, r1 sub r2, r0, r1
vld1.32 {d0[1]}, [r2] vld1.32 {d0[1]}, [r2]
//Load the left column (4 bytes) //Load the left column (4 bytes)
sub r2, #1 sub r2, #1
vld1.8 {d0[3]}, [r2], r1 vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1 vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1 vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] vld1.8 {d0[0]}, [r2]
vext.8 d1, d0, d0, #7 vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3} vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14 vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3} vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9 //Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2 vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1 vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0], r1 vst1.32 d0[1], [r0], r1
vst1.32 d1[1], [r0], r1 vst1.32 d1[1], [r0], r1
add r2, r0, r1 add r2, r0, r1
vst1.8 d1[3], [r0]! vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]! vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]! vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r2]! vst1.8 d1[2], [r2]!
vst1.16 d1[2], [r2]! vst1.16 d1[2], [r2]!
vst1.8 d1[6], [r2] vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/*
* get_i4x4_luma_pred_h_u(uint8_t *pred, const int32_t stride); WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
* r0 --- pred
* r1 --- stride
* return --- void
*/
//NO TEST
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the left column data //Load the left column data
sub r2, r0, #1 sub r2, r0, #1
mov r3, #3 mov r3, #3
mul r3, r1 mul r3, r1
add r3, r2 add r3, r2
vld1.8 {d0[]}, [r3] vld1.8 {d0[]}, [r3]
vld1.8 {d0[4]}, [r2], r1 vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1 vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3} vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1 vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3} vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2 vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5 //Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1 vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2 vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vzip.8 d2, d1 vzip.8 d2, d1
vst1.32 d1[0], [r0], r1 vst1.32 d1[0], [r0], r1
vext.8 d2, d1, d1, #2 vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0], r1 vst1.32 d2[0], [r0], r1
vst1.32 d1[1], [r0], r1 vst1.32 d1[1], [r0], r1
vst1.32 d0[0], [r0] vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
/* WELS_ASM_FUNC_END
* void get_i4x4_luma_pred_h_d(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the data //Load the data
sub r2, r0, r1 sub r2, r0, r1
sub r2, #1 sub r2, #1
vld1.32 {d0[1]}, [r2], r1 vld1.32 {d0[1]}, [r2], r1
vld1.8 {d0[3]}, [r2], r1 vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1 vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1 vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2} vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7 vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2} vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1} vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2} vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9 //Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2 vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1 vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB //Adjust the data sequence for setting the luma MB
vmov d3, d1 vmov d3, d1
vtrn.8 d0, d1 vtrn.8 d0, d1
@@ -556,17 +485,10 @@ loop_0_get_i16x16_luma_pred_plane:
vst2.16 {d2[2], d3[2]}, [r0], r1 vst2.16 {d2[2], d3[2]}, [r0], r1
vst2.16 {d0[1], d1[1]}, [r0] vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/* WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
* void get_i_chroma_pred_v(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Get the top row (8 byte) //Get the top row (8 byte)
sub r2, r0, r1 sub r2, r0, r1
@@ -580,32 +502,25 @@ loop_0_get_i16x16_luma_pred_plane:
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0] vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/* WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
* void get_i_chroma_pred_h(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte) ////Get the left column (8 byte)
sub r2, r0, #1 sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1 vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1 vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1 vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2], r1 vld1.8 {d3[]}, [r2], r1
vld1.8 {d4[]}, [r2], r1 vld1.8 {d4[]}, [r2], r1
vld1.8 {d5[]}, [r2], r1 vld1.8 {d5[]}, [r2], r1
vld1.8 {d6[]}, [r2], r1 vld1.8 {d6[]}, [r2], r1
vld1.8 {d7[]}, [r2] vld1.8 {d7[]}, [r2]
//Set the chroma MB using left column data //Set the chroma MB using left column data
vst1.8 {d0}, [r0], r1 vst1.8 {d0}, [r0], r1
vst1.8 {d1}, [r0], r1 vst1.8 {d1}, [r0], r1
vst1.8 {d2}, [r0], r1 vst1.8 {d2}, [r0], r1
@@ -613,100 +528,88 @@ loop_0_get_i16x16_luma_pred_plane:
vst1.8 {d4}, [r0], r1 vst1.8 {d4}, [r0], r1
vst1.8 {d5}, [r0], r1 vst1.8 {d5}, [r0], r1
vst1.8 {d6}, [r0], r1 vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r0] vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
/* WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
* void get_i_chroma_pred_dc_both(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the left column data (8 bytes) //Load the left column data (8 bytes)
sub r2, r0, #1 sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1 GET_8BYTE_DATA d0, r2, r1
//Load the top row data (8 bytes) //Load the top row data (8 bytes)
sub r2, r0, r1 sub r2, r0, r1
vldr d1, [r2] vldr d1, [r2]
//Calculate the sum of left column and top row //Calculate the sum of left column and top row
vpaddl.u8 q0, q0 vpaddl.u8 q0, q0
vpaddl.u16 q0, q0 vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2 vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3' vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4' vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line //duplicate the 'mx' to a vector line
vdup.8 d4, d2[0] vdup.8 d4, d2[0]
vdup.8 d5, d1[4] vdup.8 d5, d1[4]
vdup.8 d6, d0[4] vdup.8 d6, d0[4]
vdup.8 d7, d2[4] vdup.8 d7, d2[4]
//Set the chroma MB
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0]
WELS_ASM_FUNC_END
/* //Set the chroma MB
* void get_i_chroma_pred_plane(uint8_t *pred, const int32_t stride); vst2.32 {d4[0],d5[0]}, [r0], r1
* r0 --- pred vst2.32 {d4[0],d5[0]}, [r0], r1
* r1 --- stride vst2.32 {d4[0],d5[0]}, [r0], r1
* return --- void vst2.32 {d4[0],d5[0]}, [r0], r1
*/ vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0]
WELS_ASM_FUNC_END
//Table {{1,2,3,4,1,2,3,4}*17} //Table {{1,2,3,4,1,2,3,4}*17}
CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x28231e19 CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x28231e19
//Table {-3,-2,-1,0,1,2,3,4} //Table {-3,-2,-1,0,1,2,3,4}
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
//stmdb sp!, { r2-r5, lr} //stmdb sp!, { r2-r5, lr}
//Load the top row data //Load the top row data
sub r2, r0, #1 sub r2, r0, #1
sub r2, r1 sub r2, r1
vld1.32 {d1[0]}, [r2] vld1.32 {d1[0]}, [r2]
add r2, #5 add r2, #5
vld1.32 {d0[0]}, [r2] vld1.32 {d0[0]}, [r2]
//Load the left column data //Load the left column data
sub r2, #5 sub r2, #5
vld1.8 {d1[4]}, [r2], r1 vld1.8 {d1[4]}, [r2], r1
vld1.8 {d1[5]}, [r2], r1 vld1.8 {d1[5]}, [r2], r1
vld1.8 {d1[6]}, [r2], r1 vld1.8 {d1[6]}, [r2], r1
vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2} vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r2, r1 add r2, r1
vld1.8 {d0[4]}, [r2], r1 vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1 vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 vld1.8 {d0[6]}, [r2], r1
vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7} vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
//Save T7 to d3 for next step //Save T7 to d3 for next step
vdup.u8 d3, d0[3] vdup.u8 d3, d0[3]
//Save L7 to d4 for next step //Save L7 to d4 for next step
vdup.u8 d4, d0[7] vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2 //Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4 vaddl.u8 q2, d3, d4
vshl.u16 q2, #4 vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17} //Load the table {{1,2,3,4,1,2,3,4}*17}
adr r2, CONST0_GET_I_CHROMA_PRED_PLANE adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r2] vld1.32 {d2}, [r2]
//Calculate the 'b','c', and save to q0 //Calculate the 'b','c', and save to q0
vrev32.8 d1, d1 vrev32.8 d1, d1
vsubl.u8 q0, d0, d1 vsubl.u8 q0, d0, d1
@@ -715,32 +618,32 @@ CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x000400
vpaddl.s16 q0, q0 vpaddl.s16 q0, q0
vpaddl.s32 q0, q0 vpaddl.s32 q0, q0
vrshr.s64 q0, #5 vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3 //Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r2, CONST1_GET_I_CHROMA_PRED_PLANE adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r2] vld1.32 {d6, d7}, [r2]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction //Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0] vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0] vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;" //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3 vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0] vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5 vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB //Set a line of chroma MB
vst1.u32 {d0}, [r0], r1 vst1.u32 {d0}, [r0], r1
//Do the same processing for each line. //Do the same processing for each line.
mov r2, #7 mov r2, #7
loop_0_get_i_chroma_pred_plane: loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1 vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5 vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0], r1 vst1.u32 {d0}, [r0], r1
subs r2, #1 subs r2, #1
bne loop_0_get_i_chroma_pred_plane bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END WELS_ASM_FUNC_END
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@@ -112,7 +112,7 @@ void WelsDecoderI16x16LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredH_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI16x16LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredDc_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI16x16LumaPredDc_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredPlane_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI16x16LumaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredV_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI4x4LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredH_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI4x4LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredDDL_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI4x4LumaPredDDL_neon(uint8_t *pPred, const int32_t kiStride);
@@ -121,11 +121,11 @@ void WelsDecoderI4x4LumaPredVL_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVR_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI4x4LumaPredVR_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredHU_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI4x4LumaPredHU_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredHD_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderI4x4LumaPredHD_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredV_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderIChromaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredH_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderIChromaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredDC_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderIChromaPredDC_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredPlane_neon(uint8_t *pPred, const int32_t kiStride); void WelsDecoderIChromaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
#endif//HAVE_NEON #endif//HAVE_NEON
#if defined(__cplusplus) #if defined(__cplusplus)

View File

@@ -720,13 +720,13 @@ void DeblockingInit (SDeblockingFunc* pFunc, int32_t iCpu) {
#endif #endif
#if defined(HAVE_NEON) #if defined(HAVE_NEON)
if ( iCpu & WELS_CPU_NEON ) if ( iCpu & WELS_CPU_NEON )
{ {
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon; pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon; pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;
pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_neon; pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_neon;
pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_neon; pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_neon;
pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_neon; pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_neon;
pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_neon; pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_neon;
pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_neon; pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_neon;

View File

@@ -1152,8 +1152,8 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
#ifdef HAVE_NEON #ifdef HAVE_NEON
if ( iCpu & WELS_CPU_NEON ) { if ( iCpu & WELS_CPU_NEON ) {
pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon; pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon;
pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon; pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon;
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon; pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
} }
#endif #endif
} }

View File

@@ -662,30 +662,30 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
InitDctClipTable(); InitDctClipTable();
pCtx->pIdctResAddPredFunc = IdctResAddPred_c; pCtx->pIdctResAddPredFunc = IdctResAddPred_c;
#if defined(HAVE_NEON) #if defined(HAVE_NEON)
if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) { if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_neon; pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon; pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon; pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon; pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon; pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon; pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon; pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon; pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon; pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon; pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon; pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon; pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon; pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon;
pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon; pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon;
pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon; pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon;
pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon; pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon;
pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon; pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon;
} }
#endif//HAVE_NEON #endif//HAVE_NEON

View File

@@ -669,8 +669,8 @@ void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t
void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) int32_t iWidth, int32_t iHeight)
{ {
if (iWidth == 16) if (iWidth == 16)
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8) else if (iWidth == 8)
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4) else if (iWidth == 4)
@@ -690,13 +690,13 @@ void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int
int32_t iWidth, int32_t iHeight) int32_t iWidth, int32_t iHeight)
{ {
if (iWidth == 16) if (iWidth == 16)
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8) else if (iWidth == 8)
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4) else if (iWidth == 4)
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
} }
void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight) int32_t iWidth, int32_t iHeight)
{ {
@@ -927,7 +927,7 @@ void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
} }
} }
void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
{ {