Modify code style; remove trailing whitespace.

2014-03-03 15:42:01 +08:00 · 2014-03-03 15:42:01 +08:00 · 7768cd0a98
commit 7768cd0a98
parent b7a25df13f
15 changed files with 1636 additions and 1972 deletions
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -71,7 +71,7 @@
 		4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; };
 		4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; };
 		4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; };
-		4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
+		4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
 		4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
 /* End PBXFileReference section */

--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -84,9 +84,9 @@
 		4CE4464E18BC5EAA0017DF25 /* decoder_context.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_context.h; sourceTree = "<group>"; };
 		4CE4464F18BC5EAA0017DF25 /* decoder_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_core.h; sourceTree = "<group>"; };
 		4CE4465018BC5EAA0017DF25 /* error_code.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = error_code.h; sourceTree = "<group>"; };
-		4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; };
-		4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; };
-		4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; };
+		4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; usesTabs = 1; };
+		4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; usesTabs = 1; };
+		4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
 		4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = manage_dec_ref.h; sourceTree = "<group>"; };
 		4CE4465518BC5EAA0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; };
 		4CE4465618BC5EAA0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
@@ -105,19 +105,19 @@
 		4CE4466318BC5EAA0017DF25 /* vlc_decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vlc_decoder.h; sourceTree = "<group>"; };
 		4CE4466418BC5EAA0017DF25 /* wels_common_basis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_common_basis.h; sourceTree = "<group>"; };
 		4CE4466518BC5EAA0017DF25 /* wels_const.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_const.h; sourceTree = "<group>"; };
-		4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 1; };
-		4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 1; };
-		4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; };
+		4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 0; };
+		4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 0; };
+		4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; tabWidth = 2; };
 		4CE4466A18BC5EAA0017DF25 /* decode_mb_aux.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_mb_aux.cpp; sourceTree = "<group>"; };
-		4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 1; };
-		4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; };
+		4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 0; };
+		4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
 		4CE4466D18BC5EAA0017DF25 /* decoder_core.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_core.cpp; sourceTree = "<group>"; };
 		4CE4466E18BC5EAA0017DF25 /* decoder_data_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_data_tables.cpp; sourceTree = "<group>"; };
 		4CE4466F18BC5EAA0017DF25 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_pic.cpp; sourceTree = "<group>"; };
 		4CE4467018BC5EAA0017DF25 /* fmo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fmo.cpp; sourceTree = "<group>"; };
 		4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; };
 		4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = manage_dec_ref.cpp; sourceTree = "<group>"; };
-		4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 1; wrapsLines = 1; };
+		4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 0; wrapsLines = 1; };
 		4CE4467418BC5EAA0017DF25 /* mem_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mem_align.cpp; sourceTree = "<group>"; };
 		4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memmgr_nal_unit.cpp; sourceTree = "<group>"; };
 		4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; };
--- a/codec/common/cpu.cpp
+++ b/codec/common/cpu.cpp
@@ -218,25 +218,23 @@ void WelsXmmRegEmptyOp(void * pSrc) {
 #if defined(ANDROID_NDK)
 uint32_t WelsCPUFeatureDetectAndroid()
 {
-	uint32_t         uiCPU = 0;
-    AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
-    uint64_t         uiFeatures = 0;
-    
-    cpuFamily = android_getCpuFamily();
-    if (cpuFamily == ANDROID_CPU_FAMILY_ARM)
-	{
-        uiFeatures = android_getCpuFeatures();
-		if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
-		    uiCPU |= WELS_CPU_ARMv7;
-		}
-		if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
-		    uiCPU |= WELS_CPU_VFPv3;
-		}
-		if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
-		    uiCPU |= WELS_CPU_NEON;
-		}
-	}
-    return uiCPU;
+  uint32_t         uiCPU = 0;
+  AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+  uint64_t         uiFeatures = 0;
+  cpuFamily = android_getCpuFamily();
+  if (cpuFamily == ANDROID_CPU_FAMILY_ARM)	{
+    uiFeatures = android_getCpuFeatures();
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
+      uiCPU |= WELS_CPU_ARMv7;
+    }
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
+      uiCPU |= WELS_CPU_VFPv3;
+    }
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
+      uiCPU |= WELS_CPU_NEON;
+    }
+  }
+  return uiCPU;
 }

 #endif
@@ -246,9 +244,8 @@ uint32_t WelsCPUFeatureDetectIOS() //Need to be updated for the new device of AP
 {
    uint32_t       uiCPU = 0;
    struct utsname sSystemInfo;
-    
    uname (&sSystemInfo);
-    
+
    if ((0 != strcmp(sSystemInfo.machine, "iPhone1,1")) && //iPhone 2G
        (0 != strcmp(sSystemInfo.machine, "iPhone1,2")) && //iPhone 3G
        (0 != strcmp(sSystemInfo.machine, "iPod1,1")) &&   //iPod 1G
--- a/codec/common/cpu.h
+++ b/codec/common/cpu.h
@@ -82,12 +82,12 @@ void     WelsXmmRegEmptyOp(void * pSrc);
 #if defined(ANDROID_NDK)
 	uint32_t WelsCPUFeatureDetectAndroid();
 #endif
-	
+
 #if defined(APPLE_IOS)
 	uint32_t WelsCPUFeatureDetectIOS();
 #endif
 #endif
-    
+
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/common/deblocking_common.h
+++ b/codec/common/deblocking_common.h
@@ -37,13 +37,13 @@ void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
 #if defined(HAVE_NEON)
 void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
 void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-    
+
 void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
 void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-    
+
 void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
 void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-    
+
 void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
 void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 #endif
--- a/codec/common/deblocking_neon.S
+++ b/codec/common/deblocking_neon.S
--- a/codec/common/mc_common.h
+++ b/codec/common/mc_common.h
@@ -40,72 +40,44 @@ extern "C" {
 #endif//__cplusplus

 #if defined(HAVE_NEON)
-/*
-void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight);
-void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int32_t iWidth, int32_t iHeight);
-void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int32_t iWidth, int32_t iHeight);
-void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int32_t iWidth, int32_t iHeight);
-    
-void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-    
-void McLuma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
-                        int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
-void McChroma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
-                          int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
- */
 void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    
+
 void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    
+
 void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    
+
 void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-    
+
 void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-    
+
 void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
 void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
 void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-    
+
 void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    
+
 void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    
+
    //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
 void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    
+
    //vertical filter to gain half sample, that is (0, 2) location in quarter sample
 void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-    
+
    //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
 void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
 void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -29,7 +29,7 @@
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */
- 
+
 #ifdef HAVE_NEON
 .text
 #include "arm_arch_common_macro.S"
@@ -37,32 +37,32 @@

 .macro	ROW_TRANSFORM_1_STEP
 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-		vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-		vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-		vshr.s16		$8, $1, #1
-		vshr.s16		$9, $3, #1
-		vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];	
-		vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);		
+    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
+    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
+    vshr.s16		$8, $1, #1
+    vshr.s16		$9, $3, #1
+    vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);
 //	}
 .endm

 .macro	TRANSFORM_4BYTES	// both row & col transform used
 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-		vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-		vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-		vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-		vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
+    vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
 //	}
 .endm

 .macro	COL_TRANSFORM_1_STEP
 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-		vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-		vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-		vshr.s32		$6, $1, #1
-		vshr.s32		$7, $3, #1
-		vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];	
-		vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32		$6, $1, #1
+    vshr.s32		$7, $3, #1
+    vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
 //	}
 .endm

@@ -70,101 +70,101 @@

 .macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-		vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-		vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-		vshr.s16		\arg8, \arg1, #1
-		vshr.s16		\arg9, \arg3, #1
-		vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];	
-		vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);		
+    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
+    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
+    vshr.s16		\arg8, \arg1, #1
+    vshr.s16		\arg9, \arg3, #1
+    vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);
 //	}
 .endm

 .macro	TRANSFORM_4BYTES  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-		vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-		vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-		vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-		vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
+    vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
 //	}
 .endm

 .macro	COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-		vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-		vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-		vshr.s32		\arg6, \arg1, #1
-		vshr.s32		\arg7, \arg3, #1
-		vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];	
-		vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32		\arg6, \arg1, #1
+    vshr.s32		\arg7, \arg3, #1
+    vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
 //	}
 .endm
 #endif
 // r0    int16_t* block,
 // r1    int8_t* non_zero_count,
-  WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
-	
+WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
+
 	vld1.64	{d0-d2}, [r1]
-		
+
 	vceq.s8	q0, q0, #0
 	vceq.s8	d2, d2, #0
 	vmvn	q0, q0
 	vmvn	d2, d2
 	vabs.s8	q0, q0
 	vabs.s8	d2, d2
-	
+
 	vst1.64	{d0-d2}, [r1]
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-//	r0 int16_t * block, 
+//	r0 int16_t * block,
 //	r1	int32_t stride
-  WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t)
+WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t)
 	push		{r2}
 	mov			r2, #16
 // each row 16 elements, 16*sizeof(int16_t)
 //	memset(ptr_dest, 0, 16*sizeof(int16_t));
-//	ptr_dest += stride;	
+//	ptr_dest += stride;
 	lsl			r1, r1, #1	// r1 = 2*r1
 	veor.i16	q0, q0, q0
 	veor.i16	q1, q1, q1
-			
-block_zero_16x16_luma_loop:	
+
+block_zero_16x16_luma_loop:
 	vst1.i16	{q0, q1}, [r0], r1
 	subs		r2,	r2, #2
-	vst1.i16	{q0, q1}, [r0], r1	
+	vst1.i16	{q0, q1}, [r0], r1
 	bne			block_zero_16x16_luma_loop
-	
+
 	pop		{r2}
-  WELS_ASM_FUNC_END
-	
-  WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon// can use for 64*sizeof(int16_t)
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon// can use for 64*sizeof(int16_t)
 	push		{r2}
 	mov			r2, #8
 // each row 8 elements, 8*sizeof(int16_t)
 //	memset(ptr_dest, 0, 8*sizeof(int16_t));
-//	ptr_dest += stride;	
+//	ptr_dest += stride;
 	lsl			r1, r1, #1
 	veor.i16	q0, q0, q0
-		
-block_zero_8x8_chma_loop:	
+
+block_zero_8x8_chma_loop:
 	vst1.i16	{q0}, [r0], r1
 	subs		r2,	r2, #2
-	vst1.i16	{q0}, [r0], r1	
+	vst1.i16	{q0}, [r0], r1
 	bne			block_zero_8x8_chma_loop
-	
+
 	pop		{r2}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


 //	uint8_t *pred, const int32_t stride, int16_t *rs
-  WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
+WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
+
+	vld4.s16		{d0, d1, d2, d3}, [r2]		// cost 3 cycles!

-	vld4.s16		{d0, d1, d2, d3}, [r2]		// cost 3 cycles!	
-	
 	ROW_TRANSFORM_1_STEP		d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
-	
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q4, q5, q6, q7	
-	
+
+	TRANSFORM_4BYTES		q0, q1, q2, q3, q4, q5, q6, q7
+
 	// transform element 32bits
 	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
 	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
@@ -172,9 +172,9 @@ block_zero_8x8_chma_loop:
 	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]

 	COL_TRANSFORM_1_STEP		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q4, q5, q6, q7	
-			
+
+	TRANSFORM_4BYTES		q0, q1, q2, q3, q4, q5, q6, q7
+
 	//after clip_table[MAX_NEG_CROP] into [0, 255]
 	mov			r2, r0
 	vld1.32		{d12[0]},[r0],r1
@@ -186,7 +186,7 @@ block_zero_8x8_chma_loop:
 	vrshrn.s32		d9, q1, #6
 	vrshrn.s32		d10, q2, #6
 	vrshrn.s32		d11, q3, #6
-		
+
 	vmovl.u8		q0,d12
 	vmovl.u8		q1,d14
 	vadd.s16		q0,q4
@@ -199,5 +199,5 @@ block_zero_8x8_chma_loop:
 	vst1.32		{d12[1]},[r2],r1
 	vst1.32		{d14[0]},[r2],r1
 	vst1.32		{d14[1]},[r2]
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END
 #endif
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -29,7 +29,7 @@
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */
- 
+
 #ifdef  HAVE_NEON
 //Global macro
 .text
@@ -61,79 +61,60 @@
 .endm
 #endif

-/*
- * void get_i16x16_luma_pred_v(uint8_t *pred, const int32_t stride)
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
 	//Get the top line data to 'q0'
 	sub  r2, r0, r1
 	vldm r2, {d0, d1}
-    
+
 	mov  r2, r0
 	mov  r3, #4
-	//Set the top line to the each line of MB(16*16) 
+	//Set the top line to the each line of MB(16*16)
 loop_0_get_i16x16_luma_pred_v:
 	vst1.8 {d0,d1}, [r2], r1
 	vst1.8 {d0,d1}, [r2], r1
 	vst1.8 {d0,d1}, [r2], r1
 	vst1.8 {d0,d1}, [r2], r1
 	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_v				
-													
-	WELS_ASM_FUNC_END
+	bne  loop_0_get_i16x16_luma_pred_v
+
+WELS_ASM_FUNC_END


-/*
- * void get_i16x16_luma_pred_h(uint8_t *pred, const int32_t stride)
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
 	sub  r2, r0, #1
 	mov  r3, #4
 loop_0_get_i16x16_luma_pred_h:
 	//Get one byte data from left side
 	vld1.8 {d0[],d1[]}, [r2], r1
-	vld1.8 {d2[],d3[]}, [r2], r1	
-	vld1.8 {d4[],d5[]}, [r2], r1	
+	vld1.8 {d2[],d3[]}, [r2], r1
+	vld1.8 {d4[],d5[]}, [r2], r1
 	vld1.8 {d6[],d7[]}, [r2], r1
-	
+
 	//Set the line of MB using the left side byte data
 	vst1.8 {d0,d1}, [r0], r1
 	vst1.8 {d2,d3}, [r0], r1
 	vst1.8 {d4,d5}, [r0], r1
 	vst1.8 {d6,d7}, [r0], r1
-	
+
 	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_h		
+	bne  loop_0_get_i16x16_luma_pred_h

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i16x16_luma_pred_dc_both(uint8_t *pred, const int32_t stride)
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
-
-  WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Get the left vertical line data
 	sub r2, r0, #1
 	GET_8BYTE_DATA d0, r2, r1
 	GET_8BYTE_DATA d1, r2, r1
-	
+
 	//Get the top horizontal line data
-	sub  r2, r0, r1			
+	sub  r2, r0, r1
 	vldm r2, {d2, d3}
-	
+
 	//Calculate the sum of top horizontal line data and vertical line data
 	vpaddl.u8 q0, q0
 	vpaddl.u8 q1, q1
@@ -141,11 +122,11 @@ loop_0_get_i16x16_luma_pred_h:
 	vadd.u16  d0, d0, d1
 	vpaddl.u16 d0, d0
 	vpaddl.u32 d0, d0
-	
-	//Calculate the mean value 
+
+	//Calculate the mean value
 	vrshr.u16  d0, d0, #5
 	vdup.8     q0, d0[0]
-	
+
 	//Set the mean value to the all of member of MB
 	mov  r2, #4
 loop_0_get_i16x16_luma_pred_dc_both:
@@ -154,28 +135,22 @@ loop_0_get_i16x16_luma_pred_dc_both:
 	vst1.8 {d0,d1}, [r0], r1
 	vst1.8 {d0,d1}, [r0], r1
 	subs  r2, #1
-	bne  loop_0_get_i16x16_luma_pred_dc_both					
-			
-	WELS_ASM_FUNC_END
+	bne  loop_0_get_i16x16_luma_pred_dc_both
+
+WELS_ASM_FUNC_END


-/*
- * void get_i16x16_luma_pred_plane(uint8_t *pred, const int32_t stride)
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */

 //The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
 CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14

-//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}                
+//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
 CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
-                  

-  WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
 	//stmdb sp!, { r2-r5, lr}
-        
+
 	//Load the table {(8,7,6,5,4,3,2,1) * 5}
 	adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
 	vldr    d0, [r2]
@@ -184,51 +159,51 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
 	sub       r2,  r0, r1
 	sub       r3,  r2, #1
 	vld1.8    d1, [r3]
-	
+
 	//Pack the top[8] ~ top[15] to d2
 	add       r3, #9
 	vld1.8    d2, [r3]
-    
+
 	//Save the top[15] to d6 for next step
 	vdup.u8   d6,   d2[7]
-	
+
 	//Get and pack left[-1] ~ left[6] to d4
 	sub       r3,  r2, #1
 	GET_8BYTE_DATA d4, r3, r1
-	
+
 	//Get and pack left[8] ~ left[15] to d3
 	add       r3,  r1
 	GET_8BYTE_DATA d3, r3, r1
-	
+
 	//Save the left[15] to d7 for next step
 	vdup.u8   d7,   d3[7]
-    
+
 	//revert the sequence of d2,d3
 	vrev64.8   q1, q1

 	vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
 	vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}

-        
+
 	vmovl.u8   q0, d0
 	vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
 	vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
-	
+
 	//Calculate the sum of items of q1, q2
 	vpadd.s16  d0, d2, d3
 	vpadd.s16  d1, d4, d5
 	vpaddl.s16 q0, q0
 	vpaddl.s32 q0, q0
-	
+
 	//Get the value of 'b', 'c' and extend to q1, q2.
 	vrshr.s64  q0, #6
 	vdup.s16   q1, d0[0]
 	vdup.s16   q2, d1[0]
-	
+
 	//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
 	adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
 	vld1.32   {d0}, [r2]
-	
+
 	//Get the value of 'a' and save to q3
 	vaddl.u8  q3, d6, d7
 	vshl.u16  q3, #4
@ -237,156 +212,132 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
 	vmovl.s8  q0, d0
 	vmla.s16  q3, q0, q1
 	vmla.s16  q3, q2, d0[0]
-	
+
 	//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
 	vshl.s16  q5, q1, #3
 	vadd.s16  q5, q3
-	
+
 	//right shift 5 bits and rounding
 	vqrshrun.s16 d0, q3, #5
 	vqrshrun.s16 d1, q5, #5
-	
+
 	//Set the line of MB
 	vst1.u32  {d0,d1}, [r0], r1
-	
-	
+
+
 	//Do the same processing for setting other lines
 	mov  r2, #15
-loop_0_get_i16x16_luma_pred_plane:	
+loop_0_get_i16x16_luma_pred_plane:
 	vadd.s16  q3, q2
 	vadd.s16  q5, q2
 	vqrshrun.s16 d0, q3, #5
 	vqrshrun.s16 d1, q5, #5
 	vst1.u32  {d0,d1}, [r0], r1
 	subs  r2, #1
-	bne  loop_0_get_i16x16_luma_pred_plane	
-		
-	WELS_ASM_FUNC_END
+	bne  loop_0_get_i16x16_luma_pred_plane

+WELS_ASM_FUNC_END

-/*
- * void get_i4x4_luma_pred_v(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (4 bytes)
 	sub  r2, r0, r1
 	ldr  r2, [r2]
-	
+
 	//Set the luma MB using top line
 	str  r2, [r0], r1
 	str  r2, [r0], r1
 	str  r2, [r0], r1
 	str  r2, [r0]
-        
-	WELS_ASM_FUNC_END

-/*
- * void get_i4x4_luma_pred_h(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
+WELS_ASM_FUNC_END
+
+
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the left column (4 bytes)
 	sub  r2, r0, #1
 	vld1.8 {d0[]}, [r2], r1
-	vld1.8 {d1[]}, [r2], r1	
-	vld1.8 {d2[]}, [r2], r1	
+	vld1.8 {d1[]}, [r2], r1
+	vld1.8 {d2[]}, [r2], r1
 	vld1.8 {d3[]}, [r2]
-	
+
 	//Set the luma MB using the left side byte
 	vst1.32 {d0[0]}, [r0], r1
 	vst1.32 {d1[0]}, [r0], r1
 	vst1.32 {d2[0]}, [r0], r1
 	vst1.32 {d3[0]}, [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-/*
- * void get_i4x4_luma_pred_d_l(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row data(8 bytes)
 	sub    r2,  r0, r1
 	vld1.32  {d0}, [r2]
-	
+
 	//For "t7 + (t7<<1)"
 	vdup.8   d1,  d0[7]
-	
+
 	//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
 	vext.8   d1,  d0, d1, #1
 	vaddl.u8 q1,  d1, d0
-	
+
 	//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
 	vext.8   q2,  q1, q1, #14
 	vadd.u16 q0,  q1, q2
-	
+
 	//right shift 2 bits and rounding
 	vqrshrn.u16  d0,  q0, #2
-	
+
 	//Save "ddl0, ddl1, ddl2, ddl3"
 	vext.8   d1, d0, d0, #1
 	vst1.32  d1[0], [r0], r1
-	
+
 	//Save "ddl1, ddl2, ddl3, ddl4"
 	vext.8   d1, d0, d0, #2
 	vst1.32  d1[0], [r0], r1
-	
+
 	//Save "ddl2, ddl3, ddl4, ddl5"
 	vext.8   d1, d0, d0, #3
-	vst1.32  d1[0], [r0], r1	
-	
+	vst1.32  d1[0], [r0], r1
+
 	//Save "ddl3, ddl4, ddl5, ddl6"
-	vst1.32  d0[1], [r0]	
-		
-	WELS_ASM_FUNC_END
+	vst1.32  d0[1], [r0]

-/*
- * void get_i4x4_luma_pred_d_r(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
+WELS_ASM_FUNC_END

-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (4 bytes)
 	sub    r2,  r0, r1
 	vld1.32  {d0[1]}, [r2]
-	
+
 	//Load the left column (5 bytes)
 	sub    r2,  #1
 	vld1.8 {d0[3]}, [r2], r1
-	vld1.8 {d0[2]}, [r2], r1	
+	vld1.8 {d0[2]}, [r2], r1
 	vld1.8 {d0[1]}, [r2], r1
-	vld1.8 {d0[0]}, [r2], r1	
+	vld1.8 {d0[0]}, [r2], r1
 	vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
-	
-	
+
+
 	vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
 	                          //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
-	
+
 	//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
 	vaddl.u8 q2, d2, d0
-	
+
 	//q1:{TL0+LT0,LT0+T01,...L12+L23}
 	vext.8   q3, q3, q2, #14
 	vadd.u16 q1, q2, q3
-	
+
 	//right shift 2 bits and rounding
 	vqrshrn.u16 d0, q1, #2
-	
+
 	//Adjust the data sequence for setting luma MB of 'pred'
 	vst1.32   d0[1], [r0], r1
 	vext.8    d0, d0, d0, #7
@ -396,34 +347,29 @@ loop_0_get_i16x16_luma_pred_plane:
 	vext.8    d0, d0, d0, #7
 	vst1.32   d0[1], [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i4x4_luma_pred_v_l(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (8 bytes)
 	sub    r2,  r0, r1
 	vld1.32  {d0}, [r2]

-        
+
 	vext.8   d1,  d0, d0, #1
 	vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
-	
+
 	vext.8   q2,  q1, q1, #2
 	vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
-	
+
 	//calculate the "vl0,vl1,vl2,vl3,vl4"
 	vqrshrn.u16  d0,  q1, #1
-	
+
 	//calculate the "vl5,vl6,vl7,vl8,vl9"
 	vqrshrn.u16  d1,  q2, #2
-	
+
 	//Adjust the data sequence for setting the luma MB
 	vst1.32  d0[0], [r0], r1
 	vst1.32  d1[0], [r0], r1
@ -431,121 +377,104 @@ loop_0_get_i16x16_luma_pred_plane:
 	vext.8   d1,  d1, d1, #1
 	vst1.32  d0[0], [r0], r1
 	vst1.32  d1[0], [r0]
-	
-	WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_END


-/*
- * void get_i4x4_luma_pred_v_r(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */

-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (4 bytes)
 	sub       r2,  r0, r1
 	vld1.32   {d0[1]}, [r2]
-	
+
 	//Load the left column (4 bytes)
 	sub       r2,  #1
-	vld1.8    {d0[3]}, [r2], r1	
+	vld1.8    {d0[3]}, [r2], r1
 	vld1.8    {d0[2]}, [r2], r1
-	vld1.8    {d0[1]}, [r2], r1	
-	vld1.8    {d0[0]}, [r2]	
+	vld1.8    {d0[1]}, [r2], r1
+	vld1.8    {d0[0]}, [r2]
+

-        
 	vext.8    d1, d0, d0, #7
 	vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
-	
+
 	vext.u8   q2, q1, q1, #14
 	vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
-	
+
 	//Calculate the vr0 ~ vr9
 	vqrshrn.u16 d1, q2, #2
 	vqrshrn.u16 d0, q1, #1
-	
+
 	//Adjust the data sequence for setting the luma MB
 	vst1.32  d0[1], [r0], r1
 	vst1.32  d1[1], [r0], r1
 	add    r2, r0, r1
 	vst1.8   d1[3], [r0]!
-	vst1.16  d0[2], [r0]!    
+	vst1.16  d0[2], [r0]!
 	vst1.8   d0[6], [r0]!
 	vst1.8   d1[2], [r2]!
-	vst1.16  d1[2], [r2]!    
+	vst1.16  d1[2], [r2]!
 	vst1.8   d1[6], [r2]
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * get_i4x4_luma_pred_h_u(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- //NO TEST 
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the left column data
 	sub       r2,  r0, #1
 	mov       r3,  #3
 	mul       r3,  r1
 	add       r3,  r2
-	vld1.8    {d0[]},  [r3]	   	   
-	vld1.8    {d0[4]}, [r2], r1	
+	vld1.8    {d0[]},  [r3]
+	vld1.8    {d0[4]}, [r2], r1
 	vld1.8    {d0[5]}, [r2], r1
-	vld1.8    {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}	
+	vld1.8    {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}

 	vext.8    d1, d0, d0, #1
-	vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}	
-	
+	vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+
 	vext.u8   d2, d5, d4, #2
-	vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} 
-	
+	vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+
 	//Calculate the hu0 ~ hu5
 	vqrshrn.u16 d2, q2, #1
 	vqrshrn.u16 d1, q1, #2
-	
+
 	//Adjust the data sequence for setting the luma MB
 	vzip.8   d2, d1
 	vst1.32  d1[0], [r0], r1
-	vext.8   d2, d1, d1, #2	
+	vext.8   d2, d1, d1, #2
 	vst1.32  d2[0], [r0], r1
 	vst1.32  d1[1], [r0], r1
 	vst1.32  d0[0], [r0]
-	
-  WELS_ASM_FUNC_END

-/*
- * void get_i4x4_luma_pred_h_d(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the data
 	sub       r2,  r0, r1
 	sub       r2,  #1
 	vld1.32   {d0[1]}, [r2], r1
-	vld1.8    {d0[3]}, [r2], r1	
+	vld1.8    {d0[3]}, [r2], r1
 	vld1.8    {d0[2]}, [r2], r1
-	vld1.8    {d0[1]}, [r2], r1	
+	vld1.8    {d0[1]}, [r2], r1
 	vld1.8    {d0[0]}, [r2]	    //d0:{L3,L2,L1,L0,LT,T0,T1,T2}


 	vext.8    d1, d0, d0, #7
 	vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
-	
+
 	vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
 	vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
-	
+
 	//Calculate the hd0~hd9
 	vqrshrn.u16 d1, q3, #2
 	vqrshrn.u16 d0, q2, #1
-	
+
 	//Adjust the data sequence for setting the luma MB
 	vmov      d3, d1
 	vtrn.8    d0, d1
@ -556,17 +485,10 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst2.16  {d2[2], d3[2]}, [r0], r1
 	vst2.16  {d0[1], d1[1]}, [r0]

-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i_chroma_pred_v(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Get the top row (8 bytes)
 	sub  r2, r0, r1
@ -580,32 +502,25 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst1.8 {d0}, [r0], r1
 	vst1.8 {d0}, [r0], r1
 	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0]			
-													
-	WELS_ASM_FUNC_END
+	vst1.8 {d0}, [r0]
+
+WELS_ASM_FUNC_END


-/*
- * void get_i_chroma_pred_h(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Get the left column (8 bytes)
 	sub  r2, r0, #1
 	vld1.8 {d0[]}, [r2], r1
-	vld1.8 {d1[]}, [r2], r1	
-	vld1.8 {d2[]}, [r2], r1	
+	vld1.8 {d1[]}, [r2], r1
+	vld1.8 {d2[]}, [r2], r1
 	vld1.8 {d3[]}, [r2], r1
 	vld1.8 {d4[]}, [r2], r1
-	vld1.8 {d5[]}, [r2], r1	
-	vld1.8 {d6[]}, [r2], r1	
+	vld1.8 {d5[]}, [r2], r1
+	vld1.8 {d6[]}, [r2], r1
 	vld1.8 {d7[]}, [r2]
-	 
-	//Set the chroma MB using left column data 
+
+	//Set the chroma MB using left column data
 	vst1.8 {d0}, [r0], r1
 	vst1.8 {d1}, [r0], r1
 	vst1.8 {d2}, [r0], r1
@ -613,100 +528,88 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst1.8 {d4}, [r0], r1
 	vst1.8 {d5}, [r0], r1
 	vst1.8 {d6}, [r0], r1
-	vst1.8 {d7}, [r0]	
-	
-	WELS_ASM_FUNC_END
+	vst1.8 {d7}, [r0]
+
+WELS_ASM_FUNC_END


-/*
- * void get_i_chroma_pred_dc_both(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-    WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the left column data (8 bytes)
    sub r2, r0, #1
-    GET_8BYTE_DATA d0, r2, r1	
-    
+    GET_8BYTE_DATA d0, r2, r1
+
    //Load the top row data (8 bytes)
-    sub  r2, r0, r1			
+    sub  r2, r0, r1
    vldr d1, [r2]
-    
+
    //Calculate the sum of left column and top row
    vpaddl.u8  q0, q0
    vpaddl.u16 q0, q0
    vadd.u32   d2, d0, d1 //'m1' save to d2
-    
-    vrshr.u32  q0, q0, #2 //calculate 'm2','m3' 
-    vrshr.u32  d2, d2, #3 //calculate 'm4' 
-    
+
+    vrshr.u32  q0, q0, #2 //calculate 'm2','m3'
+    vrshr.u32  d2, d2, #3 //calculate 'm4'
+
    //duplicate the 'mx' to a vector line
    vdup.8     d4, d2[0]
    vdup.8     d5, d1[4]
    vdup.8     d6, d0[4]
    vdup.8     d7, d2[4]
-    
-    //Set the chroma MB 
-    vst2.32 {d4[0],d5[0]}, [r0], r1
-    vst2.32 {d4[0],d5[0]}, [r0], r1
-    vst2.32 {d4[0],d5[0]}, [r0], r1	
-    vst2.32 {d4[0],d5[0]}, [r0], r1
-    vst2.32 {d6[0],d7[0]}, [r0], r1
-    vst2.32 {d6[0],d7[0]}, [r0], r1
-    vst2.32 {d6[0],d7[0]}, [r0], r1	
-    vst2.32 {d6[0],d7[0]}, [r0]
-    		
-    WELS_ASM_FUNC_END

-/*
- * void get_i_chroma_pred_plane(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
+    //Set the chroma MB
+    vst2.32 {d4[0],d5[0]}, [r0], r1
+    vst2.32 {d4[0],d5[0]}, [r0], r1
+    vst2.32 {d4[0],d5[0]}, [r0], r1
+    vst2.32 {d4[0],d5[0]}, [r0], r1
+    vst2.32 {d6[0],d7[0]}, [r0], r1
+    vst2.32 {d6[0],d7[0]}, [r0], r1
+    vst2.32 {d6[0],d7[0]}, [r0], r1
+    vst2.32 {d6[0],d7[0]}, [r0]
+
+WELS_ASM_FUNC_END
+
+
 //Table {{1,2,3,4,1,2,3,4}*17}
 CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x28231e19
 //Table {-3,-2,-1,0,1,2,3,4}
 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003

-  WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row data
 	sub  r2, r0, #1
 	sub  r2, r1
-	vld1.32 {d1[0]}, [r2] 
+	vld1.32 {d1[0]}, [r2]
 	add  r2, #5
 	vld1.32 {d0[0]}, [r2]
-	
+
 	//Load the left column data
 	sub  r2, #5
 	vld1.8 {d1[4]}, [r2], r1
-	vld1.8 {d1[5]}, [r2], r1	
+	vld1.8 {d1[5]}, [r2], r1
 	vld1.8 {d1[6]}, [r2], r1
-	vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}	
+	vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
 	add  r2, r1
 	vld1.8 {d0[4]}, [r2], r1
 	vld1.8 {d0[5]}, [r2], r1
 	vld1.8 {d0[6]}, [r2], r1
 	vld1.8 {d0[7]}, [r2]     //d0:{T4,T5,T6,T7,L4,L5,L6,L7}
-	
-	
+
+
 	//Save T7 to d3 for next step
 	vdup.u8   d3,   d0[3]
 	//Save L7 to d4 for next step
 	vdup.u8   d4,   d0[7]
-	
+
 	//Calculate the value of 'a' and save to q2
 	vaddl.u8  q2, d3, d4
 	vshl.u16  q2, #4
-	
+
 	//Load the table {{1,2,3,4,1,2,3,4}*17}
 	adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
 	vld1.32   {d2}, [r2]
-	
+
 	//Calculate the 'b','c', and save to q0
 	vrev32.8  d1, d1
 	vsubl.u8  q0, d0, d1
@ -715,32 +618,32 @@ CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x000400
 	vpaddl.s16 q0, q0
 	vpaddl.s32 q0, q0
 	vrshr.s64  q0, #5
-	
+
 	//Load the table {-3,-2,-1,0,1,2,3,4} to q3
 	adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
 	vld1.32   {d6, d7}, [r2]
-	
+
 	//Duplicate the 'b','c' to q0, q1 for SIMD instruction
 	vdup.s16   q1, d1[0]
 	vdup.s16   q0, d0[0]
-		
+
 	//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
 	vmla.s16   q2, q0, q3
 	vmla.s16   q2, q1, d6[0]
 	vqrshrun.s16 d0, q2, #5
-	
+
 	//Set a line of chroma MB
 	vst1.u32  {d0}, [r0], r1
-	
+
 	//Do the same processing for each line.
 	mov  r2, #7
-loop_0_get_i_chroma_pred_plane:	
+loop_0_get_i_chroma_pred_plane:
 	vadd.s16   q2, q1
 	vqrshrun.s16 d0, q2, #5
 	vst1.u32  {d0}, [r0], r1
 	subs  r2, #1
-	bne  loop_0_get_i_chroma_pred_plane		
-    
-	WELS_ASM_FUNC_END
+	bne  loop_0_get_i_chroma_pred_plane
+
+WELS_ASM_FUNC_END

 #endif
--- a/codec/decoder/core/arm/mc_neon.S
+++ b/codec/decoder/core/arm/mc_neon.S
--- a/codec/decoder/core/inc/get_intra_predictor.h
+++ b/codec/decoder/core/inc/get_intra_predictor.h
@ -112,7 +112,7 @@ void WelsDecoderI16x16LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderI16x16LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderI16x16LumaPredDc_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderI16x16LumaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
-	
+
 void WelsDecoderI4x4LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderI4x4LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderI4x4LumaPredDDL_neon(uint8_t *pPred, const int32_t kiStride);
@ -121,11 +121,11 @@ void WelsDecoderI4x4LumaPredVL_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderI4x4LumaPredVR_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderI4x4LumaPredHU_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderI4x4LumaPredHD_neon(uint8_t *pPred, const int32_t kiStride);
-	
+
 void WelsDecoderIChromaPredV_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderIChromaPredH_neon(uint8_t *pPred, const int32_t kiStride);
 void WelsDecoderIChromaPredDC_neon(uint8_t *pPred, const int32_t kiStride);
-void WelsDecoderIChromaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);	
+void WelsDecoderIChromaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
 #endif//HAVE_NEON

 #if defined(__cplusplus)
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@ -720,13 +720,13 @@ void  DeblockingInit (SDeblockingFunc*  pFunc,  int32_t iCpu) {
 #endif

 #if defined(HAVE_NEON)
-    if ( iCpu & WELS_CPU_NEON )
+  if ( iCpu & WELS_CPU_NEON )
 	{
 		pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_neon;
 		pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_neon;
 		pFunc->pfLumaDeblockingLT4Hor		= DeblockLumaLt4H_neon;
 		pFunc->pfLumaDeblockingEQ4Hor		= DeblockLumaEq4H_neon;
-		
+
 		pFunc->pfChromaDeblockingLT4Ver     = DeblockChromaLt4V_neon;
 		pFunc->pfChromaDeblockingEQ4Ver     = DeblockChromaEq4V_neon;
 		pFunc->pfChromaDeblockingLT4Hor     = DeblockChromaLt4H_neon;
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@ -1152,8 +1152,8 @@ void WelsBlockFuncInit (SBlockFunc*   pFunc,  int32_t iCpu) {
 #ifdef	HAVE_NEON
  if ( iCpu & WELS_CPU_NEON ) {
    pFunc->pWelsBlockZero16x16Func		= WelsResBlockZero16x16_neon;
-    pFunc->pWelsBlockZero8x8Func			= WelsResBlockZero8x8_neon;
-    pFunc->pWelsSetNonZeroCountFunc			= SetNonZeroCount_neon;
+    pFunc->pWelsBlockZero8x8Func		= WelsResBlockZero8x8_neon;
+    pFunc->pWelsSetNonZeroCountFunc		= SetNonZeroCount_neon;
  }
 #endif
 }
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@ -662,30 +662,30 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {

  InitDctClipTable();
  pCtx->pIdctResAddPredFunc	= IdctResAddPred_c;
-    
+
 #if defined(HAVE_NEON)
  if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) {
    pCtx->pIdctResAddPredFunc	= IdctResAddPred_neon;
-    
-	  pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
-	  pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_neon;
-	  pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsDecoderI16x16LumaPredH_neon;
-	  pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsDecoderI16x16LumaPredV_neon;
-    
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_V    ] = WelsDecoderI4x4LumaPredV_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_H    ] = WelsDecoderI4x4LumaPredH_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL  ] = WelsDecoderI4x4LumaPredDDL_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR  ] = WelsDecoderI4x4LumaPredDDR_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL   ] = WelsDecoderI4x4LumaPredVL_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR   ] = WelsDecoderI4x4LumaPredVR_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU   ] = WelsDecoderI4x4LumaPredHU_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD   ] = WelsDecoderI4x4LumaPredHD_neon;
-	
-	  pCtx->pGetIChromaPredFunc[C_PRED_H]       = WelsDecoderIChromaPredH_neon;
-	  pCtx->pGetIChromaPredFunc[C_PRED_V]       = WelsDecoderIChromaPredV_neon;
-	  pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsDecoderIChromaPredPlane_neon;
-	  pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsDecoderIChromaPredDC_neon;
-	}
+
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsDecoderI16x16LumaPredH_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsDecoderI16x16LumaPredV_neon;
+
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_V    ] = WelsDecoderI4x4LumaPredV_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_H    ] = WelsDecoderI4x4LumaPredH_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL  ] = WelsDecoderI4x4LumaPredDDL_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR  ] = WelsDecoderI4x4LumaPredDDR_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL   ] = WelsDecoderI4x4LumaPredVL_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR   ] = WelsDecoderI4x4LumaPredVR_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU   ] = WelsDecoderI4x4LumaPredHU_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD   ] = WelsDecoderI4x4LumaPredHD_neon;
+
+    pCtx->pGetIChromaPredFunc[C_PRED_H]       = WelsDecoderIChromaPredH_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_V]       = WelsDecoderIChromaPredV_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsDecoderIChromaPredPlane_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsDecoderIChromaPredDC_neon;
+  }
 #endif//HAVE_NEON


--- a/codec/decoder/core/src/mc.cpp
+++ b/codec/decoder/core/src/mc.cpp
@ -669,8 +669,8 @@ void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t
 void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
 																				int32_t iWidth, int32_t iHeight)
 {
-		if (iWidth == 16)
-				McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  if (iWidth == 16)
+	   McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 		else if (iWidth == 8)
 				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 		else if (iWidth == 4)
@ -690,13 +690,13 @@ void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int
 																				int32_t iWidth, int32_t iHeight)
 {
 		if (iWidth == 16)
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 		else if (iWidth == 8)
 				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 		else if (iWidth == 4)
 				McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 }
-		
+
 void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
 																							int32_t iWidth, int32_t iHeight)
 {
@ -927,7 +927,7 @@ void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int
 				PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
 		}
 }
-		
+
 void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
 											int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
 {