Clean up code style: normalize indentation, remove trailing whitespace, and drop stale comment blocks.

2014-03-03 15:42:01 +08:00
parent b7a25df13f
commit 7768cd0a98
15 changed files with 1636 additions and 1972 deletions
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -71,7 +71,7 @@
 		4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; };
 		4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; };
 		4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; };
-		4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
+		4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
 		4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
 /* End PBXFileReference section */

--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -84,9 +84,9 @@
 		4CE4464E18BC5EAA0017DF25 /* decoder_context.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_context.h; sourceTree = "<group>"; };
 		4CE4464F18BC5EAA0017DF25 /* decoder_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_core.h; sourceTree = "<group>"; };
 		4CE4465018BC5EAA0017DF25 /* error_code.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = error_code.h; sourceTree = "<group>"; };
-		4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; };
-		4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; };
-		4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; };
+		4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; usesTabs = 1; };
+		4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; usesTabs = 1; };
+		4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
 		4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = manage_dec_ref.h; sourceTree = "<group>"; };
 		4CE4465518BC5EAA0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; };
 		4CE4465618BC5EAA0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
@@ -105,19 +105,19 @@
 		4CE4466318BC5EAA0017DF25 /* vlc_decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vlc_decoder.h; sourceTree = "<group>"; };
 		4CE4466418BC5EAA0017DF25 /* wels_common_basis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_common_basis.h; sourceTree = "<group>"; };
 		4CE4466518BC5EAA0017DF25 /* wels_const.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_const.h; sourceTree = "<group>"; };
-		4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 1; };
-		4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 1; };
-		4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; };
+		4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 0; };
+		4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 0; };
+		4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; tabWidth = 2; };
 		4CE4466A18BC5EAA0017DF25 /* decode_mb_aux.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_mb_aux.cpp; sourceTree = "<group>"; };
-		4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 1; };
-		4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; };
+		4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 0; };
+		4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
 		4CE4466D18BC5EAA0017DF25 /* decoder_core.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_core.cpp; sourceTree = "<group>"; };
 		4CE4466E18BC5EAA0017DF25 /* decoder_data_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_data_tables.cpp; sourceTree = "<group>"; };
 		4CE4466F18BC5EAA0017DF25 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_pic.cpp; sourceTree = "<group>"; };
 		4CE4467018BC5EAA0017DF25 /* fmo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fmo.cpp; sourceTree = "<group>"; };
 		4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; };
 		4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = manage_dec_ref.cpp; sourceTree = "<group>"; };
-		4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 1; wrapsLines = 1; };
+		4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 0; wrapsLines = 1; };
 		4CE4467418BC5EAA0017DF25 /* mem_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mem_align.cpp; sourceTree = "<group>"; };
 		4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memmgr_nal_unit.cpp; sourceTree = "<group>"; };
 		4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; };
--- a/codec/common/cpu.cpp
+++ b/codec/common/cpu.cpp
@@ -218,25 +218,23 @@ void WelsXmmRegEmptyOp(void * pSrc) {
 #if defined(ANDROID_NDK)
 uint32_t WelsCPUFeatureDetectAndroid()
 {
-	uint32_t         uiCPU = 0;
-    AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
-    uint64_t         uiFeatures = 0;
-    
-    cpuFamily = android_getCpuFamily();
-    if (cpuFamily == ANDROID_CPU_FAMILY_ARM)
-	{
-        uiFeatures = android_getCpuFeatures();
-		if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
-		    uiCPU |= WELS_CPU_ARMv7;
-		}
-		if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
-		    uiCPU |= WELS_CPU_VFPv3;
-		}
-		if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
-		    uiCPU |= WELS_CPU_NEON;
-		}
-	}
-    return uiCPU;
+  uint32_t         uiCPU = 0;
+  AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+  uint64_t         uiFeatures = 0;
+  cpuFamily = android_getCpuFamily();
+  if (cpuFamily == ANDROID_CPU_FAMILY_ARM)	{
+    uiFeatures = android_getCpuFeatures();
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
+      uiCPU |= WELS_CPU_ARMv7;
+    }
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
+      uiCPU |= WELS_CPU_VFPv3;
+    }
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
+      uiCPU |= WELS_CPU_NEON;
+    }
+  }
+  return uiCPU;
 }

 #endif
@@ -246,7 +244,6 @@ uint32_t WelsCPUFeatureDetectIOS() //Need to be updated for the new device of AP
 {
    uint32_t       uiCPU = 0;
    struct utsname sSystemInfo;
-    
    uname (&sSystemInfo);

    if ((0 != strcmp(sSystemInfo.machine, "iPhone1,1")) && //iPhone 2G
--- a/codec/common/deblocking_neon.S
+++ b/codec/common/deblocking_neon.S
--- a/codec/common/mc_common.h
+++ b/codec/common/mc_common.h
@@ -40,34 +40,6 @@ extern "C" {
 #endif//__cplusplus

 #if defined(HAVE_NEON)
-/*
-void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight);
-void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int32_t iWidth, int32_t iHeight);
-void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int32_t iWidth, int32_t iHeight);
-void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int32_t iWidth, int32_t iHeight);
-    
-void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
-    
-void McLuma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
-                        int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
-void McChroma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
-                          int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
- */
 void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);

 void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -37,32 +37,32 @@

 .macro	ROW_TRANSFORM_1_STEP
 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-		vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-		vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-		vshr.s16		$8, $1, #1
-		vshr.s16		$9, $3, #1
-		vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];	
-		vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);		
+    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
+    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
+    vshr.s16		$8, $1, #1
+    vshr.s16		$9, $3, #1
+    vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);
 //	}
 .endm

 .macro	TRANSFORM_4BYTES	// both row & col transform used
 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-		vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-		vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-		vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-		vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
+    vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
 //	}
 .endm

 .macro	COL_TRANSFORM_1_STEP
 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-		vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-		vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-		vshr.s32		$6, $1, #1
-		vshr.s32		$7, $3, #1
-		vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];	
-		vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32		$6, $1, #1
+    vshr.s32		$7, $3, #1
+    vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
 //	}
 .endm

@@ -70,38 +70,38 @@

 .macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-		vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-		vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-		vshr.s16		\arg8, \arg1, #1
-		vshr.s16		\arg9, \arg3, #1
-		vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];	
-		vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);		
+    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
+    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
+    vshr.s16		\arg8, \arg1, #1
+    vshr.s16		\arg9, \arg3, #1
+    vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);
 //	}
 .endm

 .macro	TRANSFORM_4BYTES  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
 //	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-		vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-		vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-		vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-		vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
+    vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
 //	}
 .endm

 .macro	COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 //	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-		vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-		vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-		vshr.s32		\arg6, \arg1, #1
-		vshr.s32		\arg7, \arg3, #1
-		vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];	
-		vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32		\arg6, \arg1, #1
+    vshr.s32		\arg7, \arg3, #1
+    vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
 //	}
 .endm
 #endif
 // r0    int16_t* block,
 // r1    int8_t* non_zero_count,
-  WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
+WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon

 	vld1.64	{d0-d2}, [r1]

@@ -113,12 +113,12 @@
 	vabs.s8	d2, d2

 	vst1.64	{d0-d2}, [r1]
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


 //	r0 int16_t * block,
 //	r1	int32_t stride
-  WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t)
+WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t)
 	push		{r2}
 	mov			r2, #16
 // each row 16 elements, 16*sizeof(int16_t)
@@ -135,9 +135,9 @@ block_zero_16x16_luma_loop:
 	bne			block_zero_16x16_luma_loop

 	pop		{r2}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-  WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon// can use for 64*sizeof(int16_t)
+WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon// can use for 64*sizeof(int16_t)
 	push		{r2}
 	mov			r2, #8
 // each row 8 elements, 8*sizeof(int16_t)
@@ -153,11 +153,11 @@ block_zero_8x8_chma_loop:
 	bne			block_zero_8x8_chma_loop

 	pop		{r2}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


 //	uint8_t *pred, const int32_t stride, int16_t *rs
-  WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
+WELS_ASM_FUNC_BEGIN IdctResAddPred_neon

 	vld4.s16		{d0, d1, d2, d3}, [r2]		// cost 3 cycles!

@@ -199,5 +199,5 @@ block_zero_8x8_chma_loop:
 	vst1.32		{d12[1]},[r2],r1
 	vst1.32		{d14[0]},[r2],r1
 	vst1.32		{d14[1]},[r2]
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END
 #endif
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -61,14 +61,8 @@
 .endm
 #endif

-/*
- * void get_i16x16_luma_pred_v(uint8_t *pred, const int32_t stride)
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */

-  WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
 	//Get the top line data to 'q0'
 	sub  r2, r0, r1
 	vldm r2, {d0, d1}
@@ -84,17 +78,11 @@ loop_0_get_i16x16_luma_pred_v:
 	subs  r3, #1
 	bne  loop_0_get_i16x16_luma_pred_v

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i16x16_luma_pred_h(uint8_t *pred, const int32_t stride)
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */

-  WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
 	sub  r2, r0, #1
 	mov  r3, #4
 loop_0_get_i16x16_luma_pred_h:
@@ -113,17 +101,10 @@ loop_0_get_i16x16_luma_pred_h:
 	subs  r3, #1
 	bne  loop_0_get_i16x16_luma_pred_h

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i16x16_luma_pred_dc_both(uint8_t *pred, const int32_t stride)
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
-
-  WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Get the left vertical line data
 	sub r2, r0, #1
@@ -156,15 +137,9 @@ loop_0_get_i16x16_luma_pred_dc_both:
 	subs  r2, #1
 	bne  loop_0_get_i16x16_luma_pred_dc_both

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i16x16_luma_pred_plane(uint8_t *pred, const int32_t stride)
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */

 //The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
 CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
@@ -173,7 +148,7 @@ CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
 CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd


-  WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
 	//stmdb sp!, { r2-r5, lr}

 	//Load the table {(8,7,6,5,4,3,2,1) * 5}
@@ -261,17 +236,9 @@ loop_0_get_i16x16_luma_pred_plane:
 	subs  r2, #1
 	bne  loop_0_get_i16x16_luma_pred_plane

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-
-/*
- * void get_i4x4_luma_pred_v(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (4 bytes)
 	sub  r2, r0, r1
@@ -283,16 +250,11 @@ loop_0_get_i16x16_luma_pred_plane:
 	str  r2, [r0], r1
 	str  r2, [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-/*
- * void get_i4x4_luma_pred_h(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */

-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the left column (4 bytes)
 	sub  r2, r0, #1
@@ -307,15 +269,10 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst1.32 {d2[0]}, [r0], r1
 	vst1.32 {d3[0]}, [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-/*
- * void get_i4x4_luma_pred_d_l(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row data(8 bytes)
 	sub    r2,  r0, r1
@@ -350,16 +307,10 @@ loop_0_get_i16x16_luma_pred_plane:
 	//Save "ddl3, ddl4, ddl5, ddl6"
 	vst1.32  d0[1], [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-/*
- * void get_i4x4_luma_pred_d_r(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */

-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (4 bytes)
 	sub    r2,  r0, r1
@@ -396,16 +347,11 @@ loop_0_get_i16x16_luma_pred_plane:
 	vext.8    d0, d0, d0, #7
 	vst1.32   d0[1], [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i4x4_luma_pred_v_l(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (8 bytes)
 	sub    r2,  r0, r1
@@ -432,17 +378,11 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst1.32  d0[0], [r0], r1
 	vst1.32  d1[0], [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i4x4_luma_pred_v_r(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */

-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (4 bytes)
 	sub       r2,  r0, r1
@@ -476,17 +416,11 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst1.8   d1[2], [r2]!
 	vst1.16  d1[2], [r2]!
 	vst1.8   d1[6], [r2]
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * get_i4x4_luma_pred_h_u(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- //NO TEST 
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the left column data
 	sub       r2,  r0, #1
@@ -516,15 +450,10 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst1.32  d1[1], [r0], r1
 	vst1.32  d0[0], [r0]

-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-/*
- * void get_i4x4_luma_pred_h_d(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
-  WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
+
+WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the data
 	sub       r2,  r0, r1
@@ -556,17 +485,10 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst2.16  {d2[2], d3[2]}, [r0], r1
 	vst2.16  {d0[1], d1[1]}, [r0]

-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i_chroma_pred_v(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Get the top row (8 byte)
 	sub  r2, r0, r1
@@ -582,17 +504,10 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst1.8 {d0}, [r0], r1
 	vst1.8 {d0}, [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i_chroma_pred_h(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-  WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
 	//stmdb sp!, { r2-r5, lr}
 	////Get the left column (8 byte)
 	sub  r2, r0, #1
@@ -615,17 +530,10 @@ loop_0_get_i16x16_luma_pred_plane:
 	vst1.8 {d6}, [r0], r1
 	vst1.8 {d7}, [r0]

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END


-/*
- * void get_i_chroma_pred_dc_both(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
- 
-    WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the left column data (8 bytes)
    sub r2, r0, #1
@@ -659,20 +567,15 @@ loop_0_get_i16x16_luma_pred_plane:
    vst2.32 {d6[0],d7[0]}, [r0], r1
    vst2.32 {d6[0],d7[0]}, [r0]

-    WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END
+

-/*
- * void get_i_chroma_pred_plane(uint8_t *pred, const int32_t stride);
- * r0     --- pred
- * r1     --- stride
- * return --- void
- */
 //Table {{1,2,3,4,1,2,3,4}*17}
 CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x28231e19
 //Table {-3,-2,-1,0,1,2,3,4}
 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003

-  WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row data
 	sub  r2, r0, #1
@@ -741,6 +644,6 @@ loop_0_get_i_chroma_pred_plane:
 	subs  r2, #1
 	bne  loop_0_get_i_chroma_pred_plane

-	WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

 #endif
--- a/codec/decoder/core/arm/mc_neon.S
+++ b/codec/decoder/core/arm/mc_neon.S
@@ -37,172 +37,168 @@
 #ifdef APPLE_IOS
 .macro	AVERAGE_TWO_8BITS
 //	{	// input:dst_d, src_d A and B; working: q13
-		vaddl.u8	q13, $2, $1
-		vrshrn.u16		$0, q13, #1		
+    vaddl.u8	q13, $2, $1
+    vrshrn.u16		$0, q13, #1
 //	}
 .endm

-//h_filter(src) = (src[-2] + src[3]) - 5*(src[-1] + src[2]) + 20*(src[ 0] + src[1]);//
-//clip((h_filter(src)+16)>>5)
 .macro	FILTER_6TAG_8BITS
 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-		vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, $2, $3	//src[0]+src[1]
-		vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-		vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		$6, q12, #5
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
 //	}
 .endm

 .macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0
 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-		vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, $2, $3	//src[0]+src[1]
-		vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-		vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		$6, q12, #5
-		vaddl.u8	q13, $2, $6
-		vrshrn.u16		$6, q13, #1		
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
+    vaddl.u8	q13, $2, $6
+    vrshrn.u16		$6, q13, #1
 //	}
 .endm

 .macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1
 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-		vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, $2, $3	//src[0]+src[1]
-		vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-		vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		$6, q12, #5
-		vaddl.u8	q13, $3, $6
-		vrshrn.u16		$6, q13, #1		
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
+    vaddl.u8	q13, $3, $6
+    vrshrn.u16		$6, q13, #1
 //	}
 .endm

 .macro	FILTER_6TAG_8BITS_TO_16BITS
 //	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-		vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]
-		vaddl.u8	q13, $2, $3	//src[0]+src[1]
-		vmla.u16	$6, q13, $7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-		vmls.s16	$6, q13, $8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+    vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	$6, q13, $7	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	$6, q13, $8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
 //	}
 .endm

 .macro	FILTER_3_IN_16BITS_TO_8BITS
 //	{	// input:a, b, c, dst_d;
-		vsub.s16	$0, $0, $1			//a-b
-		vshr.s16	$0, $0, #2			//(a-b)/4
-		vsub.s16	$0, $0, $1			//(a-b)/4-b
-		vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-		vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-		vadd.s16	$0, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-		vqrshrun.s16	$3, $0, #6		//(+32)>>6
+    vsub.s16	$0, $0, $1			//a-b
+    vshr.s16	$0, $0, #2			//(a-b)/4
+    vsub.s16	$0, $0, $1			//(a-b)/4-b
+    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
+    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	$0, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	$3, $0, #6		//(+32)>>6
 //	}
 .endm

 .macro	UNPACK_2_16BITS_TO_ABC
 //	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-		vext.16	$4, $0, $1, #2		//src[0]
-		vext.16	$3, $0, $1, #3		//src[1]
-		vadd.s16	$4, $3					//c=src[0]+src[1]
+    vext.16	$4, $0, $1, #2		//src[0]
+    vext.16	$3, $0, $1, #3		//src[1]
+    vadd.s16	$4, $3					//c=src[0]+src[1]

-		vext.16	$3, $0, $1, #1		//src[-1]
-		vext.16	$2, $0, $1, #4		//src[2]
-		vadd.s16	$3, $2					//b=src[-1]+src[2]	
+    vext.16	$3, $0, $1, #1		//src[-1]
+    vext.16	$2, $0, $1, #4		//src[2]
+    vadd.s16	$3, $2					//b=src[-1]+src[2]

-		vext.16	$2, $0, $1, #5		//src[3]	
-		vadd.s16	$2, $0					//a=src[-2]+src[3]
+    vext.16	$2, $0, $1, #5		//src[3]
+    vadd.s16	$2, $0					//a=src[-2]+src[3]
 //	}
 .endm
 #else
 .macro	AVERAGE_TWO_8BITS arg0, arg1, arg2
 //	{	// input:dst_d, src_d A and B; working: q13
-		vaddl.u8	q13, \arg2, \arg1
-		vrshrn.u16		\arg0, q13, #1		
+    vaddl.u8	q13, \arg2, \arg1
+    vrshrn.u16		\arg0, q13, #1
 //	}
 .endm

-//h_filter(src) = (src[-2] + src[3]) - 5*(src[-1] + src[2]) + 20*(src[ 0] + src[1]);//
-//clip((h_filter(src)+16)>>5)
 .macro	FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-		vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-		vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-		vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		\arg6, q12, #5
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
 //	}
 .endm

 .macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-		vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-		vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-		vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		\arg6, q12, #5
-		vaddl.u8	q13, \arg2, \arg6
-		vrshrn.u16		\arg6, q13, #1		
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
+    vaddl.u8	q13, \arg2, \arg6
+    vrshrn.u16		\arg6, q13, #1
 //	}
 .endm

 .macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 //	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-		vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-		vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-		vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		\arg6, q12, #5
-		vaddl.u8	q13, \arg3, \arg6
-		vrshrn.u16		\arg6, q13, #1		
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
+    vaddl.u8	q13, \arg3, \arg6
+    vrshrn.u16		\arg6, q13, #1
 //	}
 .endm

 .macro	FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
 //	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-		vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]
-		vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-		vmla.u16	\arg6, q13, \arg7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-		vmls.s16	\arg6, q13, \arg8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+    vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	\arg6, q13, \arg7	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	\arg6, q13, \arg8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
 //	}
 .endm

 .macro	FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
 //	{	// input:a, b, c, dst_d;
-		vsub.s16	\arg0, \arg0, \arg1			//a-b
-		vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-		vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-		vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-		vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-		vadd.s16	\arg0, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-		vqrshrun.s16	\arg3, \arg0, #6		//(+32)>>6
+    vsub.s16	\arg0, \arg0, \arg1			//a-b
+    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
+    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
+    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
+    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	\arg0, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	\arg3, \arg0, #6		//(+32)>>6
 //	}
 .endm

 .macro	UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
 //	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-		vext.16	\arg4, \arg0, \arg1, #2		//src[0]
-		vext.16	\arg3, \arg0, \arg1, #3		//src[1]
-		vadd.s16	\arg4, \arg3					//c=src[0]+src[1]
+    vext.16	\arg4, \arg0, \arg1, #2		//src[0]
+    vext.16	\arg3, \arg0, \arg1, #3		//src[1]
+    vadd.s16	\arg4, \arg3					//c=src[0]+src[1]

-		vext.16	\arg3, \arg0, \arg1, #1		//src[-1]
-		vext.16	\arg2, \arg0, \arg1, #4		//src[2]
-		vadd.s16	\arg3,\arg2					//b=src[-1]+src[2]	
+    vext.16	\arg3, \arg0, \arg1, #1		//src[-1]
+    vext.16	\arg2, \arg0, \arg1, #4		//src[2]
+    vadd.s16	\arg3,\arg2					//b=src[-1]+src[2]

-		vext.16	\arg2, \arg0, \arg1, #5		//src[3]	
-		vadd.s16	\arg2, \arg0					//a=src[-2]+src[3]
+    vext.16	\arg2, \arg0, \arg1, #5		//src[3]
+    vadd.s16	\arg2, \arg0					//a=src[-2]+src[3]
 //	}
 .endm
 #endif
-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -231,10 +227,10 @@ w16_h_mc_luma_loop:
 	cmp		r4, #0
 	bne		w16_h_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -260,10 +256,10 @@ w8_h_mc_luma_loop:
 	cmp		r4, #0
 	bne		w8_h_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
 	push		{r4, r5, r6}
 	ldr			r6, [sp, #12]

@@ -298,10 +294,10 @@ w4_h_mc_luma_loop:
 	bne		w4_h_mc_luma_loop

 	pop		{r4, r5, r6}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -330,10 +326,10 @@ w16_xy_10_mc_luma_loop:
 	cmp		r4, #0
 	bne		w16_xy_10_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -359,10 +355,10 @@ w8_xy_10_mc_luma_loop:
 	cmp		r4, #0
 	bne		w8_xy_10_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
 	push		{r4, r5, r6}
 	ldr			r6, [sp, #12]

@@ -397,10 +393,10 @@ w4_xy_10_mc_luma_loop:
 	bne		w4_xy_10_mc_luma_loop

 	pop		{r4, r5, r6}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -429,10 +425,10 @@ w16_xy_30_mc_luma_loop:
 	cmp		r4, #0
 	bne		w16_xy_30_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -458,10 +454,10 @@ w8_xy_30_mc_luma_loop:
 	cmp		r4, #0
 	bne		w8_xy_30_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
 	push		{r4, r5, r6}
 	ldr			r6, [sp, #12]

@@ -496,10 +492,10 @@ w4_xy_30_mc_luma_loop:
 	bne		w4_xy_30_mc_luma_loop

 	pop		{r4, r5, r6}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -578,9 +574,10 @@ w16_xy_01_luma_loop:
 	cmp		r4, #0
 	bne		w16_xy_01_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-  WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -631,9 +628,10 @@ w8_xy_01_mc_luma_loop:
 	bne		w8_xy_01_mc_luma_loop

 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-  WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
 	push		{r4, r5, r6, r7}
 	sub			r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
@@ -689,9 +687,10 @@ w4_xy_01_mc_luma_loop:
 	bne		w4_xy_01_mc_luma_loop

 	pop		{r4, r5, r6, r7}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-  WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -770,9 +769,10 @@ w16_xy_03_luma_loop:
 	cmp		r4, #0
 	bne		w16_xy_03_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-  WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -823,9 +823,10 @@ w8_xy_03_mc_luma_loop:
 	bne		w8_xy_03_mc_luma_loop

 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-  WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
 	push		{r4, r5, r6, r7}
 	sub			r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
@@ -881,10 +882,10 @@ w4_xy_03_mc_luma_loop:
 	bne		w4_xy_03_mc_luma_loop

 	pop		{r4, r5, r6, r7}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -963,10 +964,10 @@ w16_v_mc_luma_loop:
 	cmp		r4, #0
 	bne		w16_v_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -1017,10 +1018,10 @@ w8_v_mc_luma_loop:
 	bne		w8_v_mc_luma_loop

 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
 	push		{r4, r5, r6, r7}
 	sub			r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
@@ -1076,10 +1077,10 @@ w4_v_mc_luma_loop:
 	bne		w4_v_mc_luma_loop

 	pop		{r4, r5, r6, r7}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -1089,8 +1090,6 @@ w4_v_mc_luma_loop:
 	pld			[r0, r1]

 	vmov.u16	q14, #0x0014			// 20
-//	vmov.u32	d30, #0x00140000
-//	vorr.u32	d30, #0x0005			//0x0014 0005 0014 0005
 	vld1.u8	{d0-d2}, [r0], r1		//use 21(16+5), =src[-2]
 	vld1.u8	{d3-d5}, [r0], r1		//use 21(16+5), =src[-1]

@@ -1115,7 +1114,7 @@ w16_hv_mc_luma_loop:
 	// horizon filtered
 	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-//	vst1.u8	d18, [r2]		//write 8Byte		
+
 	// vertical filtered into q10/q11
 	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 5 avail
 	// horizon filtered
@@ -1133,13 +1132,13 @@ w16_hv_mc_luma_loop:
 	// horizon filtered
 	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
-//	vst1.u8	d18, [r2]!			//write 8Byte		
+
 	// vertical filtered into q10/q11
 	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 5 avail
 	// horizon filtered
 	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
-//	vst1.u8	d20, [r2]!			//write 8Byte
+
 	vst1.u8	{d3, d4}, [r2], r3		//write 16Byte

 	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
@@ -1151,13 +1150,12 @@ w16_hv_mc_luma_loop:
 	// horizon filtered
 	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
-//	vst1.u8	d18, [r2]!			//write 8Byte		
+
 	// vertical filtered into q10/q11
 	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 5 avail
 	// horizon filtered
 	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-//	vst1.u8	d20, [r2]!			//write 8Byte
 	vst1.u8	{d6, d7}, [r2], r3		//write 16Byte

 	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
@@ -1169,13 +1167,11 @@ w16_hv_mc_luma_loop:
 	// horizon filtered
 	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-//	vst1.u8	d18, [r2]!			//write 8Byte		
 	// vertical filtered into q10/q11
 	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 5 avail
 	// horizon filtered
 	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-//	vst1.u8	d20, [r2]!			//write 8Byte	
 	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte

 	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
@@ -1193,10 +1189,10 @@ w16_hv_mc_luma_loop:
 	cmp		r4, #0
 	bne		w16_hv_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
 	push		{r4}
 	ldr			r4, [sp, #4]

@@ -1231,7 +1227,6 @@ w8_hv_mc_luma_loop:
 	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
 	vst1.u8	d12, [r2], r3			//write 8Byte
-//	add			r2, #8

 	vld1.u8	{q0}, [r0], r1		//read 2nd row
 	//the 2nd row
@@ -1243,7 +1238,6 @@ w8_hv_mc_luma_loop:
 	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
 	vst1.u8	d12, [r2], r3		//write 8Byte
-//	add			r2, #8

 	vld1.u8	{q1}, [r0], r1		//read 3rd row
 	//the 3rd row
@@ -1255,7 +1249,6 @@ w8_hv_mc_luma_loop:
 	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
 	vst1.u8	d12, [r2], r3			//write 8Byte
-//	add			r2, #8

 	vld1.u8	{q2}, [r0], r1		//read 4th row
 	//the 4th row
@@ -1267,7 +1260,6 @@ w8_hv_mc_luma_loop:
 	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
 	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
 	vst1.u8	d12, [r2], r3			//write 8Byte
-//	add			r2, #8	

 	//q4~q5, q0~q2, --> q0~q4
 	vswp	q0, q4
@@ -1279,10 +1271,10 @@ w8_hv_mc_luma_loop:
 	cmp		r4, #0
 	bne		w8_hv_mc_luma_loop
 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
 	push		{r4 ,r5, r6}
 	ldr			r6, [sp, #12]

@@ -1331,7 +1323,6 @@ w4_hv_mc_luma_loop:
 	vmov		r4 ,r5, d22
 	str		r4, [r2], r3				//write 4Byte
 	str		r5, [r2], r3				//write 4Byte
-//	add			r2, #32

 	//the 3rd&4th row
 	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[3]
@@ -1356,7 +1347,6 @@ w4_hv_mc_luma_loop:
 	vmov		r4 ,r5, d22
 	str		r4, [r2], r3				//write 4Byte
 	str		r5, [r2], r3				//write 4Byte
-//	add			r2, #32

 	//q4~q6, q0~q1, --> q0~q4
 	vswp	q4, q0
@@ -1370,10 +1360,10 @@ w4_hv_mc_luma_loop:
 	bne		w4_hv_mc_luma_loop

 	pop		{r4, r5, r6}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]
 w16_copy_loop:
@@ -1386,10 +1376,10 @@ w16_copy_loop:
 	bne			w16_copy_loop

 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
 	push		{r4}
 	ldr			r4, [sp, #4]
 w8_copy_loop:
@@ -1402,10 +1392,10 @@ w8_copy_loop:
 	bne			w8_copy_loop

 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
 	push		{r4, r5, r6}
 	ldr			r4, [sp, #12]
 w4_copy_loop:
@@ -1419,10 +1409,10 @@ w4_copy_loop:
 	bne			w4_copy_loop

 	pop		{r4, r5, r6}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* dst, int32_t dst_stride, uint8_t* srcA, uint8_t* srcB, int32_t height
-  WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
 	push		{r4}
 	ldr			r4, [sp, #4]
 w16_pix_avg_loop:
@@ -1457,9 +1447,10 @@ w16_pix_avg_loop:
 	bne			w16_pix_avg_loop

 	pop		{r4}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-  WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
 	push		{r4, r5}
 	ldr			r4, [sp, #8]
 	mov			r5, #16
@@ -1469,8 +1460,6 @@ w8_pix_avg_loop:
 	vld1.u8		{d2}, [r3], r5
 	vld1.u8		{d1}, [r2], r5
 	vld1.u8		{d3}, [r3], r5
-//	add		r2, #32
-//	add		r3, #32

 	AVERAGE_TWO_8BITS		d0, d0, d2
 	AVERAGE_TWO_8BITS		d1, d1, d3
@@ -1481,8 +1470,6 @@ w8_pix_avg_loop:
 	vld1.u8		{d6}, [r3], r5
 	vld1.u8		{d5}, [r2], r5
 	vld1.u8		{d7}, [r3], r5
-//	add		r2, #32
-//	add		r3, #32

 	AVERAGE_TWO_8BITS		d4, d4, d6
 	AVERAGE_TWO_8BITS		d5, d5, d7
@@ -1494,10 +1481,10 @@ w8_pix_avg_loop:
 	bne			w8_pix_avg_loop

 	pop		{r4, r5}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
-  WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
 	push		{r4-r8}
 	ldr			r4, [sp, #20]
 w4_pix_avg_loop:
@@ -1522,15 +1509,9 @@ w4_pix_avg_loop:
 	bne			w4_pix_avg_loop

 	pop		{r4-r8}
-  WELS_ASM_FUNC_END
-
-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t* weights, int32_t height
-//cA = (8 - dx) * (8 - dy);  
-//cB = dx * (8 - dy);
-//cC = (8 - dx) * dy;
-//cD = dx * dy
-  WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
+WELS_ASM_FUNC_END

+WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
 	push		{r4, r5}
 	ldr			r4, [sp, #8]
 	ldr			r5, [sp, #12]
@@ -1572,10 +1553,10 @@ w8_mc_chroma_loop:	// each two pxl row
 	bne			w8_mc_chroma_loop

 	pop		{r4, r5}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END

-//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t* weights, int32_t height
-  WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
+
+WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon

 	push		{r4, r5, r6}
 	ldr			r4, [sp, #12]
@@ -1617,5 +1598,5 @@ w4_mc_chroma_loop:	// each two pxl row
 	bne			w4_mc_chroma_loop

 	pop		{r4, r5, r6}
-  WELS_ASM_FUNC_END
+WELS_ASM_FUNC_END
 #endif
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -720,7 +720,7 @@ void  DeblockingInit (SDeblockingFunc*  pFunc,  int32_t iCpu) {
 #endif

 #if defined(HAVE_NEON)
-    if ( iCpu & WELS_CPU_NEON )
+  if ( iCpu & WELS_CPU_NEON )
 	{
 		pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_neon;
 		pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_neon;
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1152,8 +1152,8 @@ void WelsBlockFuncInit (SBlockFunc*   pFunc,  int32_t iCpu) {
 #ifdef	HAVE_NEON
  if ( iCpu & WELS_CPU_NEON ) {
    pFunc->pWelsBlockZero16x16Func		= WelsResBlockZero16x16_neon;
-    pFunc->pWelsBlockZero8x8Func			= WelsResBlockZero8x8_neon;
-    pFunc->pWelsSetNonZeroCountFunc			= SetNonZeroCount_neon;
+    pFunc->pWelsBlockZero8x8Func		= WelsResBlockZero8x8_neon;
+    pFunc->pWelsSetNonZeroCountFunc		= SetNonZeroCount_neon;
  }
 #endif
 }
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -667,25 +667,25 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
  if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) {
    pCtx->pIdctResAddPredFunc	= IdctResAddPred_neon;

-	  pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
-	  pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_neon;
-	  pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsDecoderI16x16LumaPredH_neon;
-	  pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsDecoderI16x16LumaPredV_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsDecoderI16x16LumaPredH_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsDecoderI16x16LumaPredV_neon;

-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_V    ] = WelsDecoderI4x4LumaPredV_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_H    ] = WelsDecoderI4x4LumaPredH_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL  ] = WelsDecoderI4x4LumaPredDDL_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR  ] = WelsDecoderI4x4LumaPredDDR_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL   ] = WelsDecoderI4x4LumaPredVL_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR   ] = WelsDecoderI4x4LumaPredVR_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU   ] = WelsDecoderI4x4LumaPredHU_neon;
-	  pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD   ] = WelsDecoderI4x4LumaPredHD_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_V    ] = WelsDecoderI4x4LumaPredV_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_H    ] = WelsDecoderI4x4LumaPredH_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL  ] = WelsDecoderI4x4LumaPredDDL_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR  ] = WelsDecoderI4x4LumaPredDDR_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL   ] = WelsDecoderI4x4LumaPredVL_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR   ] = WelsDecoderI4x4LumaPredVR_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU   ] = WelsDecoderI4x4LumaPredHU_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD   ] = WelsDecoderI4x4LumaPredHD_neon;

-	  pCtx->pGetIChromaPredFunc[C_PRED_H]       = WelsDecoderIChromaPredH_neon;
-	  pCtx->pGetIChromaPredFunc[C_PRED_V]       = WelsDecoderIChromaPredV_neon;
-	  pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsDecoderIChromaPredPlane_neon;
-	  pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsDecoderIChromaPredDC_neon;
-	}
+    pCtx->pGetIChromaPredFunc[C_PRED_H]       = WelsDecoderIChromaPredH_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_V]       = WelsDecoderIChromaPredV_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsDecoderIChromaPredPlane_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsDecoderIChromaPredDC_neon;
+  }
 #endif//HAVE_NEON


--- a/codec/decoder/core/src/mc.cpp
+++ b/codec/decoder/core/src/mc.cpp
@@ -669,8 +669,8 @@ void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t
 void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
 																				int32_t iWidth, int32_t iHeight)
 {
-		if (iWidth == 16)
-				McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  if (iWidth == 16)
+	   McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 		else if (iWidth == 8)
 				McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 		else if (iWidth == 4)
@@ -690,7 +690,7 @@ void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int
 																				int32_t iWidth, int32_t iHeight)
 {
 		if (iWidth == 16)
-				McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 		else if (iWidth == 8)
 				McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
 		else if (iWidth == 4)