diff --git a/codec/build/iOS/common/common.xcodeproj/project.pbxproj b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
index e5167f8b..ff32a4e8 100644
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -19,6 +19,8 @@
 		4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473818BC61650017DF25 /* deblocking_common.cpp */; };
 		4CE4475218BC61650017DF25 /* logging.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473C18BC61650017DF25 /* logging.cpp */; };
 		4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */; };
+		4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BC18C085320017DF25 /* deblocking_neon.S */; };
+		4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXContainerItemProxy section */
@@ -69,6 +71,8 @@
 		4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; };
 		4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; };
 		4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; };
+		4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
+		4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -144,6 +148,8 @@
 		4CE4472F18BC61650017DF25 /* common */ = {
 			isa = PBXGroup;
 			children = (
+				4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
+				4CE447BC18C085320017DF25 /* deblocking_neon.S */,
 				4CE4473118BC61650017DF25 /* cpu.cpp */,
 				4CE4473218BC61650017DF25 /* cpu.h */,
 				4CE4473318BC61650017DF25 /* cpu_core.h */,
@@ -247,9 +253,11 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */,
 				4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
 				4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
 				4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
+				4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
 				4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
 				4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,
 			);
diff --git a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
index fa1fa4da..518aaddc 100644
--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -36,9 +36,7 @@
 		4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467A18BC5EAA0017DF25 /* utils.cpp */; };
 		4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */; };
 		4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
-		4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */; };
 		4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
-		4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */; };
 		4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
 		4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; };
 /* End PBXBuildFile section */
@@ -132,9 +130,7 @@
 		4CE4468318BC5EAB0017DF25 /* wels_dec_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_dec_export.def; sourceTree = "<group>"; };
 		4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsCodecTrace.cpp; sourceTree = "<group>"; };
 		4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
-		4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
 		4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
-		4CE447A818BC6BE90017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
 		4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
 		4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@@ -327,9 +323,7 @@
 		4CE447A518BC6BE90017DF25 /* arm */ = {
 			isa = PBXGroup;
 			children = (
-				4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */,
 				4CE447A718BC6BE90017DF25 /* block_add_neon.S */,
-				4CE447A818BC6BE90017DF25 /* deblocking_neon.S */,
 				4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */,
 				4CE447AA18BC6BE90017DF25 /* mc_neon.S */,
 			);
@@ -424,7 +418,6 @@
 				4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */,
 				4CE4469118BC5EAB0017DF25 /* decoder_data_tables.cpp in Sources */,
 				4CE4469718BC5EAB0017DF25 /* mem_align.cpp in Sources */,
-				4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */,
 				4CE4469518BC5EAB0017DF25 /* manage_dec_ref.cpp in Sources */,
 				4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */,
 				4CE4469218BC5EAB0017DF25 /* expand_pic.cpp in Sources */,
@@ -435,7 +428,6 @@
 				4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */,
 				4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */,
 				4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */,
-				4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */,
 				4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */,
 				4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */,
 				4CE4468D18BC5EAB0017DF25 /* decode_mb_aux.cpp in Sources */,
diff --git a/codec/decoder/core/arm/arm_arch_common_macro.S b/codec/common/arm_arch_common_macro.S
similarity index 100%
rename from codec/decoder/core/arm/arm_arch_common_macro.S
rename to codec/common/arm_arch_common_macro.S
diff --git a/codec/common/cpu.cpp b/codec/common/cpu.cpp
index 6cc85f38..03049094 100644
--- a/codec/common/cpu.cpp
+++ b/codec/common/cpu.cpp
@@ -38,7 +38,12 @@
  *************************************************************************************
  */
 #include <string.h>
-
+#ifdef ANDROID_NDK
+#include <cpu-features.h>
+#endif
+#ifdef APPLE_IOS
+#include <sys/utsname.h>
+#endif
 #include "cpu.h"
 #include "cpu_core.h"
 
@@ -209,4 +214,53 @@ void WelsXmmRegEmptyOp(void * pSrc) {
 
 #endif
 
+#if defined(HAVE_NEON)//For supporting both android platform and iOS platform
+#if defined(ANDROID_NDK)
+// Maps the Android NDK cpu-features flags onto WELS_CPU_* bits; returns 0 when the CPU family is not ARM.
+uint32_t WelsCPUFeatureDetectAndroid()
+{
+    // Guard clause: feature flags are only queried for the ARM family.
+    if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM)
+    {
+        return 0;
+    }
+
+    const uint64_t kuiNdkFlags = android_getCpuFeatures();
+    uint32_t       uiDetected  = 0;
+
+    if (kuiNdkFlags & ANDROID_CPU_ARM_FEATURE_ARMv7)
+        uiDetected |= WELS_CPU_ARMv7;
+
+    if (kuiNdkFlags & ANDROID_CPU_ARM_FEATURE_VFPv3)
+        uiDetected |= WELS_CPU_VFPv3;
+
+    if (kuiNdkFlags & ANDROID_CPU_ARM_FEATURE_NEON)
+        uiDetected |= WELS_CPU_NEON;
+    return uiDetected;
+}
+
+#endif
+
+#if defined(APPLE_IOS)
+// Reports CPU features on iOS from the uname() machine id; the models listed below are reported with no features, all others with ARMv7|VFPv3|NEON.
+uint32_t WelsCPUFeatureDetectIOS() //The exclusion list must be revisited when new Apple devices appear
+{
+    struct utsname sSystemInfo;
+
+    // If uname() fails the struct contents are indeterminate: report no features instead of comparing garbage.
+    if (0 != uname (&sSystemInfo))
+        return 0;
+
+    if ((0 == strcmp(sSystemInfo.machine, "iPhone1,1")) || //iPhone 2G
+        (0 == strcmp(sSystemInfo.machine, "iPhone1,2")) || //iPhone 3G
+        (0 == strcmp(sSystemInfo.machine, "iPod1,1")) ||   //iPod 1G
+        (0 == strcmp(sSystemInfo.machine, "iPod2,1")))     //iPod 2G
+    {
+        return 0; // excluded model: no ARMv7/VFPv3/NEON flags are set
+    }
+    return WELS_CPU_ARMv7 | WELS_CPU_VFPv3 | WELS_CPU_NEON;
+}
+#endif
+#endif
+
 
diff --git a/codec/common/cpu.h b/codec/common/cpu.h
index a119833e..fc458ca2 100644
--- a/codec/common/cpu.h
+++ b/codec/common/cpu.h
@@ -78,6 +78,16 @@ void     WelsXmmRegLoad(void * src);
 
 void     WelsXmmRegEmptyOp(void * pSrc);
 
+#if defined(HAVE_NEON) // ARM feature detection, only declared for NEON-enabled builds
+#if defined(ANDROID_NDK)
+	uint32_t WelsCPUFeatureDetectAndroid(); // returns a bitmask of WELS_CPU_ARMv7 / WELS_CPU_VFPv3 / WELS_CPU_NEON
+#endif // ANDROID_NDK
+	
+#if defined(APPLE_IOS)
+	uint32_t WelsCPUFeatureDetectIOS(); // returns a bitmask of WELS_CPU_ARMv7 / WELS_CPU_VFPv3 / WELS_CPU_NEON
+#endif // APPLE_IOS
+#endif // HAVE_NEON
+    
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
diff --git a/codec/common/cpu_core.h b/codec/common/cpu_core.h
index 27fa5245..babcab8a 100644
--- a/codec/common/cpu_core.h
+++ b/codec/common/cpu_core.h
@@ -73,6 +73,11 @@
 #define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
 #define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
 
+/* ARM CPU feature flags, set by the Android and iOS detection routines */
+#define WELS_CPU_ARMv7      0x000001    /* ARMv7 */
+#define WELS_CPU_VFPv3      0x000002    /* VFPv3 */
+#define WELS_CPU_NEON       0x000004    /* NEON */
+
 /*
  *	Interfaces for CPU core feature detection as below
  */
diff --git a/codec/decoder/core/arm/deblocking_neon.S b/codec/common/deblocking_neon.S
similarity index 77%
rename from codec/decoder/core/arm/deblocking_neon.S
rename to codec/common/deblocking_neon.S
index 276f799e..cdb225ab 100755
--- a/codec/decoder/core/arm/deblocking_neon.S
+++ b/codec/common/deblocking_neon.S
@@ -981,361 +981,21 @@
 
 //eq4_end:
   WELS_ASM_FUNC_END
-  
-#ifdef APPLE_IOS
-//in: $0(const) $1 $2; out:$3 $4
-//used register: r6, r7, q0, q1
-.macro BS_NZC_CHECK 
-    //vld1.8   {d0,d1}, [$0] 
-    vld1.8   {d0,d1}, [$0, :64] 
-    /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_nzc_check_jump0
-	
-    sub      r6, $0, $2, lsl #4
-	sub      r6, $2, lsl #3
-    add      r6, #12
-    vld1.32  d3[1], [r6]
-	
-bs_nzc_check_jump0:	
-    vext.8   q1, q1, q0, #12
-	vadd.u8  $3, q0, q1
-
-    
-    /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_nzc_check_jump1
-	
-    sub      r6, $0, #21
-	add      r7, r6, #4 
-    vld1.8   d3[4], [r6]
-	add      r6, r7, #4
-    vld1.8   d3[5], [r7]
-	add      r7, r6, #4
-    vld1.8   d3[6], [r6]
-    vld1.8   d3[7], [r7]
-	
-bs_nzc_check_jump1:
-	vzip.8   d0, d1	
-	vzip.8   d0, d1
-    vext.8   q1, q1, q0, #12
-	vadd.u8  $4, q0, q1
-
-.endm
 
 
-//in: $0(const) $1 $2; out:$3 $4
-//used register: r6, r7, q0, q1
-.macro BS_REF_INDEX_CHECK 
-    //vld1.8   {d0,d1}, [$0] 
-	vld1.8   {d0,d1}, [$0, :128] 
-    /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_ref_index_check_jump0
+// r0    int8_t* non_zero_count: 24 bytes, each rewritten in place to 1 if non-zero, else 0
+  WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
 	
-    sub      r6, $0, $2, lsl #4
-    add      r6, #12
-    vld1.32  d3[1], [r6]
-	
-bs_ref_index_check_jump0:
-    vext.8   q1, q1, q0, #12
-    vabd.u8  $3, q0, q1
-
-    
-    /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_ref_index_check_jump1
-	
-    sub      r6, $0, #13
-	add      r7, r6, #4 
-    vld1.8   d3[4], [r6]
-	add      r6, r7, #4
-    vld1.8   d3[5], [r7]
-	add      r7, r6, #4
-    vld1.8   d3[6], [r6]
-    vld1.8   d3[7], [r7]
-	
-bs_ref_index_check_jump1:
-	vzip.8   d0, d1
-	vzip.8   d0, d1
-    vext.8   q1, q1, q0, #12
-	vabd.u8  $4, q0, q1
-.endmacro
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
-    mov       r6, #4
-    vabd.s16  q5, $0, $1
-    vabd.s16  q6, $1, $2
-	vdup.s16  $0, r6
-    vabd.s16  q7, $2, $3	
-    vabd.s16  q8, $3, $4	    
-    
-    vcge.s16  q5, $0
-    vcge.s16  q6, $0
-    vcge.s16  q7, $0
-    vcge.s16  q8, $0 
-	
-	vpadd.i16 d10, d10, d11
-    vpadd.i16 d11, d12, d13
-    vpadd.i16 d12, d14, d15
-    vpadd.i16 d13, d16, d17  
-   
-    vaddhn.i16  $5, q5, q5
-    vaddhn.i16  $6, q6, q6
-.endmacro
-
-//in: $0(const) $1 $2; out:$3 $4 $5 $6
-//used register: r6, r7, q0, q1, q2, q3, q4
-.macro BS_MV_CHECK 
-    //vldm   $0, {q0,q1,q2,q3}
-    vld1.32  {q0,q1}, [$0, :128]
-	add      r6, $0, #32
-	vld1.32  {q2,q3}, [r6, :128]
-
-    /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_mv_check_jump0
+	vld1.64	{d0-d2}, [r0]
 		
-    sub      r6, $0, $2, lsl #6
-    add      r6, #48
-    vld1.8   {d8, d9}, [r6]
+	vceq.s8	q0, q0, #0
+	vceq.s8	d2, d2, #0
+	vmvn	q0, q0
+	vmvn	d2, d2
+	vabs.s8	q0, q0
+	vabs.s8	d2, d2
 	
-bs_mv_check_jump0:
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, $3, $4
-    
-    /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_mv_check_jump1
-	
-    sub      r6, $0, #52
-    //mov      r7, #16
-    add      r7, r6, #16
-	vld1.32   d8[0], [r6]
-	add      r6, r7, #16
-    vld1.32   d8[1], [r7]
-	add      r7, r6, #16
-    vld1.32   d9[0], [r6]
-    vld1.32   d9[1], [r7]
-	
-bs_mv_check_jump1:
-	vzip.32   q0, q2
-	vzip.32   q1, q3
-	vzip.32   q0, q1
-    vzip.32   q2, q3
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, $5, $6
-.endmacro
-#else
-//in: $0(const) $1 $2; out:$3 $4
-//used register: r6, r7, q0, q1
-.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4 
-    //vld1.8   {d0,d1}, [\arg0] 
-    vld1.8   {d0,d1}, [\arg0, :64] 
-    /* Arrenge the input data --- TOP */
-	ands     r6, \arg1, #2
-	beq      bs_nzc_check_jump0
-	
-    sub      r6, \arg0, \arg2, lsl #4
-	sub      r6, \arg2, lsl #3
-    add      r6, #12
-    vld1.32  d3[1], [r6]
-	
-bs_nzc_check_jump0:	
-    vext.8   q1, q1, q0, #12
-	vadd.u8  \arg3, q0, q1
+	vst1.64	{d0-d2}, [r0]
+  WELS_ASM_FUNC_END
 
-    
-    /* Arrenge the input data --- LEFT */
-	ands     r6, \arg1, #1
-	beq      bs_nzc_check_jump1
-	
-    sub      r6, \arg0, #21
-	add      r7, r6, #4 
-    vld1.8   d3[4], [r6]
-	add      r6, r7, #4
-    vld1.8   d3[5], [r7]
-	add      r7, r6, #4
-    vld1.8   d3[6], [r6]
-    vld1.8   d3[7], [r7]
-	
-bs_nzc_check_jump1:
-	vzip.8   d0, d1	
-	vzip.8   d0, d1
-    vext.8   q1, q1, q0, #12
-	vadd.u8  \arg4, q0, q1
-
-.endm
-
-
-//in: \arg0(const) \arg1 \arg2; out:\arg3 \arg4
-//used register: r6, r7, q0, q1
-.macro BS_REF_INDEX_CHECK arg0, arg1, arg2, arg3, arg4  
-    //vld1.8   {d0,d1}, [\arg0] 
-	vld1.8   {d0,d1}, [\arg0, :128] 
-    /* Arrenge the input data --- TOP */
-	ands     r6, \arg1, #2
-	beq      bs_ref_index_check_jump0
-	
-    sub      r6, \arg0, \arg2, lsl #4
-    add      r6, #12
-    vld1.32  d3[1], [r6]
-	
-bs_ref_index_check_jump0:
-    vext.8   q1, q1, q0, #12
-    vabd.u8  \arg3, q0, q1
-
-    
-    /* Arrenge the input data --- LEFT */
-	ands     r6, \arg1, #1
-	beq      bs_ref_index_check_jump1
-	
-    sub      r6, \arg0, #13
-	add      r7, r6, #4 
-    vld1.8   d3[4], [r6]
-	add      r6, r7, #4
-    vld1.8   d3[5], [r7]
-	add      r7, r6, #4
-    vld1.8   d3[6], [r6]
-    vld1.8   d3[7], [r7]
-	
-bs_ref_index_check_jump1:
-	vzip.8   d0, d1
-	vzip.8   d0, d1
-    vext.8   q1, q1, q0, #12
-	vabd.u8  \arg4, q0, q1
-.endm
-
-//in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5, \arg6
-.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6  
-
-    mov       r6, #4
-    vabd.s16  q5, \arg0, \arg1
-    vabd.s16  q6, \arg1, \arg2
-	vdup.s16  \arg0, r6
-    vabd.s16  q7, \arg2, \arg3	
-    vabd.s16  q8, \arg3, \arg4	    
-    
-    vcge.s16  q5, \arg0
-    vcge.s16  q6, \arg0
-    vcge.s16  q7, \arg0
-    vcge.s16  q8, \arg0 
-	
-	vpadd.i16 d10, d10, d11
-    vpadd.i16 d11, d12, d13
-    vpadd.i16 d12, d14, d15
-    vpadd.i16 d13, d16, d17  
-   
-    vaddhn.i16  \arg5, q5, q5
-    vaddhn.i16  \arg6, q6, q6
-.endm
-
-//in: \arg0(const) \arg1 \arg2; out:\arg3 \arg4 \arg5 \arg6
-//used register: r6, r7, q0, q1, q2, q3, q4
-.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6 
-    //vldm   \arg0, {q0,q1,q2,q3}
-    vld1.32  {q0,q1}, [\arg0, :128]
-	add      r6, \arg0, #32
-	vld1.32  {q2,q3}, [r6, :128]
-
-    /* Arrenge the input data --- TOP */
-	ands     r6, \arg1, #2
-	beq      bs_mv_check_jump0
-		
-    sub      r6, \arg0, \arg2, lsl #6
-    add      r6, #48
-    vld1.8   {d8, d9}, [r6]
-	
-bs_mv_check_jump0:
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg3, \arg4
-    
-    /* Arrenge the input data --- LEFT */
-	ands     r6, \arg1, #1
-	beq      bs_mv_check_jump1
-	
-    sub      r6, \arg0, #52
-    //mov      r7, #16
-    add      r7, r6, #16
-	vld1.32   d8[0], [r6]
-	add      r6, r7, #16
-    vld1.32   d8[1], [r7]
-	add      r7, r6, #16
-    vld1.32   d9[0], [r6]
-    vld1.32   d9[1], [r7]
-	
-bs_mv_check_jump1:
-	vzip.32   q0, q2
-	vzip.32   q1, q3
-	vzip.32   q0, q1
-    vzip.32   q2, q3
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg5, \arg6
-.endm
-#endif
-/*
- * void	deblocking_BS_calc_neon(int8_t  *pNzc, 
- *								int8_t  *pRef_index, 
- *								int16_t *pMv[], 
- *                              int32_t boundry_flag,
- *								int32_t mb_width,
- *								uint8_t *bS);
- *
- * r0 = cur_layer->nzc[cur_mb_xy]
- * r1 = cur_layer->ref_index[0][cur_mb_xy]
- * r2 = cur_layer->mv[0][cur_mb_xy]
- * r3 = boundry_flag (LEFT_FLAG/TOP_FLAG)
- * r4 = cur_layer->mb_width
- * r5 = BS[8][4] save all of the BS value for whole MB(16*16)
- */
- 
-	WELS_ASM_FUNC_BEGIN deblocking_BS_calc_neon
-	
-	stmdb sp!, {r4-r7}
-	
-	ldr  r4, [sp, #16]  //Save mb_width to r4
-	ldr  r5, [sp, #20]	//Save BS to r5
-	
-	/* Checking the nzc status */
-	BS_NZC_CHECK r0, r3, r4, q14, q15 //q14,q15 save the nzc status
-        
-	/* Checking the nzc_rs status */
-	//BS_NZC_CHECK r1, r4, q12, q13 //q12,q13 save the mzc_rs status
-	
-	/* For checking bS[I] = 2 */
-	mov      r6, #2
-	//vqadd.u8 q14, q12
-	//vqadd.u8 q15, q13
-	vcgt.s8  q14, q14, #0
-	vdup.u8  q0, r6
-	vcgt.s8  q15, q15, #0
-	
-	vand.u8  q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
-	vand.u8  q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
-	
-	
-	/* Checking the ref_index status*/
-	BS_REF_INDEX_CHECK r1, r3, r4, q12, q13 //q12,q13 save the ref_index status
-	
-	vcgt.s8  q12, q12, #0
-	vcgt.s8  q13, q13, #0
-		
-	/* Checking the mv status*/
-	BS_MV_CHECK r2, r3, r4, d20, d21, d22, d23//q10, q11 save the mv status
-
-	/* For checking bS[I] = 1 */
-	mov      r6, #1
-	vqadd.u8 q12, q10
-	vdup.u8  q0, r6
-	vqadd.u8 q13, q11
-
-	vand.u8  q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
-	vand.u8  q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
-	
-	
-	/* Check bS[I] is '1' or '2' */
-	vmax.u8 q1, q12, q14
-	vmax.u8 q0, q13, q15
-	
-	//vstm r5, {q0, q1}
-    vst1.32 {q0, q1}, [r5]
-	ldmia sp!, {r4-r7}
-    WELS_ASM_FUNC_END
-/*====== deblocking_BS_calc_neon End ======*/
 #endif
diff --git a/codec/decoder/core/arm/block_add_neon.S b/codec/decoder/core/arm/block_add_neon.S
index 5327ae5e..94a4713b 100755
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -34,29 +34,6 @@
 .text
 #include "arm_arch_common_macro.S"
 #ifdef APPLE_IOS
-.macro	ORR_32BYTES_TO_8BYTES
-//	{	//	input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
-		vorr.s16	$0, $1
-		vorr.s16	$2, $3		
-		vorr.s16	$8, $4, $5
-		vorr.s16	$9, $6, $7
-//	}
-.endm
-
-.macro	ADD_PRED_1BYTE_TO_RESID_2BYTES
-//	{	//	input: q0~q3, d0~d3, output: d0~d3;
-
-		vaddw.u8		$0, $4
-		vaddw.u8		$1, $5
-		vaddw.u8		$2, $6
-		vaddw.u8		$3, $7
-		
-		vqmovun.s16	$4, $0			//saturation
-		vqmovun.s16	$6, $2	
-		vqmovun.s16	$5, $1
-		vqmovun.s16	$7, $3		
-//	}
-.endm
 
 .macro	ROW_TRANSFORM_1_STEP
 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
@@ -89,39 +66,7 @@
 //	}
 .endm
 
-.macro	ADD_AND_CLIP_RS
-//	{	//	input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;	
-		vrshrn.s32		$5, $0, #6
-		vrshrn.s32		$6, $1, #6
-		vqadd.s16		$7, $4
-		vmin.s16		$7, $7, $2
-		vmax.s16		$7, $7, $3
-//	}
-.endm
 #else
-.macro	ORR_32BYTES_TO_8BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
-		vorr.s16	\arg0, \arg1
-		vorr.s16	\arg2, \arg3		
-		vorr.s16	\arg8, \arg4, \arg5
-		vorr.s16	\arg9, \arg6, \arg7
-//	}
-.endm
-
-.macro	ADD_PRED_1BYTE_TO_RESID_2BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: q0~q3, d0~d3, output: d0~d3;
-
-		vaddw.u8		\arg0, \arg4
-		vaddw.u8		\arg1, \arg5
-		vaddw.u8		\arg2, \arg6
-		vaddw.u8		\arg3, \arg7
-		
-		vqmovun.s16	\arg4, \arg0			//saturation
-		vqmovun.s16	\arg6, \arg2	
-		vqmovun.s16	\arg5, \arg1
-		vqmovun.s16	\arg7, \arg3		
-//	}
-.endm
 
 .macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 //	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
@@ -153,16 +98,6 @@
 		vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
 //	}
 .endm
-
-.macro	ADD_AND_CLIP_RS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;	
-		vrshrn.s32		\arg5, \arg0, #6
-		vrshrn.s32		\arg6, \arg1, #6
-		vqadd.s16		\arg7, \arg4
-		vmin.s16		\arg7, \arg7, \arg2
-		vmax.s16		\arg7, \arg7, \arg3
-//	}
-.endm
 #endif
 // r0    int16_t* block,
 // r1    int8_t* non_zero_count,
@@ -180,157 +115,6 @@
 	vst1.64	{d0-d2}, [r1]
   WELS_ASM_FUNC_END
 
-// r0    int16_t* block,
-// r1    int8_t* non_zero_count,
-  WELS_ASM_FUNC_BEGIN svc_non_zero_count_neon
-	push		{r2-r4}
-	mov			r4, #3
-	mov			r3, #64
-	add			r2, r0, #32
-	pld			[r0, #512]
-non_zero_count_two_8x8_loop:
-
-	vld1.64	{q0, q1}, [r0,:128], r3
-	vld1.64	{q2, q3}, [r2,:128], r3
-	vld1.64	{q4, q5}, [r0,:128], r3
-	vld1.64	{q6, q7}, [r2,:128], r3
-	vld1.64	{q8, q9}, [r0,:128], r3
-	vld1.64	{q10, q11}, [r2,:128], r3//load #0 8x8 block resi data,	
-	vld1.64	{q12, q13}, [r0,:128], r3
-	vld1.64	{q14, q15}, [r2,:128], r3//load #1 8x8 block resi data, 
-	pld			[r0, #512]
-	
-	ORR_32BYTES_TO_8BYTES	q0, q1, q2, q3, d0, d1, d4, d5, d2, d3	// output q1
-//	vceq.i16	q1, q1, #0	
-	
-	ORR_32BYTES_TO_8BYTES	q8, q9,q10,q11,d16,d17,d20,d21,d4,d5	// output q2
-//	vceq.i16	q2, q2, #0	
-	
-	ORR_32BYTES_TO_8BYTES	 q4, q5, q6, q7, d8, d9, d12, d13, d10, d11	// output q5
-//	vceq.i16	q5, q5, #0	
-
-	ORR_32BYTES_TO_8BYTES	q12,q13,q14,q15,d24,d25, d28, d29, d12, d13	// output q6
-//	vceq.i16	q6, q6, #0	
-
-	vqmovn.u64	d0, q1		// 8bytes-->4bytes
-	vqmovn.u64	d8, q5	
-	vqmovn.u64	d1, q2					
-	vqmovn.u64	d9, q6
-		
-	vqmovn.u32	d2, q0		// 4bytes-->2bytes
-	vqmovn.u32	d3, q4
-
-	vceq.i16	q0, q1, #0	
-	vmvn    	q0, q0
-	vabs.s16	q2, q0
-	vmovn.u16	d6, q2		// 2bytes-->1bytes
-	vst1.u8	{d6}, [r1]!
-		
-//	pld			[r0]
-	subs		r4,	r4, #1
-	bne			non_zero_count_two_8x8_loop
-
-	pop		{r2-r4}
-  WELS_ASM_FUNC_END
-
-// r0    int16_t* block,
-// r1    int8_t* non_zero_count,
-  WELS_ASM_FUNC_BEGIN svc_rs_non_zero_count_neon
-
-	vld1.i16	{q0, q1}, [r0]!		// block is unaligned!!!
-	vld1.i16	{q2, q3}, [r0]!
-	vld1.i16	{q4, q5}, [r0]!
-	vld1.i16	{q6, q7}, [r0]!
-	
-	vld1.i16	{q8, q9}, [r0]!
-	vld1.i16	{q10, q11}, [r0]!
-	vld1.i16	{q12, q13}, [r0]!
-	vld1.i16	{q14, q15}, [r0]!
-	
-	ORR_32BYTES_TO_8BYTES	q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
-	vorr.s16	q0, q4
-	vorr.s16	q1, q5			// output d0~d3	
-	ORR_32BYTES_TO_8BYTES	q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
-	vorr.s16	q6, q8, q12
-	vorr.s16	q7, q9, q13	// output d12~d15
-	
-	vqmovn.u64	d4, q0		// 8bytes-->4bytes
-	vqmovn.u64	d6, q6	
-	vqmovn.u64	d5, q1
-	vqmovn.u64	d7, q7
-		
-	vqmovn.u32	d8, q2		// 4bytes-->2bytes
-	vqmovn.u32	d9, q3
-
-	vceq.i16	q5, q4, #0	
-	vmvn    	q5, q5
-	vabs.s16	q5, q5
-	vmovn.u16	d10, q5	// 2bytes-->1bytes
-	vst1.u8	{d10}, [r1]!			
-
-	vld1.i16	{q0, q1}, [r0]!
-	vld1.i16	{q2, q3}, [r0]!
-	vld1.i16	{q4, q5}, [r0]!
-	vld1.i16	{q6, q7}, [r0]!
-	
-	vld1.i16	{q8, q9}, [r0]!
-	vld1.i16	{q10, q11}, [r0]!
-	vld1.i16	{q12, q13}, [r0]!
-	vld1.i16	{q14, q15}, [r0]!
-	
-	ORR_32BYTES_TO_8BYTES	q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
-	vorr.s16	q0, q4
-	vorr.s16	q1, q5			// output d0~d3	
-	ORR_32BYTES_TO_8BYTES	q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
-	vorr.s16	q6, q8, q12
-	vorr.s16	q7, q9, q13	// output d12~d15
-	
-	vqmovn.u64	d4, q0		// 8bytes-->4bytes
-	vqmovn.u64	d6, q6	
-	vqmovn.u64	d5, q1
-	vqmovn.u64	d7, q7
-		
-	vqmovn.u32	d8, q2		// 4bytes-->2bytes
-	vqmovn.u32	d9, q3
-
-	vceq.i16	q5, q4, #0	
-	vmvn    	q5, q5
-	vabs.s16	q5, q5
-	vmovn.u16	d10, q5	// 2bytes-->1bytes
-	vst1.u8	{d10}, [r1]!
-	
-//	Chroma
-	vld1.i16	{q0, q1}, [r0]!
-	vld1.i16	{q2, q3}, [r0]!
-	vld1.i16	{q4, q5}, [r0]!
-	vld1.i16	{q6, q7}, [r0]!	//load Cb block,
-	
-	vld1.i16	{q8, q9}, [r0]!
-	vld1.i16	{q10, q11}, [r0]!		
-	vld1.i16	{q12, q13}, [r0]!
-	vld1.i16	{q14, q15}, [r0]!	//load Cr block, 
-
-	ORR_32BYTES_TO_8BYTES	q0, q1, q2, q3, q4, q5, q6, q7, q4, q6
-	vorr.s16	q0, q2
-	vorr.s16	q1, q4, q6			// output d0~d3
-	ORR_32BYTES_TO_8BYTES	q8, q9, q10, q11, q12, q13, q14, q15, q12, q14
-	vorr.s16	q2, q8, q10
-	vorr.s16	q3, q12, q14		// output d4~d7			
-		
-	vqmovn.u64	d8, q0		// 8bytes-->4bytes
-	vqmovn.u64	d10, q2	
-	vqmovn.u64	d9, q1
-	vqmovn.u64	d11, q3
-		
-	vqmovn.u32	d12, q4		// 4bytes-->2bytes
-	vqmovn.u32	d13, q5
-
-	vceq.i16	q7, q6, #0	
-	vmvn    	q7, q7	
-	vabs.s16	q7, q7
-	vmovn.u16	d10, q7	// 2bytes-->1bytes
-	vst1.u8	{d10}, [r1]!		
-  WELS_ASM_FUNC_END
 
 //	r0 int16_t * block, 
 //	r1	int32_t stride
@@ -371,207 +155,6 @@ block_zero_8x8_chma_loop:
 	pop		{r2}
   WELS_ASM_FUNC_END
 
-//	r0	int8_t* dst_addr, 
-//	r1	memset_value
-//	r2	int32_t bytes_nmb,
-
-  WELS_ASM_FUNC_BEGIN svc_block_memset_neon// dst should continue
-	vdup.u8	q0, r1
-	vdup.u8	q1, r1
-		
-block_memset_loop:	
-	vst1.64	{q0, q1}, [r0,:64]!
-	subs		r2,	r2, #64
-	vst1.64	{q0, q1}, [r0,:64]!
-	bne			block_memset_loop
-  WELS_ASM_FUNC_END
-
-//	int16_t* dst, 
-//	int16_t* src,
-//	int32_t stride	
-  WELS_ASM_FUNC_BEGIN svc_block_copy_16x16_neon
-	push		{r3}
-	mov			r3, #16
-// each element is sizeof(int16_t)
-	lsl			r2, r2, #1	// r2 = 2*r2
-
-block_copy_16x16_luma_loop:	
-	vld1.i16	{q0, q1}, [r1], r2
-	subs		r3,	r3, #1
-	vst1.i16	{q0, q1}, [r0]!
-	bne			block_copy_16x16_luma_loop
-	
-	pop		{r3}
-  WELS_ASM_FUNC_END
-	
-  WELS_ASM_FUNC_BEGIN svc_block_copy_8x8_neon
-	push		{r3}
-	mov			r3, #8
-// each element is sizeof(int16_t)
-	lsl			r2, r2, #1	// r2 = 2*r2
-
-block_copy_8x8_chma_loop:	
-	vld1.i16	{q0}, [r1], r2
-	subs		r3,	r3, #1
-	vst1.i16	{q0}, [r0]!
-	bne			block_copy_8x8_chma_loop
-	
-	pop		{r3}
-  WELS_ASM_FUNC_END
-
-// r0    uint8_t * dest,
-// r1    uint8_t * pred,
-// r2    int16_t * res,
-// r3    int32_t stride,
-  WELS_ASM_FUNC_BEGIN svc_block_add_16x16_neon
-	push		{r4}
-	mov		r4, #16
-	pld		[r1]	
-block_recon_16x16_luma_loop:
-
-	vld1.64		{d16,d17}, [r1,:64], r3		//load 16 pred data, update addr
-	vld1.s16		{q0, q1}, [r2]!				//load 8+8 resi data, update addr
-	vld1.64		{d18,d19}, [r1,:64], r3
-	vld1.s16		{q2, q3}, [r2]!
-	ADD_PRED_1BYTE_TO_RESID_2BYTES		q0, q1, q2, q3, d16, d17, d18, d19
-	pld		[r1]
-	vst1.64         {q8}, [r0], r3      //store result		
-	vst1.64         {q9}, [r0], r3
-//#ifdef	DEBUG_NEON
-//	vst1.u8         {q8}, [r0]!		
-//	vst1.u8         {q9}, [r0]!
-//#endif
-
-	vld1.64		{d20,d21}, [r1,:64], r3		//load 16 pred data, update addr
-	vld1.s16		{q4, q5}, [r2]!			//load 8+8 resi data, update addr
-	vld1.64		{d22,d23}, [r1,:64], r3
-	vld1.s16		{q6, q7}, [r2]!
-	ADD_PRED_1BYTE_TO_RESID_2BYTES		q4, q5, q6, q7, d20, d21, d22, d23
-	pld		[r1]
-	vst1.64         {q10}, [r0], r3
-	vst1.64         {q11}, [r0], r3
-//#ifdef	DEBUG_NEON
-//	vst1.u8         {q10}, [r0]!
-//	vst1.u8         {q11}, [r0]!
-//#endif
-
-	subs		r4, r4, #4
-	bne		block_recon_16x16_luma_loop
-
-	pop		{r4}
-  WELS_ASM_FUNC_END
-
-
-  WELS_ASM_FUNC_BEGIN svc_block_add_8x8_neon
-
-	vld1.u8		{d24}, [r1], r3		//load 8 pred data
-	vld1.i16		{q8, q9}, [r2]!		//load 8+8 resi data, update addr	
-	vld1.u8		{d25}, [r1], r3		//load 8 pred data, q12	
-	vld1.i16		{q10, q11}, [r2]!		//load 8+8 resi data, update addr
-	vld1.u8		{d26}, [r1], r3		//load 8 pred data
-	vld1.u8		{d27}, [r1], r3		//load 8 pred data, q13
-
-	ADD_PRED_1BYTE_TO_RESID_2BYTES		q8, q9, q10, q11, d24, d25, d26, d27
-	pld		[r1]
-	vst1.u8         {d24}, [r0], r3      //store result	 
-	vst1.u8         {d25}, [r0], r3      //store result	 
-	vst1.u8         {d26}, [r0], r3      //store result	 
-	vst1.u8         {d27}, [r0], r3      //store result		
-//#ifdef	DEBUG_NEON
-//	vst1.u8         {d24}, [r0]!
-//#endif
-	
-	vld1.u8		{d24}, [r1], r3		//load 8 pred data
-	vld1.i16		{q8, q9}, [r2]!		//load 8+8 resi data, update addr	
-	vld1.u8		{d25}, [r1], r3		//load 8 pred data, q12	
-	vld1.i16		{q10, q11}, [r2]!		//load 8+8 resi data, update addr
-	vld1.u8		{d26}, [r1], r3		//load 8 pred data
-	vld1.u8		{d27}, [r1], r3		//load 8 pred data, q13
-
-	ADD_PRED_1BYTE_TO_RESID_2BYTES		q8, q9, q10, q11, d24, d25, d26, d27
-	vst1.u8         {d24}, [r0], r3      //store result	 
-	vst1.u8         {d25}, [r0], r3      //store result	 
-	vst1.u8         {d26}, [r0], r3      //store result	 
-	vst1.u8         {d27}, [r0], r3      //store result		
-//#ifdef	DEBUG_NEON
-//	vst1.u8         {d24}, [r0]!
-//#endif
-  WELS_ASM_FUNC_END
-
-
-//	int16_t* dst,
-//	int16_t* src,
-//	int stride
-  WELS_ASM_FUNC_BEGIN svc_simple_idct4x4_neon
-
-	vld4.s16		{d0, d1, d2, d3}, [r1]	// cost 3 cycles!
-	lsl			r2, r2, #1	
-
-	ROW_TRANSFORM_1_STEP		d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
-	
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	// transform element 32bits
-	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-	vswp			d1, d4				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-
-	COL_TRANSFORM_1_STEP		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q4, q5, q6, q7	
-
-	vrshrn.s32		d0, q0, #6	
-	vst1.s16		{d0}, [r0], r2	//store			
-	vrshrn.s32		d1, q1, #6	
-	vst1.s16		{d1}, [r0], r2	//store	
-	vrshrn.s32		d2, q2, #6
-	vst1.s16		{d2}, [r0], r2	//store				
-	vrshrn.s32		d3, q3, #6	
-	vst1.s16		{d3}, [r0], r2	//store			
-
-  WELS_ASM_FUNC_END
-//	int16_t* dst,
-//	int16_t* src,
-//	int stride
-  WELS_ASM_FUNC_BEGIN svc_idct4x4_add_neon
-
-	vld4.s16		{d0, d1, d2, d3}, [r1]		// cost 3 cycles!	
-	lsl			r2, r2, #1	
-	
-	ROW_TRANSFORM_1_STEP		d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
-	
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q4, q5, q6, q7	
-	
-	// transform element 32bits
-	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-	vswp			d1, d4				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-
-	COL_TRANSFORM_1_STEP		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q4, q5, q6, q7	
-			
-	//see draft G.8.5.3 , after clip_rs() into [-255, 255]
-	vmov.i16		q10,#0xFF
-	veor			q11, q11
-	vsub.i16		q11, q11,q10
-//	vmvn.i16		q11,#0xFF
-
-	mov			r1, r0
-	vld1.s16		{d16}, [r0], r2	
-	vld1.s16		{d17}, [r0], r2
-	ADD_AND_CLIP_RS	q0, q1, q10, q11, q8, d8, d9, q4
-	vst1.s16		{d8}, [r1], r2	//store
-	vst1.s16		{d9}, [r1], r2	//store	
-			
-	vld1.s16		{d18}, [r0], r2	
-	vld1.s16		{d19}, [r0], r2
-	ADD_AND_CLIP_RS	q2, q3, q10, q11, q9, d10, d11, q5	
-	vst1.s16		{d10}, [r1], r2	//store
-	vst1.s16		{d11}, [r1], r2	//store
-  WELS_ASM_FUNC_END
 
 //	uint8_t *pred, const int32_t stride, int16_t *rs
   WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
diff --git a/codec/decoder/core/src/deblocking.cpp b/codec/decoder/core/src/deblocking.cpp
index 772ff48e..80ce7484 100644
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -720,6 +720,7 @@ void  DeblockingInit (SDeblockingFunc*  pFunc,  int32_t iCpu) {
 #endif
 
 #if defined(HAVE_NEON)
+	if ( iCpu & WELS_CPU_NEON )	// assign the NEON deblocking routines only when iCpu reports NEON
 	{
 		pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_neon;
 		pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_neon;
diff --git a/codec/decoder/core/src/decode_slice.cpp b/codec/decoder/core/src/decode_slice.cpp
index e8a6f4f4..f94c85f4 100644
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1150,9 +1150,11 @@ void WelsBlockFuncInit (SBlockFunc*   pFunc,  int32_t iCpu) {
 #endif
 
 #ifdef	HAVE_NEON
-  pFunc->pWelsBlockZero16x16Func		= WelsResBlockZero16x16_neon;
-  pFunc->pWelsBlockZero8x8Func			= WelsResBlockZero8x8_neon;
-  pFunc->pWelsSetNonZeroCountFunc			= SetNonZeroCount_neon;
+  if ( iCpu & WELS_CPU_NEON ) {	// assign the NEON routines only when the iCpu flags include WELS_CPU_NEON
+    pFunc->pWelsBlockZero16x16Func		= WelsResBlockZero16x16_neon;
+    pFunc->pWelsBlockZero8x8Func			= WelsResBlockZero8x8_neon;
+    pFunc->pWelsSetNonZeroCountFunc			= SetNonZeroCount_neon;
+  }
 #endif
 }
 void WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) {
diff --git a/codec/decoder/core/src/decoder.cpp b/codec/decoder/core/src/decoder.cpp
index 2d183e5e..96d469d0 100644
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -146,7 +146,14 @@ void WelsDecoderDefaults (PWelsDecoderContext pCtx) {
 
 #if defined(X86_ASM)
   pCtx->uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores);
-#endif//X86_ASM
+#elif defined(HAVE_NEON) && defined(ANDROID_NDK)
+  pCtx->uiCpuFlag	= WelsCPUFeatureDetectAndroid();
+#elif defined(HAVE_NEON) && defined(APPLE_IOS)
+  pCtx->uiCpuFlag	= WelsCPUFeatureDetectIOS();
+#elif defined(HAVE_NEON)
+  // No runtime probe on this platform: trust the HAVE_NEON build flag so the NEON paths stay enabled.
+  pCtx->uiCpuFlag	= WELS_CPU_NEON;
+#endif
 
   pCtx->iImgWidthInPixel		= 0;
   pCtx->iImgHeightInPixel		= 0;		// alloc picture data when picture size is available
@@ -657,26 +664,28 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
   pCtx->pIdctResAddPredFunc	= IdctResAddPred_c;
     
 #if defined(HAVE_NEON)
-  pCtx->pIdctResAddPredFunc	= IdctResAddPred_neon;
+  if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) {	// use the NEON routines only when uiCpuFlag reports NEON
+    pCtx->pIdctResAddPredFunc	= IdctResAddPred_neon;
     
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_neon;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsDecoderI16x16LumaPredH_neon;
-	pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsDecoderI16x16LumaPredV_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsDecoderI16x16LumaPredH_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsDecoderI16x16LumaPredV_neon;
     
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_V    ] = WelsDecoderI4x4LumaPredV_neon;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_H    ] = WelsDecoderI4x4LumaPredH_neon;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL  ] = WelsDecoderI4x4LumaPredDDL_neon;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR  ] = WelsDecoderI4x4LumaPredDDR_neon;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL   ] = WelsDecoderI4x4LumaPredVL_neon;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR   ] = WelsDecoderI4x4LumaPredVR_neon;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU   ] = WelsDecoderI4x4LumaPredHU_neon;
-	pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD   ] = WelsDecoderI4x4LumaPredHD_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_V    ] = WelsDecoderI4x4LumaPredV_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_H    ] = WelsDecoderI4x4LumaPredH_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL  ] = WelsDecoderI4x4LumaPredDDL_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR  ] = WelsDecoderI4x4LumaPredDDR_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL   ] = WelsDecoderI4x4LumaPredVL_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR   ] = WelsDecoderI4x4LumaPredVR_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU   ] = WelsDecoderI4x4LumaPredHU_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD   ] = WelsDecoderI4x4LumaPredHD_neon;
 	
-	pCtx->pGetIChromaPredFunc[C_PRED_H]       = WelsDecoderIChromaPredH_neon;
-	pCtx->pGetIChromaPredFunc[C_PRED_V]       = WelsDecoderIChromaPredV_neon;
-	pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsDecoderIChromaPredPlane_neon;
-	pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsDecoderIChromaPredDC_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_H]       = WelsDecoderIChromaPredH_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_V]       = WelsDecoderIChromaPredV_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsDecoderIChromaPredPlane_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsDecoderIChromaPredDC_neon;
+  }
 #endif//HAVE_NEON
 
 
diff --git a/codec/decoder/core/src/mc.cpp b/codec/decoder/core/src/mc.cpp
index ae2be16d..fa840512 100644
--- a/codec/decoder/core/src/mc.cpp
+++ b/codec/decoder/core/src/mc.cpp
@@ -971,8 +971,10 @@ void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) {
   pMcFunc->pMcChromaFunc = McChroma_c;
 
 #ifdef	HAVE_NEON
-	 pMcFunc->pMcLumaFunc	  = McLuma_neon;
-	 pMcFunc->pMcChromaFunc  = McChroma_neon;
+  if ( iCpu & WELS_CPU_NEON ) {	// assign the NEON motion-compensation routines only when iCpu reports NEON
+    pMcFunc->pMcLumaFunc	  = McLuma_neon;
+    pMcFunc->pMcChromaFunc  = McChroma_neon;
+  }
 #endif
 
 #if defined (X86_ASM)