Add ARM 32-bit asm code for the decoder.

Licai Guo 2014-02-28 13:36:34 +08:00
parent fc056c7ef0
commit 0fd9db2878
15 changed files with 4903 additions and 11 deletions


@@ -36,6 +36,11 @@
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467A18BC5EAA0017DF25 /* utils.cpp */; };
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */; };
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */; };
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A818BC6BE90017DF25 /* deblocking_neon.S */; };
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
@@ -83,7 +88,7 @@
4CE4465018BC5EAA0017DF25 /* error_code.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = error_code.h; sourceTree = "<group>"; };
4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; };
4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; };
4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; };
4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 2; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; };
4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = manage_dec_ref.h; sourceTree = "<group>"; };
4CE4465518BC5EAA0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; };
4CE4465618BC5EAA0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
@@ -102,19 +107,19 @@
4CE4466318BC5EAA0017DF25 /* vlc_decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vlc_decoder.h; sourceTree = "<group>"; };
4CE4466418BC5EAA0017DF25 /* wels_common_basis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_common_basis.h; sourceTree = "<group>"; };
4CE4466518BC5EAA0017DF25 /* wels_const.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_const.h; sourceTree = "<group>"; };
4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; };
4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; };
4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 1; };
4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 1; };
4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; };
4CE4466A18BC5EAA0017DF25 /* decode_mb_aux.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_mb_aux.cpp; sourceTree = "<group>"; };
4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; };
4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; };
4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 1; };
4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 2; usesTabs = 1; };
4CE4466D18BC5EAA0017DF25 /* decoder_core.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_core.cpp; sourceTree = "<group>"; };
4CE4466E18BC5EAA0017DF25 /* decoder_data_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_data_tables.cpp; sourceTree = "<group>"; };
4CE4466F18BC5EAA0017DF25 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_pic.cpp; sourceTree = "<group>"; };
4CE4467018BC5EAA0017DF25 /* fmo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fmo.cpp; sourceTree = "<group>"; };
4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; };
4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = manage_dec_ref.cpp; sourceTree = "<group>"; };
4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; };
4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 1; wrapsLines = 1; };
4CE4467418BC5EAA0017DF25 /* mem_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mem_align.cpp; sourceTree = "<group>"; };
4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memmgr_nal_unit.cpp; sourceTree = "<group>"; };
4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; };
@@ -127,6 +132,11 @@
4CE4468318BC5EAB0017DF25 /* wels_dec_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_dec_export.def; sourceTree = "<group>"; };
4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsCodecTrace.cpp; sourceTree = "<group>"; };
4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
4CE447A818BC6BE90017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; };
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -212,6 +222,7 @@
4CE4463F18BC5EAA0017DF25 /* core */ = {
isa = PBXGroup;
children = (
4CE447A518BC6BE90017DF25 /* arm */,
4CE4464418BC5EAA0017DF25 /* inc */,
4CE4466618BC5EAA0017DF25 /* src */,
);
@@ -313,6 +324,18 @@
path = src;
sourceTree = "<group>";
};
4CE447A518BC6BE90017DF25 /* arm */ = {
isa = PBXGroup;
children = (
4CE447A618BC6BE90017DF25 /* arm_arch_common_macro.S */,
4CE447A718BC6BE90017DF25 /* block_add_neon.S */,
4CE447A818BC6BE90017DF25 /* deblocking_neon.S */,
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */,
4CE447AA18BC6BE90017DF25 /* mc_neon.S */,
);
path = arm;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
@@ -394,20 +417,25 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */,
4CE4469B18BC5EAB0017DF25 /* pic_queue.cpp in Sources */,
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */,
4CE4469318BC5EAB0017DF25 /* fmo.cpp in Sources */,
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */,
4CE4469118BC5EAB0017DF25 /* decoder_data_tables.cpp in Sources */,
4CE4469718BC5EAB0017DF25 /* mem_align.cpp in Sources */,
4CE447AB18BC6BE90017DF25 /* arm_arch_common_macro.S in Sources */,
4CE4469518BC5EAB0017DF25 /* manage_dec_ref.cpp in Sources */,
4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */,
4CE4469218BC5EAB0017DF25 /* expand_pic.cpp in Sources */,
4CE4469918BC5EAB0017DF25 /* mv_pred.cpp in Sources */,
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */,
4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */,
4CE4469018BC5EAB0017DF25 /* decoder_core.cpp in Sources */,
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */,
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */,
4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */,
4CE447AD18BC6BE90017DF25 /* deblocking_neon.S in Sources */,
4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */,
4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */,
4CE4468D18BC5EAB0017DF25 /* decode_mb_aux.cpp in Sources */,


@@ -33,7 +33,23 @@ void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
#endif
#if defined(HAVE_NEON)
void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus
#endif //WELS_DEBLOCKING_COMMON_H__


@@ -39,6 +39,79 @@
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
/*
void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
void McLuma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
void McChroma_neon(uint8_t* pSrc, int32_t src_stride, uint8_t* dst, int32_t dst_stride,
int16_t iMvX, int16_t iMvY, int32_t width, int32_t height);
*/
void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//vertical filter to gain half sample, that is (0, 2) location in quarter sample
void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
#endif
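For reference alongside the half-sample comments above, here is a minimal scalar sketch (the _ref helper name is hypothetical, not part of this commit) of the (2, 0) horizontal case: the standard 6-tap (1, -5, 20, 20, -5, 1) filter with +16 rounding and a 5-bit shift. The (0, 2) vertical case applies the same taps down a column, and (2, 2) applies both passes with higher intermediate precision.

#include <stdint.h>

static uint8_t Clip255 (int32_t iValue) {       // clamp to [0, 255]
  return (uint8_t) (iValue < 0 ? 0 : (iValue > 255 ? 255 : iValue));
}

// Hypothetical scalar reference for the (2, 0) half-sample interpolation.
static void McHorVer20_ref (const uint8_t* pSrc, int32_t iSrcStride,
                            uint8_t* pDst, int32_t iDstStride,
                            int32_t iWidth, int32_t iHeight) {
  for (int32_t y = 0; y < iHeight; y++) {
    for (int32_t x = 0; x < iWidth; x++) {
      int32_t iSum = pSrc[x - 2] - 5 * pSrc[x - 1] + 20 * pSrc[x]
                   + 20 * pSrc[x + 1] - 5 * pSrc[x + 2] + pSrc[x + 3];
      pDst[x] = Clip255 ((iSum + 16) >> 5);     // round and shift back to 8 bits
    }
    pSrc += iSrcStride;
    pDst += iDstStride;
  }
}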
#if defined(X86_ASM)
//***************************************************************************//
// MMXEXT definition //


@@ -0,0 +1,55 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef APPLE_IOS
.macro WELS_ASM_FUNC_BEGIN
.align 2
.arm
.globl _$0
_$0:
.endm
#else
.macro WELS_ASM_FUNC_BEGIN funcName
.align 2
.arm
.global \funcName
\funcName:
.endm
#endif
.macro WELS_ASM_FUNC_END
mov pc, lr
.endm
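For orientation only (the routine name below is hypothetical): a function written between this macro pair becomes an ordinary global ARM symbol that returns via mov pc, lr, and on APPLE_IOS the underscore emitted by _$0 is just the Mach-O spelling of the same C-visible name, so callers declare and invoke it like any other C function.

#include <stdint.h>

// Hypothetical prototype for a routine defined in assembly as
//   WELS_ASM_FUNC_BEGIN SomeRoutine_neon
//     ...NEON body, arguments arriving in r0..r3 per the AAPCS...
//   WELS_ASM_FUNC_END        // expands to "mov pc, lr"
void SomeRoutine_neon (uint8_t* pDst, int32_t iStride);

void CallSite (uint8_t* pDst, int32_t iStride) {
  SomeRoutine_neon (pDst, iStride);   // plain C call into the NEON routine
}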


@@ -0,0 +1,620 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
.macro ORR_32BYTES_TO_8BYTES
// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
vorr.s16 $0, $1
vorr.s16 $2, $3
vorr.s16 $8, $4, $5
vorr.s16 $9, $6, $7
// }
.endm
.macro ADD_PRED_1BYTE_TO_RESID_2BYTES
// { // input: q0~q3, d0~d3, output: d0~d3;
vaddw.u8 $0, $4
vaddw.u8 $1, $5
vaddw.u8 $2, $6
vaddw.u8 $3, $7
vqmovun.s16 $4, $0 //saturation
vqmovun.s16 $6, $2
vqmovun.s16 $5, $1
vqmovun.s16 $7, $3
// }
.endm
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro ADD_AND_CLIP_RS
// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;
vrshrn.s32 $5, $0, #6
vrshrn.s32 $6, $1, #6
vqadd.s16 $7, $4
vmin.s16 $7, $7, $2
vmax.s16 $7, $7, $3
// }
.endm
#else
.macro ORR_32BYTES_TO_8BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: smb#a q0 & q1, smb#b q2&q3, q0[0], q0[1], q2[0], q2[1], output: d_0, d_1
vorr.s16 \arg0, \arg1
vorr.s16 \arg2, \arg3
vorr.s16 \arg8, \arg4, \arg5
vorr.s16 \arg9, \arg6, \arg7
// }
.endm
.macro ADD_PRED_1BYTE_TO_RESID_2BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: q0~q3, d0~d3, output: d0~d3;
vaddw.u8 \arg0, \arg4
vaddw.u8 \arg1, \arg5
vaddw.u8 \arg2, \arg6
vaddw.u8 \arg3, \arg7
vqmovun.s16 \arg4, \arg0 //saturation
vqmovun.s16 \arg6, \arg2
vqmovun.s16 \arg5, \arg1
vqmovun.s16 \arg7, \arg3
// }
.endm
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 \arg8, \arg1, #1
vshr.s16 \arg9, \arg3, #1
vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 \arg6, \arg1, #1
vshr.s32 \arg7, \arg3, #1
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro ADD_AND_CLIP_RS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[1], matrix_max~min, pred_q, working_d[0]:[1], output: q;
vrshrn.s32 \arg5, \arg0, #6
vrshrn.s32 \arg6, \arg1, #6
vqadd.s16 \arg7, \arg4
vmin.s16 \arg7, \arg7, \arg2
vmax.s16 \arg7, \arg7, \arg3
// }
.endm
#endif
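The transform macros above carry the scalar formulas in their comments; as a rough C equivalent (helper name hypothetical, stride handling omitted for brevity), one row pass, one column pass, and the final rounded 6-bit shift performed by vrshrn #6 look like this:

#include <stdint.h>

// Hypothetical scalar sketch of ROW_TRANSFORM_1_STEP + TRANSFORM_4BYTES,
// COL_TRANSFORM_1_STEP + TRANSFORM_4BYTES, and the (x + 32) >> 6 rounding.
static void SimpleIdct4x4_ref (int16_t* pDst, const int16_t* pSrc) {
  int32_t f[4][4];
  for (int32_t i = 0; i < 4; i++) {             // row transform
    const int16_t* s = pSrc + 4 * i;
    int32_t e0 = s[0] + s[2];
    int32_t e1 = s[0] - s[2];
    int32_t e2 = (s[1] >> 1) - s[3];
    int32_t e3 = s[1] + (s[3] >> 1);
    f[i][0] = e0 + e3;
    f[i][1] = e1 + e2;
    f[i][2] = e1 - e2;
    f[i][3] = e0 - e3;
  }
  for (int32_t j = 0; j < 4; j++) {             // column transform
    int32_t e0 = f[0][j] + f[2][j];
    int32_t e1 = f[0][j] - f[2][j];
    int32_t e2 = (f[1][j] >> 1) - f[3][j];
    int32_t e3 = f[1][j] + (f[3][j] >> 1);
    pDst[0 * 4 + j] = (int16_t) ((e0 + e3 + 32) >> 6);   // rounded, as vrshrn #6
    pDst[1 * 4 + j] = (int16_t) ((e1 + e2 + 32) >> 6);
    pDst[2 * 4 + j] = (int16_t) ((e1 - e2 + 32) >> 6);
    pDst[3 * 4 + j] = (int16_t) ((e0 - e3 + 32) >> 6);
  }
}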
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
vld1.64 {d0-d2}, [r1]
vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0
vmvn q0, q0
vmvn d2, d2
vabs.s8 q0, q0
vabs.s8 d2, d2
vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END
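Read in scalar terms (sketch only, helper name hypothetical), the vceq/vmvn/vabs sequence above collapses each of the 24 bytes it loads into a 0/1 flag:

#include <stdint.h>

// Hypothetical scalar equivalent of SetNonZeroCount_neon.
static void SetNonZeroCount_ref (int8_t* pNonZeroCount) {
  for (int32_t i = 0; i < 24; i++)                          // d0-d2 cover 24 bytes
    pNonZeroCount[i] = (pNonZeroCount[i] != 0) ? 1 : 0;
}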
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN svc_non_zero_count_neon
push {r2-r4}
mov r4, #3
mov r3, #64
add r2, r0, #32
pld [r0, #512]
non_zero_count_two_8x8_loop:
vld1.64 {q0, q1}, [r0,:128], r3
vld1.64 {q2, q3}, [r2,:128], r3
vld1.64 {q4, q5}, [r0,:128], r3
vld1.64 {q6, q7}, [r2,:128], r3
vld1.64 {q8, q9}, [r0,:128], r3
vld1.64 {q10, q11}, [r2,:128], r3//load #0 8x8 block resi data,
vld1.64 {q12, q13}, [r0,:128], r3
vld1.64 {q14, q15}, [r2,:128], r3//load #1 8x8 block resi data,
pld [r0, #512]
ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, d0, d1, d4, d5, d2, d3 // output q1
// vceq.i16 q1, q1, #0
ORR_32BYTES_TO_8BYTES q8, q9,q10,q11,d16,d17,d20,d21,d4,d5 // output q2
// vceq.i16 q2, q2, #0
ORR_32BYTES_TO_8BYTES q4, q5, q6, q7, d8, d9, d12, d13, d10, d11 // output q5
// vceq.i16 q5, q5, #0
ORR_32BYTES_TO_8BYTES q12,q13,q14,q15,d24,d25, d28, d29, d12, d13 // output q6
// vceq.i16 q6, q6, #0
vqmovn.u64 d0, q1 // 8bytes-->4bytes
vqmovn.u64 d8, q5
vqmovn.u64 d1, q2
vqmovn.u64 d9, q6
vqmovn.u32 d2, q0 // 4bytes-->2bytes
vqmovn.u32 d3, q4
vceq.i16 q0, q1, #0
vmvn q0, q0
vabs.s16 q2, q0
vmovn.u16 d6, q2 // 2bytes-->1bytes
vst1.u8 {d6}, [r1]!
// pld [r0]
subs r4, r4, #1
bne non_zero_count_two_8x8_loop
pop {r2-r4}
WELS_ASM_FUNC_END
// r0 int16_t* block,
// r1 int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN svc_rs_non_zero_count_neon
vld1.i16 {q0, q1}, [r0]! // block is unaligned!!!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]!
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]!
ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
vorr.s16 q0, q4
vorr.s16 q1, q5 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
vorr.s16 q6, q8, q12
vorr.s16 q7, q9, q13 // output d12~d15
vqmovn.u64 d4, q0 // 8bytes-->4bytes
vqmovn.u64 d6, q6
vqmovn.u64 d5, q1
vqmovn.u64 d7, q7
vqmovn.u32 d8, q2 // 4bytes-->2bytes
vqmovn.u32 d9, q3
vceq.i16 q5, q4, #0
vmvn q5, q5
vabs.s16 q5, q5
vmovn.u16 d10, q5 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
vld1.i16 {q0, q1}, [r0]!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]!
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]!
ORR_32BYTES_TO_8BYTES q0, q2, q1, q3, q4, q6, q5, q7, q4, q5
vorr.s16 q0, q4
vorr.s16 q1, q5 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q10, q9, q11, q12, q14, q13, q15, q12, q13
vorr.s16 q6, q8, q12
vorr.s16 q7, q9, q13 // output d12~d15
vqmovn.u64 d4, q0 // 8bytes-->4bytes
vqmovn.u64 d6, q6
vqmovn.u64 d5, q1
vqmovn.u64 d7, q7
vqmovn.u32 d8, q2 // 4bytes-->2bytes
vqmovn.u32 d9, q3
vceq.i16 q5, q4, #0
vmvn q5, q5
vabs.s16 q5, q5
vmovn.u16 d10, q5 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
// Chroma
vld1.i16 {q0, q1}, [r0]!
vld1.i16 {q2, q3}, [r0]!
vld1.i16 {q4, q5}, [r0]!
vld1.i16 {q6, q7}, [r0]! //load Cb block,
vld1.i16 {q8, q9}, [r0]!
vld1.i16 {q10, q11}, [r0]!
vld1.i16 {q12, q13}, [r0]!
vld1.i16 {q14, q15}, [r0]! //load Cr block,
ORR_32BYTES_TO_8BYTES q0, q1, q2, q3, q4, q5, q6, q7, q4, q6
vorr.s16 q0, q2
vorr.s16 q1, q4, q6 // output d0~d3
ORR_32BYTES_TO_8BYTES q8, q9, q10, q11, q12, q13, q14, q15, q12, q14
vorr.s16 q2, q8, q10
vorr.s16 q3, q12, q14 // output d4~d7
vqmovn.u64 d8, q0 // 8bytes-->4bytes
vqmovn.u64 d10, q2
vqmovn.u64 d9, q1
vqmovn.u64 d11, q3
vqmovn.u32 d12, q4 // 4bytes-->2bytes
vqmovn.u32 d13, q5
vceq.i16 q7, q6, #0
vmvn q7, q7
vabs.s16 q7, q7
vmovn.u16 d10, q7 // 2bytes-->1bytes
vst1.u8 {d10}, [r1]!
WELS_ASM_FUNC_END
// r0 int16_t * block,
// r1 int32_t stride
WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon// can use for 256*sizeof(int16_t)
push {r2}
mov r2, #16
// each row 16 elements, 16*sizeof(int16_t)
// memset(ptr_dest, 0, 16*sizeof(int16_t));
// ptr_dest += stride;
lsl r1, r1, #1 // r1 = 2*r1
veor.i16 q0, q0, q0
veor.i16 q1, q1, q1
block_zero_16x16_luma_loop:
vst1.i16 {q0, q1}, [r0], r1
subs r2, r2, #2
vst1.i16 {q0, q1}, [r0], r1
bne block_zero_16x16_luma_loop
pop {r2}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon// can use for 64*sizeof(int16_t)
push {r2}
mov r2, #8
// each row 8 elements, 8*sizeof(int16_t)
// memset(ptr_dest, 0, 8*sizeof(int16_t));
// ptr_dest += stride;
lsl r1, r1, #1
veor.i16 q0, q0, q0
block_zero_8x8_chma_loop:
vst1.i16 {q0}, [r0], r1
subs r2, r2, #2
vst1.i16 {q0}, [r0], r1
bne block_zero_8x8_chma_loop
pop {r2}
WELS_ASM_FUNC_END
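The commented-out memset lines above state the intent; a scalar sketch of the 16x16 case (helper name hypothetical) makes the stride convention explicit: iStride counts int16_t elements, which is why the assembly doubles it with lsl #1 before using it as a byte offset.

#include <stdint.h>
#include <string.h>

// Hypothetical scalar sketch of WelsResBlockZero16x16_neon.
static void ResBlockZero16x16_ref (int16_t* pBlock, int32_t iStride) {
  for (int32_t i = 0; i < 16; i++) {
    memset (pBlock, 0, 16 * sizeof (int16_t));   // clear one 16-coefficient row
    pBlock += iStride;                           // stride is in int16_t units
  }
}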
// r0 int8_t* dst_addr,
// r1 memset_value
// r2 int32_t bytes_nmb,
WELS_ASM_FUNC_BEGIN svc_block_memset_neon// dst should continue
vdup.u8 q0, r1
vdup.u8 q1, r1
block_memset_loop:
vst1.64 {q0, q1}, [r0,:64]!
subs r2, r2, #64
vst1.64 {q0, q1}, [r0,:64]!
bne block_memset_loop
WELS_ASM_FUNC_END
// int16_t* dst,
// int16_t* src,
// int32_t stride
WELS_ASM_FUNC_BEGIN svc_block_copy_16x16_neon
push {r3}
mov r3, #16
// each element is sizeof(int16_t)
lsl r2, r2, #1 // r2 = 2*r2
block_copy_16x16_luma_loop:
vld1.i16 {q0, q1}, [r1], r2
subs r3, r3, #1
vst1.i16 {q0, q1}, [r0]!
bne block_copy_16x16_luma_loop
pop {r3}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN svc_block_copy_8x8_neon
push {r3}
mov r3, #8
// each element is sizeof(int16_t)
lsl r2, r2, #1 // r2 = 2*r2
block_copy_8x8_chma_loop:
vld1.i16 {q0}, [r1], r2
subs r3, r3, #1
vst1.i16 {q0}, [r0]!
bne block_copy_8x8_chma_loop
pop {r3}
WELS_ASM_FUNC_END
// r0 uint8_t * dest,
// r1 uint8_t * pred,
// r2 int16_t * res,
// r3 int32_t stride,
WELS_ASM_FUNC_BEGIN svc_block_add_16x16_neon
push {r4}
mov r4, #16
pld [r1]
block_recon_16x16_luma_loop:
vld1.64 {d16,d17}, [r1,:64], r3 //load 16 pred data, update addr
vld1.s16 {q0, q1}, [r2]! //load 8+8 resi data, update addr
vld1.64 {d18,d19}, [r1,:64], r3
vld1.s16 {q2, q3}, [r2]!
ADD_PRED_1BYTE_TO_RESID_2BYTES q0, q1, q2, q3, d16, d17, d18, d19
pld [r1]
vst1.64 {q8}, [r0], r3 //store result
vst1.64 {q9}, [r0], r3
//#ifdef DEBUG_NEON
// vst1.u8 {q8}, [r0]!
// vst1.u8 {q9}, [r0]!
//#endif
vld1.64 {d20,d21}, [r1,:64], r3 //load 16 pred data, update addr
vld1.s16 {q4, q5}, [r2]! //load 8+8 resi data, update addr
vld1.64 {d22,d23}, [r1,:64], r3
vld1.s16 {q6, q7}, [r2]!
ADD_PRED_1BYTE_TO_RESID_2BYTES q4, q5, q6, q7, d20, d21, d22, d23
pld [r1]
vst1.64 {q10}, [r0], r3
vst1.64 {q11}, [r0], r3
//#ifdef DEBUG_NEON
// vst1.u8 {q10}, [r0]!
// vst1.u8 {q11}, [r0]!
//#endif
subs r4, r4, #4
bne block_recon_16x16_luma_loop
pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN svc_block_add_8x8_neon
vld1.u8 {d24}, [r1], r3 //load 8 pred data
vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12
vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d26}, [r1], r3 //load 8 pred data
vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13
ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27
pld [r1]
vst1.u8 {d24}, [r0], r3 //store result
vst1.u8 {d25}, [r0], r3 //store result
vst1.u8 {d26}, [r0], r3 //store result
vst1.u8 {d27}, [r0], r3 //store result
//#ifdef DEBUG_NEON
// vst1.u8 {d24}, [r0]!
//#endif
vld1.u8 {d24}, [r1], r3 //load 8 pred data
vld1.i16 {q8, q9}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d25}, [r1], r3 //load 8 pred data, q12
vld1.i16 {q10, q11}, [r2]! //load 8+8 resi data, update addr
vld1.u8 {d26}, [r1], r3 //load 8 pred data
vld1.u8 {d27}, [r1], r3 //load 8 pred data, q13
ADD_PRED_1BYTE_TO_RESID_2BYTES q8, q9, q10, q11, d24, d25, d26, d27
vst1.u8 {d24}, [r0], r3 //store result
vst1.u8 {d25}, [r0], r3 //store result
vst1.u8 {d26}, [r0], r3 //store result
vst1.u8 {d27}, [r0], r3 //store result
//#ifdef DEBUG_NEON
// vst1.u8 {d24}, [r0]!
//#endif
WELS_ASM_FUNC_END
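In scalar form (sketch, helper name hypothetical), ADD_PRED_1BYTE_TO_RESID_2BYTES widens the 8-bit prediction, adds the 16-bit residual, and saturates back to [0, 255]; the residual rows arrive contiguously because they are loaded with post-increment, while prediction and destination step by the stride.

#include <stdint.h>

// Hypothetical scalar sketch of svc_block_add_16x16_neon / svc_block_add_8x8_neon.
static void BlockAdd_ref (uint8_t* pDest, const uint8_t* pPred, const int16_t* pRes,
                          int32_t iStride, int32_t iWidth, int32_t iHeight) {
  for (int32_t y = 0; y < iHeight; y++) {
    for (int32_t x = 0; x < iWidth; x++) {
      int32_t iValue = pPred[x] + pRes[x];
      pDest[x] = (uint8_t) (iValue < 0 ? 0 : (iValue > 255 ? 255 : iValue));   // as vqmovun.s16
    }
    pRes  += iWidth;     // residual is packed row after row
    pPred += iStride;
    pDest += iStride;
  }
}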
// int16_t* dst,
// int16_t* src,
// int stride
WELS_ASM_FUNC_BEGIN svc_simple_idct4x4_neon
vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles!
lsl r2, r2, #1
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
vrshrn.s32 d0, q0, #6
vst1.s16 {d0}, [r0], r2 //store
vrshrn.s32 d1, q1, #6
vst1.s16 {d1}, [r0], r2 //store
vrshrn.s32 d2, q2, #6
vst1.s16 {d2}, [r0], r2 //store
vrshrn.s32 d3, q3, #6
vst1.s16 {d3}, [r0], r2 //store
WELS_ASM_FUNC_END
// int16_t* dst,
// int16_t* src,
// int stride
WELS_ASM_FUNC_BEGIN svc_idct4x4_add_neon
vld4.s16 {d0, d1, d2, d3}, [r1] // cost 3 cycles!
lsl r2, r2, #1
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
//see draft G.8.5.3 , after clip_rs() into [-255, 255]
vmov.i16 q10,#0xFF
veor q11, q11
vsub.i16 q11, q11,q10
// vmvn.i16 q11,#0xFF
mov r1, r0
vld1.s16 {d16}, [r0], r2
vld1.s16 {d17}, [r0], r2
ADD_AND_CLIP_RS q0, q1, q10, q11, q8, d8, d9, q4
vst1.s16 {d8}, [r1], r2 //store
vst1.s16 {d9}, [r1], r2 //store
vld1.s16 {d18}, [r0], r2
vld1.s16 {d19}, [r0], r2
ADD_AND_CLIP_RS q2, q3, q10, q11, q9, d10, d11, q5
vst1.s16 {d10}, [r1], r2 //store
vst1.s16 {d11}, [r1], r2 //store
WELS_ASM_FUNC_END
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q4, q5, q6, q7, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q4, q5, q6, q7
TRANSFORM_4BYTES q0, q1, q2, q3, q4, q5, q6, q7
//after clip_table[MAX_NEG_CROP] into [0, 255]
mov r2, r0
vld1.32 {d12[0]},[r0],r1
vld1.32 {d12[1]},[r0],r1
vld1.32 {d14[0]},[r0],r1
vld1.32 {d14[1]},[r0]
vrshrn.s32 d8, q0, #6
vrshrn.s32 d9, q1, #6
vrshrn.s32 d10, q2, #6
vrshrn.s32 d11, q3, #6
vmovl.u8 q0,d12
vmovl.u8 q1,d14
vadd.s16 q0,q4
vadd.s16 q1,q5
vqmovun.s16 d12,q0
vqmovun.s16 d14,q1
vst1.32 {d12[0]},[r2],r1
vst1.32 {d12[1]},[r2],r1
vst1.32 {d14[0]},[r2],r1
vst1.32 {d14[1]},[r2]
WELS_ASM_FUNC_END
#endif

File diff suppressed because it is too large.


@@ -0,0 +1,746 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
//Global macro
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
//Global macro
.macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endmacro
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
/*
* void get_i16x16_luma_pred_v(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
//Get the top line data to 'q0'
sub r2, r0, r1
vldm r2, {d0, d1}
mov r2, r0
mov r3, #4
//Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
/*
* void get_i16x16_luma_pred_h(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
sub r2, r0, #1
mov r3, #4
loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r2], r1
vld1.8 {d2[],d3[]}, [r2], r1
vld1.8 {d4[],d5[]}, [r2], r1
vld1.8 {d6[],d7[]}, [r2], r1
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d2,d3}, [r0], r1
vst1.8 {d4,d5}, [r0], r1
vst1.8 {d6,d7}, [r0], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
/*
* void get_i16x16_luma_pred_dc_both(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
//stmdb sp!, { r2-r5, lr}
//Get the left vertical line data
sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1
GET_8BYTE_DATA d1, r2, r1
//Get the top horizontal line data
sub r2, r0, r1
vldm r2, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB
mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
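A scalar sketch of the DC path above (helper name hypothetical): sum the 16 top neighbours and 16 left neighbours, round with +16 and shift by 5 (the vrshr.u16 #5 step), then flood-fill the 16x16 block.

#include <stdint.h>

// Hypothetical scalar sketch of WelsDecoderI16x16LumaPredDc_neon.
static void I16x16LumaPredDc_ref (uint8_t* pPred, const int32_t kiStride) {
  int32_t iSum = 0;
  for (int32_t i = 0; i < 16; i++)
    iSum += pPred[i - kiStride] + pPred[i * kiStride - 1];   // top row + left column
  const uint8_t kuiDc = (uint8_t) ((iSum + 16) >> 5);
  for (int32_t y = 0; y < 16; y++)
    for (int32_t x = 0; x < 16; x++)
      pPred[y * kiStride + x] = kuiDc;
}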
/*
* void get_i16x16_luma_pred_plane(uint8_t *pred, const int32_t stride)
* r0 --- pred
* r1 --- stride
* return --- void
*/
//The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
//stmdb sp!, { r2-r5, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r2]
//Pack the top[-1] ~ top[6] to d1
sub r2, r0, r1
sub r3, r2, #1
vld1.8 d1, [r3]
//Pack the top[8] ~ top[15] to d2
add r3, #9
vld1.8 d2, [r3]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4
sub r3, r2, #1
GET_8BYTE_DATA d4, r3, r1
//Get and pack left[8] ~ left[15] to d3
add r3, r1
GET_8BYTE_DATA d3, r3, r1
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//revert the sequence of d2,d3
vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r2]
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q5, q1, #3
vadd.s16 q5, q3
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
//Set the line of MB
vst1.u32 {d0,d1}, [r0], r1
//Do the same processing for setting other lines
mov r2, #15
loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2
vadd.s16 q5, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
vst1.u32 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
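The plane path above follows the usual H.264 Intra_16x16_Plane construction; a scalar sketch (helper names hypothetical) of what the *5 table, the #6 shifts and the final 5-bit rounding compute:

#include <stdint.h>

static uint8_t ClipPlane255 (int32_t iValue) {
  return (uint8_t) (iValue < 0 ? 0 : (iValue > 255 ? 255 : iValue));
}

// Hypothetical scalar sketch of WelsDecoderI16x16LumaPredPlane_neon.
static void I16x16LumaPredPlane_ref (uint8_t* pPred, const int32_t kiStride) {
  const uint8_t* pTop = pPred - kiStride;                    // top neighbours
  int32_t iH = 0, iV = 0;
  for (int32_t i = 1; i <= 8; i++) {
    iH += i * (pTop[7 + i] - pTop[7 - i]);
    iV += i * (pPred[(7 + i) * kiStride - 1] - pPred[(7 - i) * kiStride - 1]);
  }
  const int32_t a = 16 * (pTop[15] + pPred[15 * kiStride - 1]);
  const int32_t b = (5 * iH + 32) >> 6;                      // matches the *5 table and >>6
  const int32_t c = (5 * iV + 32) >> 6;
  for (int32_t y = 0; y < 16; y++)
    for (int32_t x = 0; x < 16; x++)
      pPred[y * kiStride + x] = ClipPlane255 ((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}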
/*
* void get_i4x4_luma_pred_v(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
ldr r2, [r2]
//Set the luma MB using top line
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_h(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2]
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1
vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_d_l(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0], r1
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0], r1
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0], r1
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
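In scalar terms (sketch, helper name hypothetical), the diagonal-down-left prediction above is a rounded 1-2-1 filter along the eight top neighbours, with the last neighbour duplicated once, which is what the "t7 + (t7<<1)" note refers to.

#include <stdint.h>

// Hypothetical scalar sketch of WelsDecoderI4x4LumaPredDDL_neon.
static void I4x4LumaPredDDL_ref (uint8_t* pPred, const int32_t kiStride) {
  const uint8_t* pTop = pPred - kiStride;      // t0..t7
  uint8_t t[9];
  for (int32_t i = 0; i < 8; i++)
    t[i] = pTop[i];
  t[8] = pTop[7];                              // duplicate t7 past the end
  for (int32_t y = 0; y < 4; y++)
    for (int32_t x = 0; x < 4; x++)
      pPred[y * kiStride + x] =
          (uint8_t) ((t[x + y] + 2 * t[x + y + 1] + t[x + y + 2] + 2) >> 2);
}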
/*
* void get_i4x4_luma_pred_d_r(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//Load the left column (5 bytes)
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2], r1
vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_v_l(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0], r1
vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_v_r(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//Load the left column (4 bytes)
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2]
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0], r1
vst1.32 d1[1], [r0], r1
add r2, r0, r1
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r2]!
vst1.16 d1[2], [r2]!
vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END
/*
* get_i4x4_luma_pred_h_u(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
//NO TEST
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data
sub r2, r0, #1
mov r3, #3
mul r3, r1
add r3, r2
vld1.8 {d0[]}, [r3]
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0], r1
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0], r1
vst1.32 d1[1], [r0], r1
vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
/*
* void get_i4x4_luma_pred_h_d(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
//stmdb sp!, { r2-r5, lr}
//Load the data
sub r2, r0, r1
sub r2, #1
vld1.32 {d0[1]}, [r2], r1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0], r1
vst2.16 {d0[2], d1[2]}, [r0], r1
vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0], r1
vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
/*
* void get_i_chroma_pred_v(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Get the top row (8 byte)
sub r2, r0, r1
vldr d0, [r2]
//Set the chroma MB using top row data
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
/*
* void get_i_chroma_pred_h(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
//stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte)
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2], r1
vld1.8 {d4[]}, [r2], r1
vld1.8 {d5[]}, [r2], r1
vld1.8 {d6[]}, [r2], r1
vld1.8 {d7[]}, [r2]
//Set the chroma MB using left column data
vst1.8 {d0}, [r0], r1
vst1.8 {d1}, [r0], r1
vst1.8 {d2}, [r0], r1
vst1.8 {d3}, [r0], r1
vst1.8 {d4}, [r0], r1
vst1.8 {d5}, [r0], r1
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
/*
* void get_i_chroma_pred_dc_both(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data (8 bytes)
sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1
//Load the top row data (8 bytes)
sub r2, r0, r1
vldr d1, [r2]
//Calculate the sum of left column and top row
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vdup.8 d4, d2[0]
vdup.8 d5, d1[4]
vdup.8 d6, d0[4]
vdup.8 d7, d2[4]
//Set the chroma MB
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0]
WELS_ASM_FUNC_END
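A scalar sketch of the chroma DC path above (helper name hypothetical): the four 4x4 quadrants take separate DC values, the m1..m4 of the comments, built from the top row and/or left column with rounding shifts of 3 or 2 bits.

#include <stdint.h>

// Hypothetical scalar sketch of WelsDecoderIChromaPredDC_neon.
static void IChromaPredDc_ref (uint8_t* pPred, const int32_t kiStride) {
  const uint8_t* pTop = pPred - kiStride;
  int32_t iTopL = 0, iTopR = 0, iLeftT = 0, iLeftB = 0;
  for (int32_t i = 0; i < 4; i++) {
    iTopL  += pTop[i];
    iTopR  += pTop[4 + i];
    iLeftT += pPred[i * kiStride - 1];
    iLeftB += pPred[(4 + i) * kiStride - 1];
  }
  const uint8_t m1 = (uint8_t) ((iTopL + iLeftT + 4) >> 3);   // top-left quadrant
  const uint8_t m2 = (uint8_t) ((iTopR + 2) >> 2);            // top-right quadrant
  const uint8_t m3 = (uint8_t) ((iLeftB + 2) >> 2);           // bottom-left quadrant
  const uint8_t m4 = (uint8_t) ((iTopR + iLeftB + 4) >> 3);   // bottom-right quadrant
  for (int32_t y = 0; y < 8; y++)
    for (int32_t x = 0; x < 8; x++)
      pPred[y * kiStride + x] = (y < 4) ? ((x < 4) ? m1 : m2) : ((x < 4) ? m3 : m4);
}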
/*
* void get_i_chroma_pred_plane(uint8_t *pred, const int32_t stride);
* r0 --- pred
* r1 --- stride
* return --- void
*/
//Table {{1,2,3,4,1,2,3,4}*17}
CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x28231e19
//Table {-3,-2,-1,0,1,2,3,4}
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data
sub r2, r0, #1
sub r2, r1
vld1.32 {d1[0]}, [r2]
add r2, #5
vld1.32 {d0[0]}, [r2]
//Load the left column data
sub r2, #5
vld1.8 {d1[4]}, [r2], r1
vld1.8 {d1[5]}, [r2], r1
vld1.8 {d1[6]}, [r2], r1
vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r2, r1
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1
vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r2]
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2
vmul.s16 q0, q1
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r2]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB
vst1.u32 {d0}, [r0], r1
//Do the same processing for each line.
mov r2, #7
loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0], r1
subs r2, #1
bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
#endif

codec/decoder/core/arm/mc_neon.S (new executable file, 1621 lines added)

File diff suppressed because it is too large.


@@ -50,6 +50,10 @@ extern "C" {
void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
#endif//X86_ASM
#if defined(HAVE_NEON)
void IdctResAddPred_neon(uint8_t *pred, const int32_t stride, int16_t *rs);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus


@@ -68,6 +68,12 @@ void WelsChromaDcIdct (int16_t* pBlock);
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
void WelsResBlockZero16x16_neon(int16_t* pBlock, int32_t iStride);
void WelsResBlockZero8x8_neon(int16_t* pBlock, int32_t iStride);
void SetNonZeroCount_neon(int16_t* pBlock, int8_t* pNonZeroCount);
#endif
#ifdef X86_ASM
void WelsResBlockZero16x16_sse2 (int16_t* pBlock, int32_t iStride);
void WelsResBlockZero8x8_sse2 (int16_t* pBlock, int32_t iStride);


@@ -107,6 +107,27 @@ void WelsDecoderI4x4LumaPredDDL_mmx (uint8_t* pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVL_mmx (uint8_t* pPred, const int32_t kiStride);
#endif//X86_ASM
#if defined(HAVE_NEON)
void WelsDecoderI16x16LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredDc_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredDDL_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredDDR_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVL_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVR_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredHU_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredHD_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredDC_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
#endif//HAVE_NEON
#if defined(__cplusplus)
}
#endif//__cplusplus


@@ -39,6 +39,7 @@
*/
#include "deblocking.h"
#include "deblocking_common.h"
#include "cpu_core.h"
namespace WelsDec {
@@ -718,6 +719,19 @@ void DeblockingInit (SDeblockingFunc* pFunc, int32_t iCpu) {
}
#endif
#if defined(HAVE_NEON)
{
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;
pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_neon;
pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_neon;
pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_neon;
pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_neon;
pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_neon;
pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_neon;
}
#endif
}
} // namespace WelsDec


@@ -1148,6 +1148,12 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_sse2;
}
#endif
#ifdef HAVE_NEON
pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon;
pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon;
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
#endif
}
void WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) {
WelsBlockInit (pBlock, 16, 16, iStride, 0);

View File

@ -655,6 +655,30 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
InitDctClipTable();
pCtx->pIdctResAddPredFunc = IdctResAddPred_c;
#if defined(HAVE_NEON)
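// Route the IDCT-residual add and the intra-prediction function pointers to their NEON implementations when NEON is enabled at build time.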
pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon;
pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon;
pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon;
pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon;
pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon;
#endif//HAVE_NEON
#if defined(X86_ASM)
if (pCtx->uiCpuFlag & WELS_CPU_MMXEXT) {

View File

@ -636,7 +636,7 @@ void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_
void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
McChromaWidthEq4_mmx,
McChromaWidthEq8_sse2
};
const int32_t kiD8x = iMvX & 0x07;
@ -651,17 +651,334 @@ void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int3
McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
}
#endif //X86_ASM
//***************************************************************************//
// NEON implementation //
//***************************************************************************//
#if defined(HAVE_NEON)
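// Thin C wrappers that dispatch on block width (16/8/4) to the width-specialized NEON assembly kernels.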
void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (16 == iWidth)
McCopyWidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if(8 == iWidth)
McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else
McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer01WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer01WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer01WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer03WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer03WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer03WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer10WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer10WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer10WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
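// Quarter-pel (1,1): interpolate the horizontal and vertical half-pel planes into 16-byte-aligned
// temporaries (stride 16, up to 16x16), then average them into the destination.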
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
}
void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
}
void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
}
void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
if (iWidth == 16)
McHorVer30WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 8)
McHorVer30WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if (iWidth == 4)
McHorVer30WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
}
void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16) {
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8){
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
}
void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
{
static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = //[x][y]
{
{McCopy_neon, McHorVer01_neon, McHorVer02_neon, McHorVer03_neon},
{McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
{McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon},
{McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
};
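// Dispatch on the quarter-pel fractional parts of the MV; the integer-pel offset is applied by the caller
// (hence the commented-out adjustment below).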
// pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
}
void McChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
{
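// Chroma MVs use 1/8-pel precision: (0,0) degenerates to a plain copy, otherwise pick the bilinear weights
// from g_kuiABCD by the 3-bit fractional parts; width-2 blocks fall back to the C paths.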
if (0 == iMvX && 0 == iMvY)
{
if(8 == iWidth)
McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else if(iWidth == 4)
McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
else //here iWidth == 2
McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);
}
else
{
const int32_t kiD8x = iMvX & 0x07;
const int32_t kiD8y = iMvY & 0x07;
if(8 == iWidth)
McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
else if(4 == iWidth)
McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
else //here iWidth == 2
McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
}
}
#endif
void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) {
pMcFunc->pMcLumaFunc = McLuma_c;
pMcFunc->pMcChromaFunc = McChroma_c;
#ifdef HAVE_NEON
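// NEON builds always install the NEON luma/chroma MC wrappers; the X86 path below still checks the runtime CPU flags.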
pMcFunc->pMcLumaFunc = McLuma_neon;
pMcFunc->pMcChromaFunc = McChroma_neon;
#endif
#if defined (X86_ASM)
if (iCpu & WELS_CPU_SSE2) {
pMcFunc->pMcLumaFunc = McLuma_sse2;
pMcFunc->pMcChromaFunc = McChroma_sse2;
}
#endif //(X86_ASM)
}