Add arm64 neon code for intraSad&Satd

2014-06-25 13:43:26 +08:00 · 2014-06-25 13:43:26 +08:00 · f0ec323e2c
commit f0ec323e2c
parent 3f333b01fd
5 changed files with 684 additions and 0 deletions
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@ -7,6 +7,7 @@
 	objects = {

 /* Begin PBXBuildFile section */
+		4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */; };
 		4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
 		4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
 		4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
@ -61,6 +62,7 @@
 /* End PBXCopyFilesBuildPhase section */

 /* Begin PBXFileReference section */
+		4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_sad_3_opt_neon_aarch64.S; path = arm64/intra_pred_sad_3_opt_neon_aarch64.S; sourceTree = "<group>"; };
 		4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
 		4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
 		4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
@ -182,6 +184,7 @@
 		4CB8F2B219235FAC005D6386 /* arm64 */ = {
 			isa = PBXGroup;
 			children = (
+				4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */,
 				4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,
 				4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,
 			);
@ -422,6 +425,7 @@
 				4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */,
 				4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */,
 				4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */,
+				4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S in Sources */,
 				4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
 				4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
 				4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,
--- a/codec/encoder/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S
+++ b/codec/encoder/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S
@ -0,0 +1,665 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+.macro LOAD_LUMA_DATA   //v0 = 16 bytes at (x0 - x1); v1 = 16 single bytes at (x0 - 1) + i*x1, i = 0..15; clobbers x7
+    sub     x7, x0, x1  //x7 = x0 - x1 (presumably the row above the block, x1 being the stride)
+    ld1     {v0.16b}, [x7]      //top
+    sub     x7, x0, #1  //x7 = address of the byte immediately before x0
+    ld1     {v1.b}[0], [x7], x1 //each lane load reads one byte, then post-increments x7 by x1
+    ld1     {v1.b}[1], [x7], x1
+    ld1     {v1.b}[2], [x7], x1
+    ld1     {v1.b}[3], [x7], x1
+    ld1     {v1.b}[4], [x7], x1
+    ld1     {v1.b}[5], [x7], x1
+    ld1     {v1.b}[6], [x7], x1
+    ld1     {v1.b}[7], [x7], x1
+    ld1     {v1.b}[8], [x7], x1
+    ld1     {v1.b}[9], [x7], x1
+    ld1     {v1.b}[10], [x7], x1
+    ld1     {v1.b}[11], [x7], x1
+    ld1     {v1.b}[12], [x7], x1
+    ld1     {v1.b}[13], [x7], x1
+    ld1     {v1.b}[14], [x7], x1
+    ld1     {v1.b}[15], [x7]    //left (last lane: no post-increment)
+.endm
+
+.macro LOAD_16X4_DATA   //Load four 16-byte rows from x2 (x2 += x3 after each row); clobbers v0, v1, v20, v21
+    //Load the p_enc data and save to "v22 ~ v25"--- 16X4 bytes, regrouped by 32-bit column word
+    ld1     {v0.16b}, [x2], x3  //row 0
+    ld1     {v1.16b}, [x2], x3  //row 1
+    ld1     {v20.16b}, [x2], x3 //row 2
+    ld1     {v21.16b}, [x2], x3 //row 3
+    trn1    v22.4s, v0.4s, v1.4s    //v22 = {row0.w0, row1.w0, row0.w2, row1.w2}
+    trn2    v23.4s, v0.4s, v1.4s    //v23 = {row0.w1, row1.w1, row0.w3, row1.w3}
+    trn1    v24.4s, v20.4s, v21.4s  //v24 = {row2.w0, row3.w0, row2.w2, row3.w2}
+    trn2    v25.4s, v20.4s, v21.4s  //v25 = {row2.w1, row3.w1, row2.w3, row3.w3}
+.endm
+
+.macro GET_16X16_V_SATD //In: v4, v5 (8 halfwords each). Out: v6, v7 = 4-point add/sub butterfly of every group of four input halfwords. v4/v5 are overwritten
+    trn1    v6.4s, v4.4s, v5.4s     //gather halfword pairs {0,1} of each group
+    trn2    v7.4s, v4.4s, v5.4s     //gather halfword pairs {2,3} of each group
+    add     v4.8h, v6.8h, v7.8h     //stage 1 sums: a0+a2, a1+a3
+    sub     v5.8h, v6.8h, v7.8h     //stage 1 differences: a0-a2, a1-a3
+    trn1    v6.8h, v4.8h, v5.8h
+    trn2    v7.8h, v4.8h, v5.8h
+    add     v4.8h, v6.8h, v7.8h     //stage 2 sums
+    sub     v5.8h, v6.8h, v7.8h     //stage 2 differences
+    trn1    v6.4s, v4.4s, v5.4s     //v6 = results for input halfwords 0..7 (from original v4)
+    trn2    v7.4s, v4.4s, v5.4s     //{0,1,3,2, 4,5,7,6} v6 {8,9,11,10, 12,13,15,14} v7
+.endm
+
+.macro GET_16X16_H_SATD //Same butterfly as GET_16X16_V_SATD, but results go to v16, v17 (temporaries v16/v17 instead of v6/v7). v4/v5 are overwritten
+    trn1    v16.4s, v4.4s, v5.4s    //gather halfword pairs {0,1} of each group of four
+    trn2    v17.4s, v4.4s, v5.4s    //gather halfword pairs {2,3} of each group of four
+    add     v4.8h, v16.8h, v17.8h   //stage 1 sums
+    sub     v5.8h, v16.8h, v17.8h   //stage 1 differences
+    trn1    v16.8h, v4.8h, v5.8h
+    trn2    v17.8h, v4.8h, v5.8h
+    add     v4.8h, v16.8h, v17.8h   //stage 2 sums
+    sub     v5.8h, v16.8h, v17.8h   //stage 2 differences
+    trn1    v16.4s, v4.4s, v5.4s    //v16 = results for input halfwords 0..7 (from original v4)
+    trn2    v17.4s, v4.4s, v5.4s    //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
+.endm
+
+#ifdef __APPLE__
+.macro SELECT_BEST_COST //$0 = min($0, w1, w2) (unsigned); w7 = index of the winner: 0 -> $0, 1 -> w1, 2 -> w2. Clobbers w6 and the flags
+    cmp     w1, $0
+    csel    $0, $0, w1, hs  //keep $0 when w1 >= $0, so a tie favours index 0
+    cset    w7, lo          //w7 = 1 when w1 < $0, else 0
+    cmp     w2, $0          //compare w2 with the running minimum
+    mov     w6, #2          //mov does not touch the flags set by the cmp above
+    csel    $0, $0, w2, hs  //keep the running minimum when w2 >= it (tie favours the earlier index)
+    csel    w7, w7, w6, hs  //w7 = 2 only when w2 is strictly smaller
+.endm
+
+.macro LOAD_CHROMA_DATA //$0 = base pointer, $1 = 8-byte vector operand (e.g. v0.8b), $2 = byte-lane operand of the same register (e.g. v0.b); uses x1 as step; clobbers x9
+    sub     x9, $0, x1
+    ld1     {$1}, [x9]      //top: 8 bytes at ($0 - x1) into the low half (this form of ld1 clears the high half)
+    sub     x9, $0, #1      //x9 = address of the byte immediately before $0
+    ld1     {$2}[8], [x9], x1   //lanes 8..15 receive one byte each; x9 += x1 after every load
+    ld1     {$2}[9], [x9], x1
+    ld1     {$2}[10], [x9], x1
+    ld1     {$2}[11], [x9], x1
+    ld1     {$2}[12], [x9], x1
+    ld1     {$2}[13], [x9], x1
+    ld1     {$2}[14], [x9], x1
+    ld1     {$2}[15], [x9], x1 //left: last of the 8 bytes at ($0 - 1) + i*x1
+.endm
+
+.macro LOAD_8X4_DATA    //$0 = source pointer register (advanced by 4*x3); clobbers v0, v1, v2
+    //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes (v20 = rows 0-1, v21 = rows 2-3)
+    ld1     {v0.8b}, [$0], x3       //row 0 -> v0 low half
+    ld1     {v1.8b}, [$0], x3       //row 1 -> v1 low half
+    ld1     {v0.d}[1], [$0], x3     //row 2 -> v0 high half
+    ld1     {v1.d}[1], [$0], x3     //row 3 -> v1 high half
+    trn1    v2.4s, v0.4s, v1.4s     //v2 = {row0.w0, row1.w0, row2.w0, row3.w0}
+    trn2    v1.4s, v0.4s, v1.4s     //v1 = {row0.w1, row1.w1, row2.w1, row3.w1}
+    trn1    v20.2d, v2.2d, v1.2d    //v20 = {row0.w0, row1.w0, row0.w1, row1.w1}
+    trn2    v21.2d, v2.2d, v1.2d    //v21 = {row2.w0, row3.w0, row2.w1, row3.w1}
+.endm
+
+.macro HDM_TRANSFORM_4X4_L0 //Add/sub-transform one 4x4 byte block and accumulate absolute differences against three sets of predictor coefficients. Clobbers v0, v1, v3, v4, v5
+    //Operands: $0, $1 = source byte vectors; $2, $3, $4 = coefficient vectors compared in the _v, _h and dc sections;
+    //$5, $6, $7 = the matching .4s accumulators; $8 = bare name of a register the callers keep zeroed; $9 = l or l2 (low / high half of $0, $1)
+    //Do the vertical transform
+    uadd$9   v0.8h, $0, $1      //widening sum of the two source vectors
+    usub$9   v1.8h, $0, $1      //widening difference of the two source vectors
+    trn1    v3.2d, v0.2d, v1.2d
+    trn2    v1.2d, v0.2d, v1.2d
+    add     v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
+    sub     v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
+
+    //Do the horizontal transform
+    trn1    v0.4s, v4.4s, v5.4s
+    trn2    v1.4s, v4.4s, v5.4s
+    add     v4.8h, v0.8h, v1.8h
+    sub     v5.8h, v0.8h, v1.8h
+    trn1    v0.8h, v4.8h, v5.8h
+    trn2    v1.8h, v4.8h, v5.8h
+    add     v4.8h, v0.8h, v1.8h
+    sub     v5.8h, v0.8h, v1.8h
+
+    //16x16_v
+    trn1    v0.2s, v4.2s, v5.2s
+    trn2    v1.2s, v4.2s, v5.2s
+    sabal   $5, v0.4h, $2       //four coefficients are measured against $2 ...
+    sabal   $5, v1.4h, $8.4h    //... the remaining twelve against zero, i.e. their absolute value
+    sabal2  $5, v4.8h, $8.8h
+    sabal2  $5, v5.8h, $8.8h
+
+    //16x16_h
+    ins     v3.d[0], v4.d[1]
+    trn1    v0.4h, v4.4h, v3.4h
+    trn2    v1.4h, v4.4h, v3.4h
+    sabal   $6, v0.4h, $3       //four coefficients are measured against $3
+    sabdl   v4.4s, v1.4h, $8.4h //v4 = absolute values of the other twelve (overwrites v4 after its last read)
+    sabal   v4.4s, v5.4h, $8.4h
+    sabal2  v4.4s, v5.8h, $8.8h
+    add     $6, $6, v4.4s
+
+    //16x16_dc_both
+    sabal   $7, v0.4h, $4       //same four coefficients as the _h case, measured against $4
+    add     $7, $7, v4.4s       //reuse the twelve absolute values gathered above
+.endm
+#else
+.macro SELECT_BEST_COST arg0    //arg0 = min(arg0, w1, w2) (unsigned); w7 = index of the winner: 0 -> arg0, 1 -> w1, 2 -> w2. Clobbers w6 and the flags
+    cmp     w1, \arg0
+    csel    \arg0, \arg0, w1, hs    //keep arg0 when w1 >= arg0, so a tie favours index 0
+    cset    w7, lo                  //w7 = 1 when w1 < arg0, else 0
+    cmp     w2, \arg0               //compare w2 with the running minimum
+    mov     w6, #2                  //mov does not touch the flags set by the cmp above
+    csel    \arg0, \arg0, w2, hs    //keep the running minimum when w2 >= it (tie favours the earlier index)
+    csel    w7, w7, w6, hs          //w7 = 2 only when w2 is strictly smaller
+.endm
+
+.macro LOAD_CHROMA_DATA arg0, arg1, arg2    //arg0 = base pointer, arg1 = 8-byte vector operand (e.g. v0.8b), arg2 = byte-lane operand of the same register (e.g. v0.b); uses x1 as step; clobbers x9
+    sub     x9, \arg0, x1
+    ld1     {\arg1}, [x9]      //top: 8 bytes at (arg0 - x1) into the low half (this form of ld1 clears the high half)
+    sub     x9, \arg0, #1      //GNU macro syntax: the parameter is \arg0 here ($0 is only valid in the Apple variant)
+    ld1     {\arg2}[8], [x9], x1   //lanes 8..15 receive one byte each; x9 += x1 after every load
+    ld1     {\arg2}[9], [x9], x1
+    ld1     {\arg2}[10], [x9], x1
+    ld1     {\arg2}[11], [x9], x1
+    ld1     {\arg2}[12], [x9], x1
+    ld1     {\arg2}[13], [x9], x1
+    ld1     {\arg2}[14], [x9], x1
+    ld1     {\arg2}[15], [x9], x1 //left: last of the 8 bytes at (arg0 - 1) + i*x1
+.endm
+
+.macro LOAD_8X4_DATA arg0   //arg0 = source pointer register (advanced by 4*x3); clobbers v0, v1, v2
+    //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes (v20 = rows 0-1, v21 = rows 2-3)
+    ld1     {v0.8b}, [\arg0], x3    //row 0 -> v0 low half
+    ld1     {v1.8b}, [\arg0], x3    //row 1 -> v1 low half
+    ld1     {v0.d}[1], [\arg0], x3  //row 2 -> v0 high half
+    ld1     {v1.d}[1], [\arg0], x3  //row 3 -> v1 high half
+    trn1    v2.4s, v0.4s, v1.4s     //v2 = {row0.w0, row1.w0, row2.w0, row3.w0}
+    trn2    v1.4s, v0.4s, v1.4s     //v1 = {row0.w1, row1.w1, row2.w1, row3.w1}
+    trn1    v20.2d, v2.2d, v1.2d    //v20 = {row0.w0, row1.w0, row0.w1, row1.w1}
+    trn2    v21.2d, v2.2d, v1.2d    //v21 = {row2.w0, row3.w0, row2.w1, row3.w1}
+.endm
+
+.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+    //Add/sub-transform one 4x4 byte block and accumulate absolute differences against three sets of predictor coefficients.
+    //arg0, arg1 = source byte vectors; arg2, arg3, arg4 = coefficient vectors compared in the _v, _h and dc sections;
+    //arg5, arg6, arg7 = the matching .4s accumulators; arg8 = bare name of a register the callers keep zeroed;
+    //arg9 = l or l2 (low / high half of arg0, arg1). Clobbers v0, v1, v3, v4, v5.
+    //Do the vertical transform
+    uadd\arg9\()   v0.8h, \arg0, \arg1  //widening sum of the two source vectors
+    usub\arg9\()   v1.8h, \arg0, \arg1  //widening difference of the two source vectors
+    trn1    v3.2d, v0.2d, v1.2d
+    trn2    v1.2d, v0.2d, v1.2d
+    add     v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
+    sub     v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
+
+    //Do the horizontal transform
+    trn1    v0.4s, v4.4s, v5.4s
+    trn2    v1.4s, v4.4s, v5.4s
+    add     v4.8h, v0.8h, v1.8h
+    sub     v5.8h, v0.8h, v1.8h
+    trn1    v0.8h, v4.8h, v5.8h
+    trn2    v1.8h, v4.8h, v5.8h
+    add     v4.8h, v0.8h, v1.8h
+    sub     v5.8h, v0.8h, v1.8h
+
+    //16x16_v
+    trn1    v0.2s, v4.2s, v5.2s
+    trn2    v1.2s, v4.2s, v5.2s
+    sabal   \arg5, v0.4h, \arg2         //four coefficients are measured against arg2 ...
+    sabal   \arg5, v1.4h, \arg8\().4h   //... the remaining twelve against zero, i.e. their absolute value
+    sabal2  \arg5, v4.8h, \arg8\().8h
+    sabal2  \arg5, v5.8h, \arg8\().8h
+
+    //16x16_h
+    ins     v3.d[0], v4.d[1]
+    trn1    v0.4h, v4.4h, v3.4h
+    trn2    v1.4h, v4.4h, v3.4h
+    sabal   \arg6, v0.4h, \arg3         //four coefficients are measured against arg3
+    sabdl   v4.4s, v1.4h, \arg8\().4h   //v4 = absolute values of the other twelve (overwrites v4 after its last read)
+    sabal   v4.4s, v5.4h, \arg8\().4h
+    sabal2  v4.4s, v5.8h, \arg8\().8h
+    add     \arg6, \arg6, v4.4s
+
+    //16x16_dc_both
+    sabal   \arg7, v0.4h, \arg4         //same four coefficients as the _h case, measured against arg4
+    add     \arg7, \arg7, v4.4s         //reuse the twelve absolute values gathered above
+.endm
+#endif
+
+// WelsIntra8x8Combined3Sad_AArch64_neon
+// Computes the SAD of two 8x8 source blocks (presumably Cb and Cr, per the
+// comments in the helper macros) against three predictions built from the
+// neighbours of the matching reference blocks: DC, left (H) and top (V).
+// In:  x0 = first reference block,  x7 = second reference block, w1 = their stride
+//      x2 = first source block,     [sp] = second source block,  w3 = their stride
+//      x4 = int32_t* receiving the winning mode index (0 = DC, 1 = left, 2 = top)
+//      w5 = lambda; 2*lambda is added to the left and top costs. x6 is not used.
+// Out: w0 = smallest cost (ties keep the lower mode index).
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
+    // The strides are int32_t arguments: the upper 32 bits of x1/x3 are
+    // unspecified on entry, so sign-extend before using them in addresses.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ldr     x11, [sp, #0]   // 9th argument is passed on the stack
+
+    LOAD_CHROMA_DATA x0, v0.8b, v0.b    // v0 = {top[0..7], left[0..7]} of the x0 block
+
+    uaddlp  v1.8h, v0.16b
+    uaddlp  v2.4s, v1.8h    // v2 = {sum top0-3, sum top4-7, sum left0-3, sum left4-7}
+    ins     v3.d[0], v2.d[1]
+    add     v3.2s, v2.2s, v3.2s // v3 = {top0-3 + left0-3, top4-7 + left4-7}
+    urshr   v2.4s, v2.4s, #2    // rounded mean of 4 samples
+    urshr   v3.2s, v3.2s, #3    // rounded mean of 8 samples
+
+    dup     v20.8b, v3.b[0]
+    dup     v21.8b, v2.b[4]
+    dup     v22.8b, v2.b[12]
+    dup     v23.8b, v3.b[4]
+    ins     v20.s[1], v21.s[0]  // v20 = DC row for rows 0-3 {left 4 cols, right 4 cols}
+    ins     v22.s[1], v23.s[0]  // v22 = DC row for rows 4-7
+
+    LOAD_CHROMA_DATA x7, v4.8b, v4.b    // same for the x7 block, into v4
+
+    uaddlp  v5.8h, v4.16b
+    uaddlp  v6.4s, v5.8h
+    ins     v7.d[0], v6.d[1]
+    add     v7.2s, v6.2s, v7.2s
+    urshr   v6.4s, v6.4s, #2
+    urshr   v7.2s, v7.2s, #3
+
+    dup     v24.8b, v7.b[0]
+    dup     v25.8b, v6.b[4]
+    dup     v26.8b, v6.b[12]
+    dup     v27.8b, v7.b[4]
+    ins     v24.s[1], v25.s[0]  // v24 = DC row for rows 0-3 of the x7 block
+    ins     v26.s[1], v27.s[0]  // v26 = DC row for rows 4-7 of the x7 block
+
+    sub     x9, x0, #1      // x9 / x10 walk down the left-neighbour columns
+    sub     x10, x7, #1
+
+    // Row 0 initialises the three accumulators (uabdl), later rows accumulate (uabal).
+    ld1     {v3.8b}, [x2], x3
+    ld1     {v5.8b}, [x11], x3
+
+    ld1r    {v6.8b}, [x9], x1
+    ld1r    {v7.8b}, [x10], x1
+
+    uabdl   v29.8h, v0.8b, v3.8b
+    uabal   v29.8h, v4.8b, v5.8b   //top
+
+    uabdl   v30.8h, v6.8b, v3.8b
+    uabal   v30.8h, v7.8b, v5.8b   //left
+
+    uabdl   v31.8h, v20.8b, v3.8b
+    uabal   v31.8h, v24.8b, v5.8b   //Dc
+.rept 3
+    ld1     {v3.8b}, [x2], x3
+    ld1     {v5.8b}, [x11], x3
+
+    ld1r    {v6.8b}, [x9], x1
+    ld1r    {v7.8b}, [x10], x1
+
+    uabal   v29.8h, v0.8b, v3.8b
+    uabal   v29.8h, v4.8b, v5.8b   //top
+
+    uabal   v30.8h, v6.8b, v3.8b
+    uabal   v30.8h, v7.8b, v5.8b   //left
+
+    uabal   v31.8h, v20.8b, v3.8b
+    uabal   v31.8h, v24.8b, v5.8b   //Dc
+.endr
+
+    // Rows 4-7 use the lower pair of DC values (v22 / v26).
+.rept 4
+    ld1     {v3.8b}, [x2], x3
+    ld1     {v5.8b}, [x11], x3
+
+    ld1r    {v6.8b}, [x9], x1
+    ld1r    {v7.8b}, [x10], x1
+
+    uabal   v29.8h, v0.8b, v3.8b
+    uabal   v29.8h, v4.8b, v5.8b   //top
+
+    uabal   v30.8h, v6.8b, v3.8b
+    uabal   v30.8h, v7.8b, v5.8b   //left
+
+    uabal   v31.8h, v22.8b, v3.8b
+    uabal   v31.8h, v26.8b, v5.8b   //Dc
+.endr
+
+    saddlv  s29, v29.8h
+    fmov    w2, s29
+    add     w2, w2, w5, lsl #1  // top cost + 2*lambda
+    saddlv  s30, v30.8h
+    fmov    w1, s30
+    add     w1, w1, w5, lsl #1  // left cost + 2*lambda
+    saddlv  s31, v31.8h
+    fmov    w0, s31             // DC cost, no penalty
+
+    SELECT_BEST_COST w0
+
+    str     w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
+// WelsIntra16x16Combined3Sad_AArch64_neon
+// Computes the SAD of a 16x16 source block against three predictions built
+// from the neighbours of a reference block: top (V), left (H) and DC.
+// In:  x0 = reference block, w1 = its stride, x2 = source block, w3 = its stride
+//      x4 = int32_t* receiving the winning mode index (0 = top, 1 = left, 2 = DC)
+//      w5 = lambda; 2*lambda is added to the left and DC costs.
+// Out: w0 = smallest cost (ties keep the lower mode index).
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon
+    // The strides are int32_t arguments: the upper 32 bits of x1/x3 are
+    // unspecified on entry, so sign-extend before using them in addresses.
+    sxtw    x1, w1
+    sxtw    x3, w3
+
+    LOAD_LUMA_DATA              // v0 = top row, v1 = left column
+
+    uaddlv    h2, v0.16b
+    uaddlv    h3, v1.16b
+    add       v2.8h, v2.8h, v3.8h
+    uqrshrn   b2, h2, #5        // rounded mean of the 32 neighbours
+    dup       v2.16b, v2.b[0]   //Dc
+
+    sub     x7, x0, #1          // x7 walks down the left-neighbour column
+    ld1     {v3.16b}, [x2], x3
+    ld1r    {v4.16b}, [x7], x1
+
+    // Row 0 initialises the accumulators (uabdl); the loop adds the other 15 rows.
+    uabdl   v29.8h, v0.8b, v3.8b
+    uabal2  v29.8h, v0.16b,v3.16b   //top
+
+    uabdl   v30.8h, v4.8b, v3.8b
+    uabal2  v30.8h, v4.16b,v3.16b   //left
+
+    uabdl   v31.8h, v2.8b, v3.8b
+    uabal2  v31.8h, v2.16b,v3.16b   //Dc
+    mov     x6, #15
+sad_intra_16x16_x3_opt_loop0:
+    ld1     {v3.16b}, [x2], x3
+    ld1r    {v4.16b}, [x7], x1
+
+    uabal   v29.8h, v0.8b, v3.8b
+    uabal2  v29.8h, v0.16b,v3.16b   //top
+
+    uabal   v30.8h, v4.8b, v3.8b
+    uabal2  v30.8h, v4.16b,v3.16b   //left
+
+    uabal   v31.8h, v2.8b, v3.8b
+    uabal2  v31.8h, v2.16b,v3.16b   //Dc
+    sub     x6, x6, #1
+    cbnz    x6,  sad_intra_16x16_x3_opt_loop0
+
+    saddlv  s29, v29.8h
+    fmov    w0, s29             // top cost, no penalty
+    saddlv  s30, v30.8h
+    fmov    w1, s30
+    add     w1, w1, w5, lsl #1  // left cost + 2*lambda
+    saddlv  s31, v31.8h
+    fmov    w2, s31
+    add     w2, w2, w5, lsl #1  // DC cost + 2*lambda
+
+    SELECT_BEST_COST w0
+
+    str     w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
+// WelsIntra4x4Combined3Satd_AArch64_neon
+// Computes a transform-domain cost of a 4x4 source block against three
+// predictions built from the neighbours of a reference block (top/V, left/H,
+// DC), stores the winning mode index and writes the winning 4x4 prediction.
+// In:  x0 = reference block, w1 = its stride, x2 = source block, w3 = its stride
+//      x4 = 16-byte destination for the chosen prediction (4 rows of 4 bytes)
+//      x5 = int32_t* receiving the mode index (0 = top, 1 = left, 2 = DC)
+//      w6 = value added to the DC cost, w7 = value added to the left cost,
+//      [sp] = value added to the top cost
+// Out: w0 = smallest cost (ties keep the lower mode index).
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
+    // The strides are int32_t arguments: the upper 32 bits of x1/x3 are
+    // unspecified on entry, so sign-extend before using them in addresses.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    sub     x9, x0, x1
+    ld1     {v16.s}[0], [x9]      //top
+    sub     x9, x0, #1
+    ld1     {v16.b}[4], [x9], x1  //left, one byte per row
+    ld1     {v16.b}[5], [x9], x1
+    ld1     {v16.b}[6], [x9], x1
+    ld1     {v16.b}[7], [x9], x1
+
+
+    uaddlv  h2, v16.8b
+    uqrshrn b17, h2, #3         // v17.b[0] = rounded mean of the 8 neighbours (DC sample)
+    urshr   v2.4h, v2.4h, #3
+    shl     v2.4h, v2.4h, #4    // v2.h[0] = DC * 16, other lanes are zero
+
+    //Calculate the 4x4_v 4x4_h mode SATD and save to "v6, v7"
+    ushll   v4.8h, v16.8b, #2
+    ins     v5.d[0], v4.d[1]
+    trn1    v6.2s, v4.2s, v5.2s
+    trn2    v7.2s, v4.2s, v5.2s
+
+    add     v4.4h, v6.4h, v7.4h
+    sub     v5.4h, v6.4h, v7.4h
+    trn1    v6.4h, v4.4h, v5.4h
+    trn2    v7.4h, v4.4h, v5.4h
+    add     v4.4h, v6.4h, v7.4h
+    sub     v5.4h, v6.4h, v7.4h
+    trn1    v6.2s, v4.2s, v5.2s
+    trn2    v7.2s, v4.2s, v5.2s     //{0,1,3,2,top} v6 {0,1,3,2,left} v7
+
+    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
+    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
+    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
+    eor     v28.16b, v28.16b, v28.16b  //For zero register
+
+    //Load the p_enc data and save to "v22 ~ v23"--- 4X4 bytes (v22 = rows 0-1, v23 = rows 2-3)
+    ld1     {v22.s}[0], [x2], x3
+    ld1     {v22.s}[1], [x2], x3
+    ld1     {v23.s}[0], [x2], x3
+    ld1     {v23.s}[1], [x2], x3
+
+    HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+
+    ldr     x11, [sp, #0]       // 9th argument is passed on the stack; only w11 is used
+    urshr   v29.4s, v29.4s, #1
+    addv    s29, v29.4s
+    fmov    w0, s29
+    add     w0, w0, w11         // top cost
+
+    urshr   v30.4s, v30.4s, #1
+    addv    s30, v30.4s
+    fmov    w1, s30
+    add     w1, w1, w7          // left cost
+
+    urshr   v31.4s, v31.4s, #1
+    addv    s31, v31.4s
+    fmov    w2, s31
+    add     w2, w2, w6          // DC cost
+
+    mov     w10, w0
+    SELECT_BEST_COST w10        // w10 = best cost, w7 = mode index (w6/w7 inputs are dead by now)
+
+    str     w7, [x5]
+
+    // Pick the prediction from the mode index rather than by comparing cost
+    // values: with equal costs a value comparison could write a prediction
+    // that does not match the mode index just stored.
+    cmp     w7, #2
+    b.ne    satd_intra_4x4_x3_opt_jump0
+    dup     v0.16b, v17.b[0]    // DC: all 16 samples equal
+    st1     {v0.16b}, [x4]
+    b       satd_intra_4x4_x3_opt_end
+
+satd_intra_4x4_x3_opt_jump0:
+    cmp     w7, #1
+    b.ne    satd_intra_4x4_x3_opt_jump1
+    dup     v0.16b, v16.b[4]    // left: row i is filled with left[i]
+    dup     v1.16b, v16.b[5]
+    dup     v2.16b, v16.b[6]
+    dup     v3.16b, v16.b[7]
+    st4     {v0.s,v1.s,v2.s,v3.s}[0], [x4]
+    b       satd_intra_4x4_x3_opt_end
+
+satd_intra_4x4_x3_opt_jump1:
+    st1     {v16.S}[0], [x4], #4    // top: every row is a copy of the top row
+    st1     {v16.S}[0], [x4], #4
+    st1     {v16.S}[0], [x4], #4
+    st1     {v16.S}[0], [x4]
+satd_intra_4x4_x3_opt_end:
+    mov     w0, w10
+
+WELS_ASM_ARCH64_FUNC_END
+
+// WelsIntra8x8Combined3Satd_AArch64_neon
+// Transform-domain counterpart of WelsIntra8x8Combined3Sad_AArch64_neon: for two
+// 8x8 source blocks (presumably Cb and Cr) it accumulates the cost of the
+// DC, left (H) and top (V) predictions over all eight 4x4 sub-blocks.
+// In:  x0 = first reference block,  x7 = second reference block, w1 = their stride
+//      x2 = first source block,     [sp] = second source block,  w3 = their stride
+//      x4 = int32_t* receiving the winning mode index (0 = DC, 1 = left, 2 = top)
+//      w5 = lambda; 2*lambda is added to the left and top costs. x6 is not used.
+// Out: w0 = smallest cost (ties keep the lower mode index).
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
+    // The strides are int32_t arguments: the upper 32 bits of x1/x3 are
+    // unspecified on entry, so sign-extend before using them in addresses.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ldr     x11, [sp, #0]   // 9th argument is passed on the stack
+
+    LOAD_CHROMA_DATA x0, v0.8b, v0.b    // v0 = {top[0..7], left[0..7]} of the x0 block
+
+    LOAD_CHROMA_DATA x7, v1.8b, v1.b    // v1 = same for the x7 block
+
+    //Transform the two top rows (V prediction) and save to "v6, v7" (v6: x0 block, v7: x7 block)
+    ushll   v4.8h, v0.8b, #2
+    ushll   v5.8h, v1.8b, #2
+    GET_16X16_V_SATD
+
+    //Transform the two left columns (H prediction) and save to "v16, v17" (v16: x0 block, v17: x7 block)
+    ushll2  v4.8h, v0.16b, #2
+    ushll2  v5.8h, v1.16b, #2
+    GET_16X16_H_SATD
+
+    uaddlp  v0.8h, v0.16b
+    uaddlp  v2.4s, v0.8h    // v2 = {sum top0-3, sum top4-7, sum left0-3, sum left4-7}
+    ins     v3.d[0], v2.d[1]
+    add     v3.2s, v2.2s, v3.2s // v3 = {top0-3 + left0-3, top4-7 + left4-7}
+
+    uaddlp  v1.8h, v1.16b
+    uaddlp  v4.4s, v1.8h
+    ins     v5.d[0], v4.d[1]
+    add     v5.2s, v4.2s, v5.2s // same sums for the x7 block
+
+    trn2    v0.4s, v2.4s, v4.4s // {top4-7, top4-7', left4-7, left4-7'} (' = x7 block)
+    urshr   v0.4s, v0.4s, #2
+    urshr   v3.2s, v3.2s, #3
+    urshr   v5.2s, v5.2s, #3
+
+    // DC values scaled by 16, one per 64-bit lane so that only halfword 0 is non-zero
+    ushll   v22.2d, v0.2s, #4    //{1cb, 1cr}
+    ushll2  v23.2d, v0.4s, #4    //{2cb, 2cr}
+    ushll   v24.2d, v3.2s, #4   //{0cb, 3cb}
+    ushll   v25.2d, v5.2s, #4   //{0cr, 3cr}
+
+    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
+    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
+    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
+    eor     v28.16b, v28.16b, v28.16b  //For zero register
+
+    // Expose the upper halves of the transformed neighbours as .4h operands
+    ins     v18.d[0], v6.d[1]
+    ins     v19.d[0], v7.d[1]
+    ins     v26.d[0], v16.d[1]
+    ins     v27.d[0], v17.d[1]
+
+    LOAD_8X4_DATA x2    // x2 block, rows 0-3
+
+    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+    LOAD_8X4_DATA x11   // x11 block, rows 0-3
+
+    ins     v22.d[0], v22.d[1]  // bring the second block's DC value into the low lane
+    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+    LOAD_8X4_DATA x2    // x2 block, rows 4-7
+
+    ins     v24.d[0], v24.d[1]
+    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+    LOAD_8X4_DATA x11   // x11 block, rows 4-7
+
+    ins     v23.d[0], v23.d[1]
+    ins     v25.d[0], v25.d[1]
+    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+    urshr   v29.4s, v29.4s, #1
+    addv    s29, v29.4s
+    fmov    w2, s29
+    add     w2, w2, w5, lsl #1  // top cost + 2*lambda
+
+    urshr   v30.4s, v30.4s, #1
+    addv    s30, v30.4s
+    fmov    w1, s30
+    add     w1, w1, w5, lsl #1  // left cost + 2*lambda
+
+    urshr   v31.4s, v31.4s, #1
+    addv    s31, v31.4s
+    fmov    w0, s31             // DC cost, no penalty
+
+    SELECT_BEST_COST w0
+
+    str     w7, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
+
+// WelsIntra16x16Combined3Satd_AArch64_neon
+// Transform-domain counterpart of WelsIntra16x16Combined3Sad_AArch64_neon: accumulates
+// the cost of the top (V), left (H) and DC predictions over the sixteen 4x4
+// sub-blocks of a 16x16 source block.
+// In:  x0 = reference block, w1 = its stride, x2 = source block, w3 = its stride
+//      x4 = int32_t* receiving the winning mode index (0 = top, 1 = left, 2 = DC)
+//      w5 = lambda; 2*lambda is added to the left and DC costs.
+// Out: w0 = smallest cost (ties keep the lower mode index).
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
+    // The strides are int32_t arguments: the upper 32 bits of x1/x3 are
+    // unspecified on entry, so sign-extend before using them in addresses.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD_LUMA_DATA              // v0 = top row, v1 = left column
+
+    uaddlv  h2, v0.16b
+    uaddlv  h3, v1.16b
+    add     v2.8h, v2.8h, v3.8h
+    urshr   v2.4h, v2.4h, #5    // rounded mean of the 32 neighbours
+    shl     v2.4h, v2.4h, #4    // v2.h[0] = DC * 16, other lanes are zero
+
+    //Calculate the 16x16_v mode SATD and save to "v6, v7"
+    ushll   v4.8h, v0.8b, #2
+    ushll2  v5.8h, v0.16b, #2
+    GET_16X16_V_SATD
+
+    //Calculate the 16x16_h mode SATD and save to "v16, v17"
+    ushll   v4.8h, v1.8b, #2
+    ushll2  v5.8h, v1.16b, #2
+    GET_16X16_H_SATD
+
+    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
+    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
+    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
+    eor     v28.16b, v28.16b, v28.16b  //For zero register
+
+    // Expose the upper halves of the transformed neighbours as .4h operands
+    ins     v18.d[0], v6.d[1]
+    ins     v19.d[0], v7.d[1]
+    ins     v26.d[0], v16.d[1]
+    ins     v27.d[0], v17.d[1]
+
+    // Each LOAD_16X4_DATA fetches four rows; the four macro calls after it
+    // handle columns 0-3, 8-11, 4-7 and 12-15 of those rows.
+    LOAD_16X4_DATA      // rows 0-3, left coefficients in v16
+
+    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+    LOAD_16X4_DATA      // rows 4-7, left coefficients in v26
+
+    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+    LOAD_16X4_DATA      // rows 8-11, left coefficients in v17
+
+    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+    LOAD_16X4_DATA      // rows 12-15, left coefficients in v27
+
+    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
+    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
+
+    urshr   v29.4s, v29.4s, #1
+    addv    s29, v29.4s
+    fmov    w0, s29             // top cost, no penalty
+
+    urshr   v30.4s, v30.4s, #1
+    addv    s30, v30.4s
+    fmov    w1, s30
+    add     w1, w1, w5, lsl #1  // left cost + 2*lambda
+
+    urshr   v31.4s, v31.4s, #1
+    addv    s31, v31.4s
+    fmov    w2, s31
+    add     w2, w2, w5, lsl #1  // DC cost + 2*lambda
+
+    SELECT_BEST_COST w0
+
+    str     w7, [x4]
+
+WELS_ASM_ARCH64_FUNC_END
+
+#endif
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@ -108,6 +108,14 @@ int32_t WelsSampleSatd16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
 int32_t WelsSampleSatd16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
 int32_t WelsSampleSatd8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
 int32_t WelsSampleSatd8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsIntra16x16Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*); /* (reference block, its stride, source block, its stride, best-mode out, lambda, not read by the asm); returns the best cost */
+int32_t WelsIntra16x16Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*); /* same argument layout as the Satd variant above */
+int32_t WelsIntra8x8Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, /* (1st reference block, stride, 1st source block, stride, best-mode out, lambda, not read by the asm, 2nd reference block, */
+                                            uint8_t*); /* 2nd source block -- the 9th argument, read from the stack by the asm) */
+int32_t WelsIntra8x8Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, /* same argument layout as the 8x8 Satd variant above */
+                                           uint8_t*);
+int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, /* (reference block, stride, source block, stride, 16-byte prediction out, best-mode out, DC-cost addend, left-cost addend, */
+                                            int32_t); /* top-cost addend -- the 9th argument, read from the stack by the asm) */
 #endif
 #if defined(__cplusplus)
 }
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@ -433,6 +433,12 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_AArch64_neon;
    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_AArch64_neon;
    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_AArch64_neon;
+
+    pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd   = WelsIntra4x4Combined3Satd_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd   = WelsIntra8x8Combined3Satd_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad    = WelsIntra8x8Combined3Sad_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad  = WelsIntra16x16Combined3Sad_AArch64_neon;
  }
 #endif
 }
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@ -60,6 +60,7 @@ endif
 ifeq ($(ASM_ARCH), arm64)
 ENCODER_ASM_ARM64_SRCS=\
 	$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
+	$(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S\
 	$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\

 ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))