Add arm64 neon code for intraSad&Satd

This commit is contained in:
zhiliang wang 2014-06-25 13:43:26 +08:00
parent 3f333b01fd
commit f0ec323e2c
5 changed files with 684 additions and 0 deletions

View File

@ -7,6 +7,7 @@
objects = {
/* Begin PBXBuildFile section */
4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */; };
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
@ -61,6 +62,7 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_sad_3_opt_neon_aarch64.S; path = arm64/intra_pred_sad_3_opt_neon_aarch64.S; sourceTree = "<group>"; };
4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
@ -182,6 +184,7 @@
4CB8F2B219235FAC005D6386 /* arm64 */ = {
isa = PBXGroup;
children = (
4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */,
4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,
4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,
);
@ -422,6 +425,7 @@
4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */,
4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */,
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */,
4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S in Sources */,
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,

View File

@ -0,0 +1,665 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
// LOAD_LUMA_DATA
// Collects the border pixels of the 16x16 block at x0 (row stride x1):
//   v0 = the 16 bytes of the row just above the block (address x0 - x1)
//   v1 = the 16 bytes of the column just left of it (address x0 - 1, one byte per row)
// x7 is used as the scratch pointer; x0 and x1 are not modified.
.macro LOAD_LUMA_DATA
sub x7, x0, x1
ld1 {v0.16b}, [x7] //top
sub x7, x0, #1
// One single-lane load per row; the post-index by x1 steps down the left column.
ld1 {v1.b}[0], [x7], x1
ld1 {v1.b}[1], [x7], x1
ld1 {v1.b}[2], [x7], x1
ld1 {v1.b}[3], [x7], x1
ld1 {v1.b}[4], [x7], x1
ld1 {v1.b}[5], [x7], x1
ld1 {v1.b}[6], [x7], x1
ld1 {v1.b}[7], [x7], x1
ld1 {v1.b}[8], [x7], x1
ld1 {v1.b}[9], [x7], x1
ld1 {v1.b}[10], [x7], x1
ld1 {v1.b}[11], [x7], x1
ld1 {v1.b}[12], [x7], x1
ld1 {v1.b}[13], [x7], x1
ld1 {v1.b}[14], [x7], x1
ld1 {v1.b}[15], [x7] //left
.endm
// LOAD_16X4_DATA
// Reads four 16-byte rows from x2 (post-incremented by x3 after each row) and
// regroups them in 4-byte column groups so that each 64-bit half covers one 4x4 block:
//   v22 = { row0[0..3],  row1[0..3],  row0[8..11],  row1[8..11]  }
//   v23 = { row0[4..7],  row1[4..7],  row0[12..15], row1[12..15] }
//   v24 / v25 = the same layout for row2 / row3
// Clobbers v0, v1, v20, v21; x2 ends up advanced by four rows.
.macro LOAD_16X4_DATA
//Load the p_enc data and save to "v22 ~ v25"--- 16X4 bytes
ld1 {v0.16b}, [x2], x3
ld1 {v1.16b}, [x2], x3
ld1 {v20.16b}, [x2], x3
ld1 {v21.16b}, [x2], x3
trn1 v22.4s, v0.4s, v1.4s
trn2 v23.4s, v0.4s, v1.4s
trn1 v24.4s, v20.4s, v21.4s
trn2 v25.4s, v20.4s, v21.4s
.endm
// GET_16X16_V_SATD
// Input : v4, v5 = eight 16-bit samples each.
// Output: v6 = add/sub butterfly (4-point Hadamard) of each group of four samples of
//         the incoming v4, v7 = the same for the incoming v5.
// The coefficients inside each group come out in the order noted on the last line.
// v4 and v5 are overwritten with intermediate values.
.macro GET_16X16_V_SATD
// Stage 1: pair sample k with sample k+2 inside every group of four.
trn1 v6.4s, v4.4s, v5.4s
trn2 v7.4s, v4.4s, v5.4s
add v4.8h, v6.8h, v7.8h
sub v5.8h, v6.8h, v7.8h
// Stage 2: combine the stage-1 sums and differences pairwise.
trn1 v6.8h, v4.8h, v5.8h
trn2 v7.8h, v4.8h, v5.8h
add v4.8h, v6.8h, v7.8h
sub v5.8h, v6.8h, v7.8h
// Undo the interleave so that v6 holds only v4-derived data and v7 only v5-derived data.
trn1 v6.4s, v4.4s, v5.4s
trn2 v7.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v6 {8,9,11,10, 12,13,15,14} v7
.endm
// GET_16X16_H_SATD
// Same butterfly as GET_16X16_V_SATD, but the results are left in v16 / v17 so that
// they can coexist with the v6 / v7 pair produced by the other macro.
// Input : v4, v5 = eight 16-bit samples each (both are overwritten).
// Output: v16 = transform of each group of four samples of the incoming v4,
//         v17 = the same for the incoming v5.
.macro GET_16X16_H_SATD
// Stage 1: pair sample k with sample k+2 inside every group of four.
trn1 v16.4s, v4.4s, v5.4s
trn2 v17.4s, v4.4s, v5.4s
add v4.8h, v16.8h, v17.8h
sub v5.8h, v16.8h, v17.8h
// Stage 2: combine the stage-1 sums and differences pairwise.
trn1 v16.8h, v4.8h, v5.8h
trn2 v17.8h, v4.8h, v5.8h
add v4.8h, v16.8h, v17.8h
sub v5.8h, v16.8h, v17.8h
// Undo the interleave so that v16 holds only v4-derived data and v17 only v5-derived data.
trn1 v16.4s, v4.4s, v5.4s
trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
.endm
#ifdef __APPLE__
// SELECT_BEST_COST (Apple assembler flavour, positional argument)
// Picks the smallest of three unsigned 32-bit costs.
//   argument : W register holding the cost of candidate 0; receives the minimum
//   w1       : cost of candidate 1
//   w2       : cost of candidate 2
// On exit w7 holds the index (0, 1 or 2) of the winner; on equal costs the lower
// index is kept because a candidate only replaces the current best when strictly lower.
// Clobbers w6 and the condition flags.
.macro SELECT_BEST_COST
cmp w1, $0
csel $0, $0, w1, hs
cset w7, lo
cmp w2, $0
mov w6, #2
csel $0, $0, w2, hs
csel w7, w7, w6, hs
.endm
// LOAD_CHROMA_DATA (Apple assembler flavour, positional arguments)
//   argument 0 : X register pointing at the 8x8 block whose border is wanted
//   argument 1 : destination for the 8 bytes above the block, e.g. v0.8b
//   argument 2 : the same vector written as a byte-lane operand, e.g. v0.b
// Uses x1 as the row stride. After the macro the low 8 bytes of the vector hold the
// top row and lanes 8..15 hold the left column. Clobbers x9.
.macro LOAD_CHROMA_DATA
sub x9, $0, x1
ld1 {$1}, [x9] //top_cb
sub x9, $0, #1
// One single-lane load per row; the post-index by x1 steps down the left column.
ld1 {$2}[8], [x9], x1
ld1 {$2}[9], [x9], x1
ld1 {$2}[10], [x9], x1
ld1 {$2}[11], [x9], x1
ld1 {$2}[12], [x9], x1
ld1 {$2}[13], [x9], x1
ld1 {$2}[14], [x9], x1
ld1 {$2}[15], [x9], x1 //left_cb
.endm
// LOAD_8X4_DATA (Apple assembler flavour, positional argument)
//   argument 0 : X register pointing at the source rows; post-incremented by x3 per row
// Reads four 8-byte rows and regroups them into two 4x4 blocks:
//   v20 = { row0[0..3], row1[0..3], row0[4..7], row1[4..7] }
//   v21 = { row2[0..3], row3[0..3], row2[4..7], row3[4..7] }
// Clobbers v0, v1, v2.
.macro LOAD_8X4_DATA
//Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
ld1 {v0.8b}, [$0], x3
ld1 {v1.8b}, [$0], x3
ld1 {v0.d}[1], [$0], x3
ld1 {v1.d}[1], [$0], x3
trn1 v2.4s, v0.4s, v1.4s
trn2 v1.4s, v0.4s, v1.4s
trn1 v20.2d, v2.2d, v1.2d
trn2 v21.2d, v2.2d, v1.2d
.endm
// HDM_TRANSFORM_4X4_L0 (Apple assembler flavour, positional arguments)
// Transforms one 4x4 block of source bytes with add/sub butterflies and accumulates,
// for three candidate predictions, the absolute differences between the block
// coefficients and the reference coefficients of that prediction.
//   argument 0, 1 : byte vectors holding rows 0-1 and rows 2-3 of the block
//   argument 2    : reference coefficients for the vertical mode (.4h)
//   argument 3    : reference coefficients for the horizontal mode (.4h)
//   argument 4    : reference coefficients for the DC mode (.4h)
//   argument 5, 6, 7 : .4s accumulators for the vertical, horizontal and DC sums
//   argument 8    : name of an all-zero vector register, given without arrangement
//   argument 9    : l to use the low 8 bytes of arguments 0/1, l2 to use the high 8
// In each mode only four coefficients are compared against the reference; the
// remaining twelve are compared against the zero register.
// Clobbers v0, v1, v3, v4, v5.
.macro HDM_TRANSFORM_4X4_L0
//Do the vertical transform
uadd$9 v0.8h, $0, $1
usub$9 v1.8h, $0, $1
trn1 v3.2d, v0.2d, v1.2d
trn2 v1.2d, v0.2d, v1.2d
add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
trn1 v0.4s, v4.4s, v5.4s
trn2 v1.4s, v4.4s, v5.4s
add v4.8h, v0.8h, v1.8h
sub v5.8h, v0.8h, v1.8h
trn1 v0.8h, v4.8h, v5.8h
trn2 v1.8h, v4.8h, v5.8h
add v4.8h, v0.8h, v1.8h
sub v5.8h, v0.8h, v1.8h
//16x16_v
trn1 v0.2s, v4.2s, v5.2s
trn2 v1.2s, v4.2s, v5.2s
sabal $5, v0.4h, $2
sabal $5, v1.4h, $8.4h
sabal2 $5, v4.8h, $8.8h
sabal2 $5, v5.8h, $8.8h
//16x16_h
ins v3.d[0], v4.d[1]
trn1 v0.4h, v4.4h, v3.4h
trn2 v1.4h, v4.4h, v3.4h
sabal $6, v0.4h, $3
// v4 is reused here as a temporary sum of the coefficients compared against zero;
// it is shared by the horizontal and DC accumulations below.
sabdl v4.4s, v1.4h, $8.4h
sabal v4.4s, v5.4h, $8.4h
sabal2 v4.4s, v5.8h, $8.8h
add $6, $6, v4.4s
//16x16_dc_both
sabal $7, v0.4h, $4
add $7, $7, v4.4s
.endm
#else
// SELECT_BEST_COST (GNU assembler flavour, named argument)
// Picks the smallest of three unsigned 32-bit costs.
//   arg0 : W register holding the cost of candidate 0; receives the minimum
//   w1   : cost of candidate 1
//   w2   : cost of candidate 2
// On exit w7 holds the index (0, 1 or 2) of the winner; on equal costs the lower
// index is kept because a candidate only replaces the current best when strictly lower.
// Clobbers w6 and the condition flags.
.macro SELECT_BEST_COST arg0
cmp w1, \arg0
csel \arg0, \arg0, w1, hs
cset w7, lo
cmp w2, \arg0
mov w6, #2
csel \arg0, \arg0, w2, hs
csel w7, w7, w6, hs
.endm
// LOAD_CHROMA_DATA (GNU assembler flavour, named arguments)
//   arg0 : X register pointing at the 8x8 block whose border is wanted
//   arg1 : destination for the 8 bytes above the block, e.g. v0.8b
//   arg2 : the same vector written as a byte-lane operand, e.g. v0.b
// Uses x1 as the row stride. After the macro the low 8 bytes of the vector hold the
// top row and lanes 8..15 hold the left column. Clobbers x9.
.macro LOAD_CHROMA_DATA arg0, arg1, arg2
sub x9, \arg0, x1
ld1 {\arg1}, [x9] //top_cb
// The base pointer has to be spelled with the named-argument syntax here; the
// positional dollar form is only substituted by the Apple assembler.
sub x9, \arg0, #1
// One single-lane load per row; the post-index by x1 steps down the left column.
ld1 {\arg2}[8], [x9], x1
ld1 {\arg2}[9], [x9], x1
ld1 {\arg2}[10], [x9], x1
ld1 {\arg2}[11], [x9], x1
ld1 {\arg2}[12], [x9], x1
ld1 {\arg2}[13], [x9], x1
ld1 {\arg2}[14], [x9], x1
ld1 {\arg2}[15], [x9], x1 //left_cb
.endm
// LOAD_8X4_DATA (GNU assembler flavour, named argument)
//   arg0 : X register pointing at the source rows; post-incremented by x3 per row
// Reads four 8-byte rows and regroups them into two 4x4 blocks:
//   v20 = { row0[0..3], row1[0..3], row0[4..7], row1[4..7] }
//   v21 = { row2[0..3], row3[0..3], row2[4..7], row3[4..7] }
// Clobbers v0, v1, v2.
.macro LOAD_8X4_DATA arg0
//Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
ld1 {v0.8b}, [\arg0], x3
ld1 {v1.8b}, [\arg0], x3
ld1 {v0.d}[1], [\arg0], x3
ld1 {v1.d}[1], [\arg0], x3
trn1 v2.4s, v0.4s, v1.4s
trn2 v1.4s, v0.4s, v1.4s
trn1 v20.2d, v2.2d, v1.2d
trn2 v21.2d, v2.2d, v1.2d
.endm
// HDM_TRANSFORM_4X4_L0 (GNU assembler flavour, named arguments)
// Transforms one 4x4 block of source bytes with add/sub butterflies and accumulates,
// for three candidate predictions, the absolute differences between the block
// coefficients and the reference coefficients of that prediction.
//   arg0, arg1 : byte vectors holding rows 0-1 and rows 2-3 of the block
//   arg2       : reference coefficients for the vertical mode (.4h)
//   arg3       : reference coefficients for the horizontal mode (.4h)
//   arg4       : reference coefficients for the DC mode (.4h)
//   arg5, arg6, arg7 : .4s accumulators for the vertical, horizontal and DC sums
//   arg8       : name of an all-zero vector register, given without arrangement
//   arg9       : l to use the low 8 bytes of arg0/arg1, l2 to use the high 8
// In each mode only four coefficients are compared against the reference; the
// remaining twelve are compared against the zero register.
// Clobbers v0, v1, v3, v4, v5.
.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
//Do the vertical transform
uadd\arg9\() v0.8h, \arg0, \arg1
usub\arg9\() v1.8h, \arg0, \arg1
trn1 v3.2d, v0.2d, v1.2d
trn2 v1.2d, v0.2d, v1.2d
add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
trn1 v0.4s, v4.4s, v5.4s
trn2 v1.4s, v4.4s, v5.4s
add v4.8h, v0.8h, v1.8h
sub v5.8h, v0.8h, v1.8h
trn1 v0.8h, v4.8h, v5.8h
trn2 v1.8h, v4.8h, v5.8h
add v4.8h, v0.8h, v1.8h
sub v5.8h, v0.8h, v1.8h
//16x16_v
trn1 v0.2s, v4.2s, v5.2s
trn2 v1.2s, v4.2s, v5.2s
sabal \arg5, v0.4h, \arg2
sabal \arg5, v1.4h, \arg8\().4h
sabal2 \arg5, v4.8h, \arg8\().8h
sabal2 \arg5, v5.8h, \arg8\().8h
//16x16_h
ins v3.d[0], v4.d[1]
trn1 v0.4h, v4.4h, v3.4h
trn2 v1.4h, v4.4h, v3.4h
sabal \arg6, v0.4h, \arg3
// v4 is reused here as a temporary sum of the coefficients compared against zero;
// it is shared by the horizontal and DC accumulations below.
sabdl v4.4s, v1.4h, \arg8\().4h
sabal v4.4s, v5.4h, \arg8\().4h
sabal2 v4.4s, v5.8h, \arg8\().8h
add \arg6, \arg6, v4.4s
//16x16_dc_both
sabal \arg7, v0.4h, \arg4
add \arg7, \arg7, v4.4s
.endm
#endif
// WelsIntra8x8Combined3Sad_AArch64_neon
// Evaluates three 8x8 chroma intra predictions (DC, left/horizontal, top/vertical)
// by SAD, summing both chroma planes, and reports the cheapest one.
//   x0   : first plane (cb) block to predict; its border is read at x0 - x1 and x0 - 1
//   w1   : row stride of the planes that hold the border pixels
//   x2   : first plane (cb) source block
//   w3   : row stride of the source planes
//   x4   : receives the winning mode index (0 = DC, 1 = left, 2 = top)
//   w5   : lambda; twice its value is added to the left and top costs
//   x6   : not referenced by this routine
//   x7   : second plane block to predict (same stride as x0)
//   [sp] : second plane source block (same stride as x2)
// Returns the smallest cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
    // The strides arrive as 32-bit ints. AAPCS64 leaves bits 63:32 of the argument
    // registers unspecified, so widen them before they are used for addressing.
    sxtw x1, w1
    sxtw x3, w3
    ldr x11, [sp, #0]

    // First plane: v0 = {top[0..7], left[0..7]}; derive the four 4x4 DC values.
    LOAD_CHROMA_DATA x0, v0.8b, v0.b
    uaddlp v1.8h, v0.16b
    uaddlp v2.4s, v1.8h                 // {top0-3, top4-7, left0-3, left4-7} sums
    ins v3.d[0], v2.d[1]
    add v3.2s, v2.2s, v3.2s             // {top0-3 + left0-3, top4-7 + left4-7}
    urshr v2.4s, v2.4s, #2
    urshr v3.2s, v3.2s, #3
    dup v20.8b, v3.b[0]
    dup v21.8b, v2.b[4]
    dup v22.8b, v2.b[12]
    dup v23.8b, v3.b[4]
    ins v20.s[1], v21.s[0]              // DC row pattern for rows 0..3
    ins v22.s[1], v23.s[0]              // DC row pattern for rows 4..7

    // Second plane: same computation, results in v24 (rows 0..3) and v26 (rows 4..7).
    LOAD_CHROMA_DATA x7, v4.8b, v4.b
    uaddlp v5.8h, v4.16b
    uaddlp v6.4s, v5.8h
    ins v7.d[0], v6.d[1]
    add v7.2s, v6.2s, v7.2s
    urshr v6.4s, v6.4s, #2
    urshr v7.2s, v7.2s, #3
    dup v24.8b, v7.b[0]
    dup v25.8b, v6.b[4]
    dup v26.8b, v6.b[12]
    dup v27.8b, v7.b[4]
    ins v24.s[1], v25.s[0]
    ins v26.s[1], v27.s[0]

    // x9 / x10 walk down the left border columns of the two planes.
    sub x9, x0, #1
    sub x10, x7, #1

    // Row 0 initialises the three accumulators (v29 top, v30 left, v31 DC).
    ld1 {v3.8b}, [x2], x3
    ld1 {v5.8b}, [x11], x3
    ld1r {v6.8b}, [x9], x1
    ld1r {v7.8b}, [x10], x1
    uabdl v29.8h, v0.8b, v3.8b
    uabal v29.8h, v4.8b, v5.8b //top
    uabdl v30.8h, v6.8b, v3.8b
    uabal v30.8h, v7.8b, v5.8b //left
    uabdl v31.8h, v20.8b, v3.8b
    uabal v31.8h, v24.8b, v5.8b //Dc

    // Rows 1..3 use the upper pair of DC values.
.rept 3
    ld1 {v3.8b}, [x2], x3
    ld1 {v5.8b}, [x11], x3
    ld1r {v6.8b}, [x9], x1
    ld1r {v7.8b}, [x10], x1
    uabal v29.8h, v0.8b, v3.8b
    uabal v29.8h, v4.8b, v5.8b //top
    uabal v30.8h, v6.8b, v3.8b
    uabal v30.8h, v7.8b, v5.8b //left
    uabal v31.8h, v20.8b, v3.8b
    uabal v31.8h, v24.8b, v5.8b //Dc
.endr

    // Rows 4..7 use the lower pair of DC values.
.rept 4
    ld1 {v3.8b}, [x2], x3
    ld1 {v5.8b}, [x11], x3
    ld1r {v6.8b}, [x9], x1
    ld1r {v7.8b}, [x10], x1
    uabal v29.8h, v0.8b, v3.8b
    uabal v29.8h, v4.8b, v5.8b //top
    uabal v30.8h, v6.8b, v3.8b
    uabal v30.8h, v7.8b, v5.8b //left
    uabal v31.8h, v22.8b, v3.8b
    uabal v31.8h, v26.8b, v5.8b //Dc
.endr

    // Reduce each accumulator and add the mode penalty (none for DC).
    saddlv s29, v29.8h
    fmov w2, s29
    add w2, w2, w5, lsl #1
    saddlv s30, v30.8h
    fmov w1, s30
    add w1, w1, w5, lsl #1
    saddlv s31, v31.8h
    fmov w0, s31
    SELECT_BEST_COST w0
    str w7, [x4]
WELS_ASM_ARCH64_FUNC_END
// WelsIntra16x16Combined3Sad_AArch64_neon
// Evaluates three 16x16 luma intra predictions (top/vertical, left/horizontal, DC)
// by SAD and reports the cheapest one.
//   x0 : block to predict; its border is read at x0 - x1 and x0 - 1
//   w1 : row stride of the plane that holds the border pixels
//   x2 : source block
//   w3 : row stride of the source plane
//   x4 : receives the winning mode index (0 = top, 1 = left, 2 = DC)
//   w5 : lambda; twice its value is added to the left and DC costs
//   x6 : incoming value is not read (the register is reused as the loop counter)
// Returns the smallest cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon
    // The strides arrive as 32-bit ints. AAPCS64 leaves bits 63:32 of the argument
    // registers unspecified, so widen them before they are used for addressing.
    sxtw x1, w1
    sxtw x3, w3
    LOAD_LUMA_DATA

    // DC predictor = rounded mean of the 16 top and 16 left border pixels.
    uaddlv h2, v0.16b
    uaddlv h3, v1.16b
    add v2.8h, v2.8h, v3.8h
    uqrshrn b2, h2, #5
    dup v2.16b, v2.b[0] //Dc

    // x7 walks down the left border; row 0 initialises the accumulators.
    sub x7, x0, #1
    ld1 {v3.16b}, [x2], x3
    ld1r {v4.16b}, [x7], x1
    uabdl v29.8h, v0.8b, v3.8b
    uabal2 v29.8h, v0.16b,v3.16b //top
    uabdl v30.8h, v4.8b, v3.8b
    uabal2 v30.8h, v4.16b,v3.16b //left
    uabdl v31.8h, v2.8b, v3.8b
    uabal2 v31.8h, v2.16b,v3.16b //Dc

    // Remaining 15 rows.
    mov x6, #15
sad_intra_16x16_x3_opt_loop0:
    ld1 {v3.16b}, [x2], x3
    ld1r {v4.16b}, [x7], x1
    uabal v29.8h, v0.8b, v3.8b
    uabal2 v29.8h, v0.16b,v3.16b //top
    uabal v30.8h, v4.8b, v3.8b
    uabal2 v30.8h, v4.16b,v3.16b //left
    uabal v31.8h, v2.8b, v3.8b
    uabal2 v31.8h, v2.16b,v3.16b //Dc
    sub x6, x6, #1
    cbnz x6, sad_intra_16x16_x3_opt_loop0

    // Reduce each accumulator and add the mode penalty (none for the top mode).
    saddlv s29, v29.8h
    fmov w0, s29
    saddlv s30, v30.8h
    fmov w1, s30
    add w1, w1, w5, lsl #1
    saddlv s31, v31.8h
    fmov w2, s31
    add w2, w2, w5, lsl #1
    SELECT_BEST_COST w0
    str w7, [x4]
WELS_ASM_ARCH64_FUNC_END
// WelsIntra4x4Combined3Satd_AArch64_neon
// Evaluates three 4x4 luma intra predictions (top/vertical, left/horizontal, DC) by
// SATD, stores the winning mode and writes the matching 16-byte prediction block.
//   x0   : block to predict; its border is read at x0 - x1 and x0 - 1
//   w1   : row stride of the plane that holds the border pixels
//   x2   : source block
//   w3   : row stride of the source plane
//   x4   : receives the 4x4 prediction of the winning mode (16 contiguous bytes)
//   x5   : receives the winning mode index (0 = top, 1 = left, 2 = DC)
//   w6   : penalty added to the DC cost
//   w7   : penalty added to the left cost
//   [sp] : 32-bit penalty added to the top cost
// Returns the smallest cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
    // The strides arrive as 32-bit ints. AAPCS64 leaves bits 63:32 of the argument
    // registers unspecified, so widen them before they are used for addressing.
    sxtw x1, w1
    sxtw x3, w3

    // v16 = {top[0..3], left[0..3]} in its low 8 bytes.
    sub x9, x0, x1
    ld1 {v16.s}[0], [x9] //top
    sub x9, x0, #1
    ld1 {v16.b}[4], [x9], x1
    ld1 {v16.b}[5], [x9], x1
    ld1 {v16.b}[6], [x9], x1
    ld1 {v16.b}[7], [x9], x1

    // b17 = DC predictor; v2.h[0] = 16 * DC, the only non-zero DC-mode coefficient.
    uaddlv h2, v16.8b
    uqrshrn b17, h2, #3
    urshr v2.4h, v2.4h, #3
    shl v2.4h, v2.4h, #4

    //Calculate the 4x4_v 4x4_h mode SATD and save to "v6, v7"
    ushll v4.8h, v16.8b, #2
    ins v5.d[0], v4.d[1]
    trn1 v6.2s, v4.2s, v5.2s
    trn2 v7.2s, v4.2s, v5.2s
    add v4.4h, v6.4h, v7.4h
    sub v5.4h, v6.4h, v7.4h
    trn1 v6.4h, v4.4h, v5.4h
    trn2 v7.4h, v4.4h, v5.4h
    add v4.4h, v6.4h, v7.4h
    sub v5.4h, v6.4h, v7.4h
    trn1 v6.2s, v4.2s, v5.2s
    trn2 v7.2s, v4.2s, v5.2s //{0,1,3,2,top} v6 {0,1,3,2,left} v7

    eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
    eor v30.16b, v30.16b, v30.16b //Save the SATD of H
    eor v29.16b, v29.16b, v29.16b //Save the SATD of V
    eor v28.16b, v28.16b, v28.16b //For zero register

    // Load the 4x4 source block: rows 0-1 into v22, rows 2-3 into v23.
    ld1 {v22.s}[0], [x2], x3
    ld1 {v22.s}[1], [x2], x3
    ld1 {v23.s}[0], [x2], x3
    ld1 {v23.s}[1], [x2], x3
    HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l

    // The ninth argument is a 32-bit int, so only four bytes of the stack are read.
    ldr w11, [sp, #0]
    urshr v29.4s, v29.4s, #1
    addv s29, v29.4s
    fmov w0, s29
    add w0, w0, w11
    urshr v30.4s, v30.4s, #1
    addv s30, v30.4s
    fmov w1, s30
    add w1, w1, w7
    urshr v31.4s, v31.4s, #1
    addv s31, v31.4s
    fmov w2, s31
    add w2, w2, w6

    mov w10, w0
    SELECT_BEST_COST w10
    str w7, [x5]

    // Write the prediction that belongs to the mode just stored. Dispatching on the
    // mode index (rather than re-comparing costs) keeps the block and the mode in
    // agreement when two candidates have the same cost.
    cmp w7, #2
    b.ne satd_intra_4x4_x3_opt_jump0
    dup v0.16b, v17.b[0]                // DC: every byte is the DC value
    st1 {v0.16b}, [x4]
    b satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump0:
    cbz w7, satd_intra_4x4_x3_opt_jump1
    dup v0.16b, v16.b[4]                // left: each row repeats its left neighbour
    dup v1.16b, v16.b[5]
    dup v2.16b, v16.b[6]
    dup v3.16b, v16.b[7]
    st4 {v0.s,v1.s,v2.s,v3.s}[0], [x4]
    b satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump1:
    st1 {v16.S}[0], [x4], #4            // top: each row repeats the row above
    st1 {v16.S}[0], [x4], #4
    st1 {v16.S}[0], [x4], #4
    st1 {v16.S}[0], [x4]
satd_intra_4x4_x3_opt_end:
    mov w0, w10
WELS_ASM_ARCH64_FUNC_END
// WelsIntra8x8Combined3Satd_AArch64_neon
// Evaluates three 8x8 chroma intra predictions (DC, left/horizontal, top/vertical)
// by SATD, summing both chroma planes, and reports the cheapest one.
//   x0   : first plane (cb) block to predict; its border is read at x0 - x1 and x0 - 1
//   w1   : row stride of the planes that hold the border pixels
//   x2   : first plane (cb) source block
//   w3   : row stride of the source planes
//   x4   : receives the winning mode index (0 = DC, 1 = left, 2 = top)
//   w5   : lambda; twice its value is added to the left and top costs
//   x6   : not referenced by this routine
//   x7   : second plane (cr) block to predict (same stride as x0)
//   [sp] : second plane (cr) source block (same stride as x2)
// Returns the smallest cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
    // The strides arrive as 32-bit ints. AAPCS64 leaves bits 63:32 of the argument
    // registers unspecified, so widen them before they are used for addressing.
    sxtw x1, w1
    sxtw x3, w3
    ldr x11, [sp, #0]

    // v0 = {cb top, cb left}, v1 = {cr top, cr left}
    LOAD_CHROMA_DATA x0, v0.8b, v0.b
    LOAD_CHROMA_DATA x7, v1.8b, v1.b

    // Top-border reference coefficients: v6 from cb, v7 from cr.
    ushll v4.8h, v0.8b, #2
    ushll v5.8h, v1.8b, #2
    GET_16X16_V_SATD

    // Left-border reference coefficients: v16 from cb, v17 from cr.
    ushll2 v4.8h, v0.16b, #2
    ushll2 v5.8h, v1.16b, #2
    GET_16X16_H_SATD

    // Per-4x4-block DC values for both planes, scaled by 16.
    uaddlp v0.8h, v0.16b
    uaddlp v2.4s, v0.8h
    ins v3.d[0], v2.d[1]
    add v3.2s, v2.2s, v3.2s
    uaddlp v1.8h, v1.16b
    uaddlp v4.4s, v1.8h
    ins v5.d[0], v4.d[1]
    add v5.2s, v4.2s, v5.2s
    trn2 v0.4s, v2.4s, v4.4s
    urshr v0.4s, v0.4s, #2
    urshr v3.2s, v3.2s, #3
    urshr v5.2s, v5.2s, #3
    ushll v22.2d, v0.2s, #4 //{1cb, 1cr}
    ushll2 v23.2d, v0.4s, #4 //{2cb, 2cr}
    ushll v24.2d, v3.2s, #4 //{0cb, 3cb}
    ushll v25.2d, v5.2s, #4 //{0cr, 3cr}

    eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
    eor v30.16b, v30.16b, v30.16b //Save the SATD of H
    eor v29.16b, v29.16b, v29.16b //Save the SATD of V
    eor v28.16b, v28.16b, v28.16b //For zero register

    // Expose the high halves of the reference vectors as standalone .4h operands.
    ins v18.d[0], v6.d[1]
    ins v19.d[0], v7.d[1]
    ins v26.d[0], v16.d[1]
    ins v27.d[0], v17.d[1]

    // cb, rows 0..3: blocks 0 and 1
    LOAD_8X4_DATA x2
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // cr, rows 0..3: blocks 0 and 1 (v22 low half now carries 1cr)
    LOAD_8X4_DATA x11
    ins v22.d[0], v22.d[1]
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // cb, rows 4..7: blocks 2 and 3 (v24 low half now carries 3cb)
    LOAD_8X4_DATA x2
    ins v24.d[0], v24.d[1]
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // cr, rows 4..7: blocks 2 and 3 (v23 / v25 low halves now carry 2cr / 3cr)
    LOAD_8X4_DATA x11
    ins v23.d[0], v23.d[1]
    ins v25.d[0], v25.d[1]
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Halve (with rounding), reduce, and add the mode penalty (none for DC).
    urshr v29.4s, v29.4s, #1
    addv s29, v29.4s
    fmov w2, s29
    add w2, w2, w5, lsl #1
    urshr v30.4s, v30.4s, #1
    addv s30, v30.4s
    fmov w1, s30
    add w1, w1, w5, lsl #1
    urshr v31.4s, v31.4s, #1
    addv s31, v31.4s
    fmov w0, s31
    SELECT_BEST_COST w0
    str w7, [x4]
WELS_ASM_ARCH64_FUNC_END
// WelsIntra16x16Combined3Satd_AArch64_neon
// Evaluates three 16x16 luma intra predictions (top/vertical, left/horizontal, DC)
// by SATD over the sixteen 4x4 sub-blocks and reports the cheapest one.
//   x0 : block to predict; its border is read at x0 - x1 and x0 - 1
//   w1 : row stride of the plane that holds the border pixels
//   x2 : source block
//   w3 : row stride of the source plane
//   x4 : receives the winning mode index (0 = top, 1 = left, 2 = DC)
//   w5 : lambda; twice its value is added to the left and DC costs
// Returns the smallest cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
    // The strides arrive as 32-bit ints. AAPCS64 leaves bits 63:32 of the argument
    // registers unspecified, so widen them before they are used for addressing.
    sxtw x1, w1
    sxtw x3, w3
    LOAD_LUMA_DATA

    // v2.h[0] = 16 * DC, where DC is the rounded mean of the 32 border pixels.
    uaddlv h2, v0.16b
    uaddlv h3, v1.16b
    add v2.8h, v2.8h, v3.8h
    urshr v2.4h, v2.4h, #5
    shl v2.4h, v2.4h, #4

    //Calculate the 16x16_v mode SATD and save to "v6, v7"
    ushll v4.8h, v0.8b, #2
    ushll2 v5.8h, v0.16b, #2
    GET_16X16_V_SATD

    //Calculate the 16x16_h mode SATD and save to "v16, v17"
    ushll v4.8h, v1.8b, #2
    ushll2 v5.8h, v1.16b, #2
    GET_16X16_H_SATD

    eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH
    eor v30.16b, v30.16b, v30.16b //Save the SATD of H
    eor v29.16b, v29.16b, v29.16b //Save the SATD of V
    eor v28.16b, v28.16b, v28.16b //For zero register

    // Expose the high halves of the reference vectors as standalone .4h operands.
    ins v18.d[0], v6.d[1]
    ins v19.d[0], v7.d[1]
    ins v26.d[0], v16.d[1]
    ins v27.d[0], v17.d[1]

    // Rows 0..3 (left reference v16). Column groups are visited in the order
    // 0-3, 8-11, 4-7, 12-15, matching the layout produced by LOAD_16X4_DATA.
    LOAD_16X4_DATA
    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 4..7 (left reference v26)
    LOAD_16X4_DATA
    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 8..11 (left reference v17)
    LOAD_16X4_DATA
    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 12..15 (left reference v27)
    LOAD_16X4_DATA
    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Halve (with rounding), reduce, and add the mode penalty (none for the top mode).
    urshr v29.4s, v29.4s, #1
    addv s29, v29.4s
    fmov w0, s29
    urshr v30.4s, v30.4s, #1
    addv s30, v30.4s
    fmov w1, s30
    add w1, w1, w5, lsl #1
    urshr v31.4s, v31.4s, #1
    addv s31, v31.4s
    fmov w2, s31
    add w2, w2, w5, lsl #1
    SELECT_BEST_COST w0
    str w7, [x4]
WELS_ASM_ARCH64_FUNC_END
#endif

View File

@ -108,6 +108,14 @@ int32_t WelsSampleSatd16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
/*
 * Combined three-mode intra cost routines implemented in
 * arm64/intra_pred_sad_3_opt_neon_aarch64.S. Each one scores three intra
 * predictions of a block, stores the index of the cheapest mode through the
 * int32_t* argument and returns that mode's cost.
 *
 * 16x16 luma: (block to predict, its stride, source block, source stride,
 *              best-mode output, lambda, unused-by-the-assembly pointer).
 */
int32_t WelsIntra16x16Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
int32_t WelsIntra16x16Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
/*
 * 8x8 chroma: same leading arguments for the first plane; the last two pointers
 * are the second plane's block to predict and its source block. The costs of
 * both planes are summed. The seventh pointer is not read by the assembly.
 */
int32_t WelsIntra8x8Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,
uint8_t*);
int32_t WelsIntra8x8Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,
uint8_t*);
/*
 * 4x4 luma: (block to predict, its stride, source block, source stride,
 *            16-byte prediction output, best-mode output, then three per-mode
 *            penalties added to the DC, horizontal and vertical costs in that order).
 */
int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,
int32_t);
#endif
#if defined(__cplusplus)
}

View File

@ -433,6 +433,12 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_AArch64_neon;
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_AArch64_neon;
}
#endif
}

View File

@ -60,6 +60,7 @@ endif
ifeq ($(ASM_ARCH), arm64)
ENCODER_ASM_ARM64_SRCS=\
$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S\
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))