Add arm64 neon code for intraSad&Satd
This commit is contained in:
parent
3f333b01fd
commit
f0ec323e2c
@ -7,6 +7,7 @@
|
||||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */; };
|
||||
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
|
||||
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
|
||||
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
|
||||
@ -61,6 +62,7 @@
|
||||
/* End PBXCopyFilesBuildPhase section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_sad_3_opt_neon_aarch64.S; path = arm64/intra_pred_sad_3_opt_neon_aarch64.S; sourceTree = "<group>"; };
|
||||
4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
|
||||
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
|
||||
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
|
||||
@ -182,6 +184,7 @@
|
||||
4CB8F2B219235FAC005D6386 /* arm64 */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S */,
|
||||
4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,
|
||||
4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,
|
||||
);
|
||||
@ -422,6 +425,7 @@
|
||||
4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */,
|
||||
4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */,
|
||||
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */,
|
||||
4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_neon_aarch64.S in Sources */,
|
||||
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
|
||||
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
|
||||
4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,
|
||||
|
665
codec/encoder/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S
Executable file
665
codec/encoder/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S
Executable file
@ -0,0 +1,665 @@
|
||||
/*!
|
||||
* \copy
|
||||
* Copyright (c) 2013, Cisco Systems
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
.text
|
||||
#include "arm_arch64_common_macro.S"
|
||||
|
||||
// Fetch the neighbours of the block addressed by x0 (row pitch in x1):
//   v0 <- the 16 bytes of the row directly above   ([x0 - x1])
//   v1 <- the 16 bytes of the column to the left   ([x0 - 1 + n * x1], n = 0..15)
// x7 is used as the scratch address register.
.macro LOAD_LUMA_DATA
    sub     x7, x0, x1
    ld1     {v0.16b}, [x7]              // top row
    sub     x7, x0, #1                  // first pixel of the left column
    ld1     {v1.b}[0],  [x7], x1
    ld1     {v1.b}[1],  [x7], x1
    ld1     {v1.b}[2],  [x7], x1
    ld1     {v1.b}[3],  [x7], x1
    ld1     {v1.b}[4],  [x7], x1
    ld1     {v1.b}[5],  [x7], x1
    ld1     {v1.b}[6],  [x7], x1
    ld1     {v1.b}[7],  [x7], x1
    ld1     {v1.b}[8],  [x7], x1
    ld1     {v1.b}[9],  [x7], x1
    ld1     {v1.b}[10], [x7], x1
    ld1     {v1.b}[11], [x7], x1
    ld1     {v1.b}[12], [x7], x1
    ld1     {v1.b}[13], [x7], x1
    ld1     {v1.b}[14], [x7], x1
    ld1     {v1.b}[15], [x7]            // last left pixel, no post-increment
.endm
|
||||
|
||||
// Read four 16-byte rows from [x2] (pitch x3, x2 ends up advanced by four
// rows) and regroup them by 32-bit column:
//   v22 = {r0[0..3],  r1[0..3],  r0[8..11],  r1[8..11]}
//   v23 = {r0[4..7],  r1[4..7],  r0[12..15], r1[12..15]}
//   v24 / v25 = the same layout for rows 2 and 3
// Scratch: v0, v1, v20, v21.
.macro LOAD_16X4_DATA
    // rows 0 and 1
    ld1     {v0.16b}, [x2], x3
    ld1     {v1.16b}, [x2], x3
    trn1    v22.4s, v0.4s, v1.4s
    trn2    v23.4s, v0.4s, v1.4s
    // rows 2 and 3
    ld1     {v20.16b}, [x2], x3
    ld1     {v21.16b}, [x2], x3
    trn1    v24.4s, v20.4s, v21.4s
    trn2    v25.4s, v20.4s, v21.4s
.endm
|
||||
|
||||
// Two add/sub butterfly stages over the sixteen 16-bit values held in v4/v5.
// Input : v4.8h, v5.8h (overwritten)
// Output: v6, v7
.macro GET_16X16_V_SATD
    // stage 1: pair up 32-bit groups, then sum / difference
    trn1    v6.4s, v4.4s, v5.4s
    trn2    v7.4s, v4.4s, v5.4s
    add     v4.8h, v6.8h, v7.8h
    sub     v5.8h, v6.8h, v7.8h
    // stage 2: pair up 16-bit lanes, then sum / difference
    trn1    v6.8h, v4.8h, v5.8h
    trn2    v7.8h, v4.8h, v5.8h
    add     v4.8h, v6.8h, v7.8h
    sub     v5.8h, v6.8h, v7.8h
    // final regroup
    trn1    v6.4s, v4.4s, v5.4s
    trn2    v7.4s, v4.4s, v5.4s         // v6 = {0,1,3,2, 4,5,7,6}  v7 = {8,9,11,10, 12,13,15,14}
.endm
|
||||
|
||||
// Same two butterfly stages as GET_16X16_V_SATD, but the result is left in
// v16/v17 so that v6/v7 stay intact.
// Input : v4.8h, v5.8h (overwritten)
// Output: v16, v17
.macro GET_16X16_H_SATD
    // stage 1: pair up 32-bit groups, then sum / difference
    trn1    v16.4s, v4.4s, v5.4s
    trn2    v17.4s, v4.4s, v5.4s
    add     v4.8h, v16.8h, v17.8h
    sub     v5.8h, v16.8h, v17.8h
    // stage 2: pair up 16-bit lanes, then sum / difference
    trn1    v16.8h, v4.8h, v5.8h
    trn2    v17.8h, v4.8h, v5.8h
    add     v4.8h, v16.8h, v17.8h
    sub     v5.8h, v16.8h, v17.8h
    // final regroup
    trn1    v16.4s, v4.4s, v5.4s
    trn2    v17.4s, v4.4s, v5.4s        // v16 = {0,1,3,2, 4,5,7,6}  v17 = {8,9,11,10, 12,13,15,14}
.endm
|
||||
|
||||
#ifdef __APPLE__
|
||||
// Apple assembler flavour (positional $n arguments).
// $0 holds candidate cost 0 on entry; w1 and w2 hold candidate costs 1 and 2.
// On exit $0 = smallest of the three (unsigned compare) and w7 = index of the
// winner; on equal costs the lower index is kept. Clobbers w6.
.macro SELECT_BEST_COST
    mov     w6, #2                      // index used when candidate 2 wins
    cmp     w1, $0
    csel    $0, $0, w1, hs              // keep $0 unless w1 is strictly smaller
    cset    w7, lo                      // w7 = 1 when w1 won, else 0
    cmp     w2, $0
    csel    $0, $0, w2, hs              // keep current best unless w2 is strictly smaller
    csel    w7, w7, w6, hs              // w7 = 2 when w2 won
.endm
|
||||
|
||||
// Apple assembler flavour (positional $n arguments).
//   $0 : x register addressing the block (row pitch is x1)
//   $1 : destination for the 8 bytes above the block, e.g. v0.8b
//   $2 : same vector written with a .b suffix, e.g. v0.b; lanes 8..15 receive
//        the 8 bytes to the left of the block
// Uses x9 as the scratch address register.
.macro LOAD_CHROMA_DATA
    sub     x9, $0, x1
    ld1     {$1}, [x9]                  // top
    sub     x9, $0, #1
    ld1     {$2}[8],  [x9], x1
    ld1     {$2}[9],  [x9], x1
    ld1     {$2}[10], [x9], x1
    ld1     {$2}[11], [x9], x1
    ld1     {$2}[12], [x9], x1
    ld1     {$2}[13], [x9], x1
    ld1     {$2}[14], [x9], x1
    ld1     {$2}[15], [x9], x1          // left
.endm
|
||||
|
||||
// Apple assembler flavour (positional $n arguments).
// Read four 8-byte rows from [$0] (pitch x3, $0 is post-incremented) and
// regroup them into v20/v21:
//   v20 = {r0[0..3], r1[0..3], r0[4..7], r1[4..7]}
//   v21 = {r2[0..3], r3[0..3], r2[4..7], r3[4..7]}
// Scratch: v0, v1, v2.
.macro LOAD_8X4_DATA
    ld1     {v0.8b},   [$0], x3         // row 0 -> v0 low half
    ld1     {v1.8b},   [$0], x3         // row 1 -> v1 low half
    ld1     {v0.d}[1], [$0], x3         // row 2 -> v0 high half
    ld1     {v1.d}[1], [$0], x3         // row 3 -> v1 high half
    trn1    v2.4s,  v0.4s, v1.4s
    trn2    v1.4s,  v0.4s, v1.4s
    trn1    v20.2d, v2.2d, v1.2d
    trn2    v21.2d, v2.2d, v1.2d
.endm
|
||||
|
||||
// Apple assembler flavour (positional $n arguments).
// Transform one 4x4 group of source pixels and accumulate its absolute
// differences against three sets of reference coefficients.
//   $0, $1 : source pixel vectors (.8b with $9 = l, .16b with $9 = l2)
//   $2     : reference coefficients for the "v" accumulator (.4h)
//   $3     : reference coefficients for the "h" accumulator (.4h)
//   $4     : reference coefficients for the "dc" accumulator (.4h)
//   $5     : "v" accumulator  (.4s)
//   $6     : "h" accumulator  (.4s)
//   $7     : "dc" accumulator (.4s)
//   $8     : register name without suffix; the callers pass an all-zero register
//   $9     : l or l2, appended to uadd/usub to pick the low or high half
// Scratch: v0, v1, v3, v4, v5.
.macro HDM_TRANSFORM_4X4_L0
    // vertical transform
    uadd$9  v0.8h, $0, $1
    usub$9  v1.8h, $0, $1
    trn1    v3.2d, v0.2d, v1.2d
    trn2    v1.2d, v0.2d, v1.2d
    add     v4.8h, v3.8h, v1.8h         // {0,1,2,3,4,5,6,7}
    sub     v5.8h, v3.8h, v1.8h         // {12,13,14,15,8,9,10,11}

    // horizontal transform
    trn1    v0.4s, v4.4s, v5.4s
    trn2    v1.4s, v4.4s, v5.4s
    add     v4.8h, v0.8h, v1.8h
    sub     v5.8h, v0.8h, v1.8h
    trn1    v0.8h, v4.8h, v5.8h
    trn2    v1.8h, v4.8h, v5.8h
    add     v4.8h, v0.8h, v1.8h
    sub     v5.8h, v0.8h, v1.8h

    // "v" cost: one group is compared with $2, all others with zero
    trn1    v0.2s, v4.2s, v5.2s
    trn2    v1.2s, v4.2s, v5.2s
    sabal   $5, v0.4h, $2
    sabal   $5, v1.4h, $8.4h
    sabal2  $5, v4.8h, $8.8h
    sabal2  $5, v5.8h, $8.8h

    // "h" cost: one group is compared with $3; the zero-referenced part is
    // summed into v4 so the "dc" cost below can reuse it
    ins     v3.d[0], v4.d[1]
    trn1    v0.4h, v4.4h, v3.4h
    trn2    v1.4h, v4.4h, v3.4h
    sabal   $6, v0.4h, $3
    sabdl   v4.4s, v1.4h, $8.4h
    sabal   v4.4s, v5.4h, $8.4h
    sabal2  v4.4s, v5.8h, $8.8h
    add     $6, $6, v4.4s

    // "dc" cost: same group compared with $4, plus the shared remainder
    sabal   $7, v0.4h, $4
    add     $7, $7, v4.4s
.endm
|
||||
#else
|
||||
// GNU assembler flavour (named arguments).
// \arg0 holds candidate cost 0 on entry; w1 and w2 hold candidate costs 1 and 2.
// On exit \arg0 = smallest of the three (unsigned compare) and w7 = index of
// the winner; on equal costs the lower index is kept. Clobbers w6.
.macro SELECT_BEST_COST arg0
    mov     w6, #2                      // index used when candidate 2 wins
    cmp     w1, \arg0
    csel    \arg0, \arg0, w1, hs        // keep \arg0 unless w1 is strictly smaller
    cset    w7, lo                      // w7 = 1 when w1 won, else 0
    cmp     w2, \arg0
    csel    \arg0, \arg0, w2, hs        // keep current best unless w2 is strictly smaller
    csel    w7, w7, w6, hs              // w7 = 2 when w2 won
.endm
|
||||
|
||||
// GNU assembler flavour (named arguments).
//   arg0 : x register addressing the block (row pitch is x1)
//   arg1 : destination for the 8 bytes above the block, e.g. v0.8b
//   arg2 : same vector written with a .b suffix, e.g. v0.b; lanes 8..15
//          receive the 8 bytes to the left of the block
// Uses x9 as the scratch address register.
.macro LOAD_CHROMA_DATA arg0, arg1, arg2
    sub     x9, \arg0, x1
    ld1     {\arg1}, [x9]               // top
    // Must be \arg0 here: "$0" is the Apple positional syntax and is not a
    // valid operand in this named-argument variant of the macro.
    sub     x9, \arg0, #1
    ld1     {\arg2}[8],  [x9], x1
    ld1     {\arg2}[9],  [x9], x1
    ld1     {\arg2}[10], [x9], x1
    ld1     {\arg2}[11], [x9], x1
    ld1     {\arg2}[12], [x9], x1
    ld1     {\arg2}[13], [x9], x1
    ld1     {\arg2}[14], [x9], x1
    ld1     {\arg2}[15], [x9], x1       // left
.endm
|
||||
|
||||
// GNU assembler flavour (named arguments).
// Read four 8-byte rows from [\arg0] (pitch x3, \arg0 is post-incremented)
// and regroup them into v20/v21:
//   v20 = {r0[0..3], r1[0..3], r0[4..7], r1[4..7]}
//   v21 = {r2[0..3], r3[0..3], r2[4..7], r3[4..7]}
// Scratch: v0, v1, v2.
.macro LOAD_8X4_DATA arg0
    ld1     {v0.8b},   [\arg0], x3      // row 0 -> v0 low half
    ld1     {v1.8b},   [\arg0], x3      // row 1 -> v1 low half
    ld1     {v0.d}[1], [\arg0], x3      // row 2 -> v0 high half
    ld1     {v1.d}[1], [\arg0], x3      // row 3 -> v1 high half
    trn1    v2.4s,  v0.4s, v1.4s
    trn2    v1.4s,  v0.4s, v1.4s
    trn1    v20.2d, v2.2d, v1.2d
    trn2    v21.2d, v2.2d, v1.2d
.endm
|
||||
|
||||
// GNU assembler flavour (named arguments).
// Transform one 4x4 group of source pixels and accumulate its absolute
// differences against three sets of reference coefficients.
//   arg0, arg1 : source pixel vectors (.8b with arg9 = l, .16b with arg9 = l2)
//   arg2       : reference coefficients for the "v" accumulator (.4h)
//   arg3       : reference coefficients for the "h" accumulator (.4h)
//   arg4       : reference coefficients for the "dc" accumulator (.4h)
//   arg5       : "v" accumulator  (.4s)
//   arg6       : "h" accumulator  (.4s)
//   arg7       : "dc" accumulator (.4s)
//   arg8       : register name without suffix; the callers pass an all-zero register
//   arg9       : l or l2, appended to uadd/usub to pick the low or high half
// Scratch: v0, v1, v3, v4, v5.
.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    // vertical transform
    uadd\arg9\()    v0.8h, \arg0, \arg1
    usub\arg9\()    v1.8h, \arg0, \arg1
    trn1    v3.2d, v0.2d, v1.2d
    trn2    v1.2d, v0.2d, v1.2d
    add     v4.8h, v3.8h, v1.8h         // {0,1,2,3,4,5,6,7}
    sub     v5.8h, v3.8h, v1.8h         // {12,13,14,15,8,9,10,11}

    // horizontal transform
    trn1    v0.4s, v4.4s, v5.4s
    trn2    v1.4s, v4.4s, v5.4s
    add     v4.8h, v0.8h, v1.8h
    sub     v5.8h, v0.8h, v1.8h
    trn1    v0.8h, v4.8h, v5.8h
    trn2    v1.8h, v4.8h, v5.8h
    add     v4.8h, v0.8h, v1.8h
    sub     v5.8h, v0.8h, v1.8h

    // "v" cost: one group is compared with arg2, all others with zero
    trn1    v0.2s, v4.2s, v5.2s
    trn2    v1.2s, v4.2s, v5.2s
    sabal   \arg5, v0.4h, \arg2
    sabal   \arg5, v1.4h, \arg8\().4h
    sabal2  \arg5, v4.8h, \arg8\().8h
    sabal2  \arg5, v5.8h, \arg8\().8h

    // "h" cost: one group is compared with arg3; the zero-referenced part is
    // summed into v4 so the "dc" cost below can reuse it
    ins     v3.d[0], v4.d[1]
    trn1    v0.4h, v4.4h, v3.4h
    trn2    v1.4h, v4.4h, v3.4h
    sabal   \arg6, v0.4h, \arg3
    sabdl   v4.4s, v1.4h, \arg8\().4h
    sabal   v4.4s, v5.4h, \arg8\().4h
    sabal2  v4.4s, v5.8h, \arg8\().8h
    add     \arg6, \arg6, v4.4s

    // "dc" cost: same group compared with arg4, plus the shared remainder
    sabal   \arg7, v0.4h, \arg4
    add     \arg7, \arg7, v4.4s
.endm
|
||||
#endif
|
||||
|
||||
// SAD of two 8x8 planes against their top, left and DC predictions; picks the
// cheapest of the three.
//   x0      : first reconstructed plane (neighbours are read around it)
//   w1      : row pitch of the reconstructed planes
//   x2      : first source plane
//   w3      : row pitch of the source planes
//   x4      : receives the index of the winning candidate (0 = DC, 1 = left, 2 = top)
//   w5      : lambda; 2 * lambda is added to the top and left costs
//   x7      : second reconstructed plane
//   [sp]    : second source plane
// Returns the winning cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
    // The pitches are 32-bit arguments but are used as 64-bit offsets below;
    // the upper halves of x1/x3 are unspecified on entry, so widen them first.
    sxtw    x1, w1
    sxtw    x3, w3
    ldr     x11, [sp, #0]

    LOAD_CHROMA_DATA x0, v0.8b, v0.b

    // DC values of the four 4x4 quadrants of the first plane
    uaddlp  v1.8h, v0.16b
    uaddlp  v2.4s, v1.8h                // {top[0..3], top[4..7], left[0..3], left[4..7]} sums
    ins     v3.d[0], v2.d[1]
    add     v3.2s, v2.2s, v3.2s         // top + left sums for the diagonal quadrants
    urshr   v2.4s, v2.4s, #2
    urshr   v3.2s, v3.2s, #3

    dup     v20.8b, v3.b[0]
    dup     v21.8b, v2.b[4]
    dup     v22.8b, v2.b[12]
    dup     v23.8b, v3.b[4]
    ins     v20.s[1], v21.s[0]          // DC row pattern for rows 0..3
    ins     v22.s[1], v23.s[0]          // DC row pattern for rows 4..7

    LOAD_CHROMA_DATA x7, v4.8b, v4.b

    // Same DC derivation for the second plane
    uaddlp  v5.8h, v4.16b
    uaddlp  v6.4s, v5.8h
    ins     v7.d[0], v6.d[1]
    add     v7.2s, v6.2s, v7.2s
    urshr   v6.4s, v6.4s, #2
    urshr   v7.2s, v7.2s, #3

    dup     v24.8b, v7.b[0]
    dup     v25.8b, v6.b[4]
    dup     v26.8b, v6.b[12]
    dup     v27.8b, v7.b[4]
    ins     v24.s[1], v25.s[0]
    ins     v26.s[1], v27.s[0]

    sub     x9, x0, #1                  // left column, first plane
    sub     x10, x7, #1                 // left column, second plane

    // Row 0 initialises the three accumulators
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1
    ld1r    {v7.8b}, [x10], x1

    uabdl   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b        // top

    uabdl   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b        // left

    uabdl   v31.8h, v20.8b, v3.8b
    uabal   v31.8h, v24.8b, v5.8b       // DC

    // Rows 1..3 (upper DC pattern)
.rept 3
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1
    ld1r    {v7.8b}, [x10], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b        // top

    uabal   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b        // left

    uabal   v31.8h, v20.8b, v3.8b
    uabal   v31.8h, v24.8b, v5.8b       // DC
.endr

    // Rows 4..7 (lower DC pattern)
.rept 4
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1
    ld1r    {v7.8b}, [x10], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b        // top

    uabal   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b        // left

    uabal   v31.8h, v22.8b, v3.8b
    uabal   v31.8h, v26.8b, v5.8b       // DC
.endr

    // Reduce the accumulators and add the lambda penalty
    saddlv  s29, v29.8h
    fmov    w2, s29
    add     w2, w2, w5, lsl #1          // top cost
    saddlv  s30, v30.8h
    fmov    w1, s30
    add     w1, w1, w5, lsl #1          // left cost
    saddlv  s31, v31.8h
    fmov    w0, s31                     // DC cost

    SELECT_BEST_COST w0

    str     w7, [x4]
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// SAD of a 16x16 source block against its top, left and DC predictions; picks
// the cheapest of the three.
//   x0 : reconstructed block (neighbours are read around it)
//   w1 : row pitch of the reconstructed block
//   x2 : source block
//   w3 : row pitch of the source block
//   x4 : receives the index of the winning candidate (0 = top, 1 = left, 2 = DC)
//   w5 : lambda; 2 * lambda is added to the left and DC costs
// Returns the winning cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon
    // The pitches are 32-bit arguments but are used as 64-bit offsets below;
    // the upper halves of x1/x3 are unspecified on entry, so widen them first.
    sxtw    x1, w1
    sxtw    x3, w3

    LOAD_LUMA_DATA

    // DC = (sum(top) + sum(left) + 16) >> 5, broadcast to every lane of v2
    uaddlv  h2, v0.16b
    uaddlv  h3, v1.16b
    add     v2.8h, v2.8h, v3.8h
    uqrshrn b2, h2, #5
    dup     v2.16b, v2.b[0]             // DC

    // Row 0 initialises the three accumulators
    sub     x7, x0, #1
    ld1     {v3.16b}, [x2], x3
    ld1r    {v4.16b}, [x7], x1

    uabdl   v29.8h, v0.8b, v3.8b
    uabal2  v29.8h, v0.16b, v3.16b      // top

    uabdl   v30.8h, v4.8b, v3.8b
    uabal2  v30.8h, v4.16b, v3.16b      // left

    uabdl   v31.8h, v2.8b, v3.8b
    uabal2  v31.8h, v2.16b, v3.16b      // DC

    // Remaining 15 rows
    mov     x6, #15
sad_intra_16x16_x3_opt_loop0:
    ld1     {v3.16b}, [x2], x3
    ld1r    {v4.16b}, [x7], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal2  v29.8h, v0.16b, v3.16b      // top

    uabal   v30.8h, v4.8b, v3.8b
    uabal2  v30.8h, v4.16b, v3.16b      // left

    uabal   v31.8h, v2.8b, v3.8b
    uabal2  v31.8h, v2.16b, v3.16b      // DC
    sub     x6, x6, #1
    cbnz    x6, sad_intra_16x16_x3_opt_loop0

    // Reduce the accumulators and add the lambda penalty
    saddlv  s29, v29.8h
    fmov    w0, s29                     // top cost
    saddlv  s30, v30.8h
    fmov    w1, s30
    add     w1, w1, w5, lsl #1          // left cost
    saddlv  s31, v31.8h
    fmov    w2, s31
    add     w2, w2, w5, lsl #1          // DC cost

    SELECT_BEST_COST w0

    str     w7, [x4]
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// SATD of a 4x4 source block against its top, left and DC predictions; picks
// the cheapest of the three and writes that prediction to the output buffer.
//   x0   : reconstructed block (neighbours are read around it)
//   w1   : row pitch of the reconstructed block
//   x2   : source block
//   w3   : row pitch of the source block
//   x4   : 16-byte output buffer receiving the winning 4x4 prediction
//   x5   : receives the index of the winning candidate (0 = top, 1 = left, 2 = DC)
//   w6   : penalty added to the DC cost
//   w7   : penalty added to the left cost
//   [sp] : penalty added to the top cost
// Returns the winning cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
    // The pitches are 32-bit arguments but are used as 64-bit offsets below;
    // the upper halves of x1/x3 are unspecified on entry, so widen them first.
    sxtw    x1, w1
    sxtw    x3, w3

    sub     x9, x0, x1
    ld1     {v16.s}[0], [x9]            // top
    sub     x9, x0, #1
    ld1     {v16.b}[4], [x9], x1        // left, one byte per row
    ld1     {v16.b}[5], [x9], x1
    ld1     {v16.b}[6], [x9], x1
    ld1     {v16.b}[7], [x9], x1

    uaddlv  h2, v16.8b
    uqrshrn b17, h2, #3                 // DC pixel value, kept in v17.b[0]
    urshr   v2.4h, v2.4h, #3
    shl     v2.4h, v2.4h, #4            // DC reference coefficient

    // Calculate the 4x4_v / 4x4_h reference coefficients and save to "v6, v7"
    ushll   v4.8h, v16.8b, #2
    ins     v5.d[0], v4.d[1]
    trn1    v6.2s, v4.2s, v5.2s
    trn2    v7.2s, v4.2s, v5.2s

    add     v4.4h, v6.4h, v7.4h
    sub     v5.4h, v6.4h, v7.4h
    trn1    v6.4h, v4.4h, v5.4h
    trn2    v7.4h, v4.4h, v5.4h
    add     v4.4h, v6.4h, v7.4h
    sub     v5.4h, v6.4h, v7.4h
    trn1    v6.2s, v4.2s, v5.2s
    trn2    v7.2s, v4.2s, v5.2s         // {0,1,3,2,top} v6 {0,1,3,2,left} v7

    eor     v31.16b, v31.16b, v31.16b   // Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b   // Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b   // Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b   // For zero register

    // Load the p_enc data and save to "v22 ~ v23" --- 4x4 bytes
    ld1     {v22.s}[0], [x2], x3
    ld1     {v22.s}[1], [x2], x3
    ld1     {v23.s}[0], [x2], x3
    ld1     {v23.s}[1], [x2], x3

    HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l

    ldr     x11, [sp, #0]
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w0, s29
    add     w0, w0, w11                 // top cost

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w7                  // left cost

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w2, s31
    add     w2, w2, w6                  // DC cost

    mov     w10, w0
    SELECT_BEST_COST w10                // w10 = best cost, w7 = winning index

    str     w7, [x5]

    // Pick the prediction to write from the index that was just stored, not
    // by re-comparing costs: with equal costs a cost comparison can select a
    // different candidate than the one reported through [x5].
    cmp     w7, #2
    b.ne    satd_intra_4x4_x3_opt_jump0
    dup     v0.16b, v17.b[0]            // DC: same value in all 16 pixels
    st1     {v0.16b}, [x4]
    b       satd_intra_4x4_x3_opt_end

satd_intra_4x4_x3_opt_jump0:
    cmp     w7, #1
    b.ne    satd_intra_4x4_x3_opt_jump1
    dup     v0.16b, v16.b[4]            // left: each row repeats its left pixel
    dup     v1.16b, v16.b[5]
    dup     v2.16b, v16.b[6]
    dup     v3.16b, v16.b[7]
    st4     {v0.s,v1.s,v2.s,v3.s}[0], [x4]
    b       satd_intra_4x4_x3_opt_end

satd_intra_4x4_x3_opt_jump1:
    st1     {v16.S}[0], [x4], #4        // top: the top row copied into all four rows
    st1     {v16.S}[0], [x4], #4
    st1     {v16.S}[0], [x4], #4
    st1     {v16.S}[0], [x4]
satd_intra_4x4_x3_opt_end:
    mov     w0, w10

WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// SATD of two 8x8 planes against their top, left and DC predictions; picks the
// cheapest of the three.
//   x0      : first reconstructed plane (neighbours are read around it)
//   w1      : row pitch of the reconstructed planes
//   x2      : first source plane
//   w3      : row pitch of the source planes
//   x4      : receives the index of the winning candidate (0 = DC, 1 = left, 2 = top)
//   w5      : lambda; 2 * lambda is added to the top and left costs
//   x7      : second reconstructed plane
//   [sp]    : second source plane
// Returns the winning cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
    // The pitches are 32-bit arguments but are used as 64-bit offsets below;
    // the upper halves of x1/x3 are unspecified on entry, so widen them first.
    sxtw    x1, w1
    sxtw    x3, w3
    ldr     x11, [sp, #0]

    LOAD_CHROMA_DATA x0, v0.8b, v0.b

    LOAD_CHROMA_DATA x7, v1.8b, v1.b

    // Calculate the 16x16_v mode SATD and save to "v6, v7"
    ushll   v4.8h, v0.8b, #2
    ushll   v5.8h, v1.8b, #2
    GET_16X16_V_SATD

    // Calculate the 16x16_h mode SATD and save to "v16, v17"
    ushll2  v4.8h, v0.16b, #2
    ushll2  v5.8h, v1.16b, #2
    GET_16X16_H_SATD

    // Per-quadrant neighbour sums of the first plane
    uaddlp  v0.8h, v0.16b
    uaddlp  v2.4s, v0.8h
    ins     v3.d[0], v2.d[1]
    add     v3.2s, v2.2s, v3.2s

    // Per-quadrant neighbour sums of the second plane
    uaddlp  v1.8h, v1.16b
    uaddlp  v4.4s, v1.8h
    ins     v5.d[0], v4.d[1]
    add     v5.2s, v4.2s, v5.2s

    trn2    v0.4s, v2.4s, v4.4s
    urshr   v0.4s, v0.4s, #2
    urshr   v3.2s, v3.2s, #3
    urshr   v5.2s, v5.2s, #3

    ushll   v22.2d, v0.2s, #4           // {1cb, 1cr}
    ushll2  v23.2d, v0.4s, #4           // {2cb, 2cr}
    ushll   v24.2d, v3.2s, #4           // {0cb, 3cb}
    ushll   v25.2d, v5.2s, #4           // {0cr, 3cr}

    eor     v31.16b, v31.16b, v31.16b   // Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b   // Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b   // Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b   // For zero register

    // Expose the high halves of the v/h coefficient vectors as .4h operands
    ins     v18.d[0], v6.d[1]
    ins     v19.d[0], v7.d[1]
    ins     v26.d[0], v16.d[1]
    ins     v27.d[0], v17.d[1]

    // First plane, rows 0..3
    LOAD_8X4_DATA x2

    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Second plane, rows 0..3
    LOAD_8X4_DATA x11

    ins     v22.d[0], v22.d[1]
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // First plane, rows 4..7
    LOAD_8X4_DATA x2

    ins     v24.d[0], v24.d[1]
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Second plane, rows 4..7
    LOAD_8X4_DATA x11

    ins     v23.d[0], v23.d[1]
    ins     v25.d[0], v25.d[1]
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Halve, reduce and add the lambda penalty
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w2, s29
    add     w2, w2, w5, lsl #1          // top cost

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w5, lsl #1          // left cost

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w0, s31                     // DC cost

    SELECT_BEST_COST w0

    str     w7, [x4]
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
|
||||
// SATD of a 16x16 source block against its top, left and DC predictions; picks
// the cheapest of the three.
//   x0 : reconstructed block (neighbours are read around it)
//   w1 : row pitch of the reconstructed block
//   x2 : source block
//   w3 : row pitch of the source block
//   x4 : receives the index of the winning candidate (0 = top, 1 = left, 2 = DC)
//   w5 : lambda; 2 * lambda is added to the left and DC costs
// Returns the winning cost in w0.
WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
    // The pitches are 32-bit arguments but are used as 64-bit offsets below;
    // the upper halves of x1/x3 are unspecified on entry, so widen them first.
    sxtw    x1, w1
    sxtw    x3, w3

    LOAD_LUMA_DATA

    // DC reference coefficient: ((sum(top) + sum(left) + 16) >> 5) << 4
    uaddlv  h2, v0.16b
    uaddlv  h3, v1.16b
    add     v2.8h, v2.8h, v3.8h
    urshr   v2.4h, v2.4h, #5
    shl     v2.4h, v2.4h, #4

    // Calculate the 16x16_v mode SATD and save to "v6, v7"
    ushll   v4.8h, v0.8b, #2
    ushll2  v5.8h, v0.16b, #2
    GET_16X16_V_SATD

    // Calculate the 16x16_h mode SATD and save to "v16, v17"
    ushll   v4.8h, v1.8b, #2
    ushll2  v5.8h, v1.16b, #2
    GET_16X16_H_SATD

    eor     v31.16b, v31.16b, v31.16b   // Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b   // Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b   // Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b   // For zero register

    // Expose the high halves of the v/h coefficient vectors as .4h operands
    ins     v18.d[0], v6.d[1]
    ins     v19.d[0], v7.d[1]
    ins     v26.d[0], v16.d[1]
    ins     v27.d[0], v17.d[1]

    // Rows 0..3
    LOAD_16X4_DATA

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 4..7
    LOAD_16X4_DATA

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 8..11
    LOAD_16X4_DATA

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 12..15
    LOAD_16X4_DATA

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Halve, reduce and add the lambda penalty
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w0, s29                     // top cost

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w5, lsl #1          // left cost

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w2, s31
    add     w2, w2, w5, lsl #1          // DC cost

    SELECT_BEST_COST w0

    str     w7, [x4]

WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
#endif
|
@ -108,6 +108,14 @@ int32_t WelsSampleSatd16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSatd8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsIntra16x16Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
|
||||
int32_t WelsIntra16x16Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
|
||||
int32_t WelsIntra8x8Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,
|
||||
uint8_t*);
|
||||
int32_t WelsIntra8x8Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,
|
||||
uint8_t*);
|
||||
int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,
|
||||
int32_t);
|
||||
#endif
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
|
@ -433,6 +433,12 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_AArch64_neon;
|
||||
|
||||
pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_AArch64_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -60,6 +60,7 @@ endif
|
||||
ifeq ($(ASM_ARCH), arm64)
|
||||
ENCODER_ASM_ARM64_SRCS=\
|
||||
$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
|
||||
$(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_neon_aarch64.S\
|
||||
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
|
||||
|
||||
ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
|
||||
|
Loading…
Reference in New Issue
Block a user