Add Sad arm64 code
This commit is contained in:
parent
3c4d151e03
commit
e6c9eb9824
@ -12,6 +12,7 @@
|
||||
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
|
||||
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; };
|
||||
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; };
|
||||
4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */; };
|
||||
4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
|
||||
4CE4432318B6FFA00017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4432218B6FFA00017DF25 /* XCTest.framework */; };
|
||||
4CE4432418B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
|
||||
@ -81,6 +82,7 @@
|
||||
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
|
||||
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
|
||||
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
|
||||
4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = pixel_neon_aarch64.S; path = arm64/pixel_neon_aarch64.S; sourceTree = "<group>"; };
|
||||
4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
|
||||
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
||||
@ -210,6 +212,14 @@
|
||||
path = arm;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
4CB8F2B219235FAC005D6386 /* arm64 */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */,
|
||||
);
|
||||
name = arm64;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
4CE4430818B6FFA00017DF25 = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
@ -270,6 +280,7 @@
|
||||
4CE446A118BC605B0017DF25 /* core */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
4CB8F2B219235FAC005D6386 /* arm64 */,
|
||||
4C34066418C57D0400DFA14A /* arm */,
|
||||
4CE446A918BC605C0017DF25 /* inc */,
|
||||
4CE446DC18BC605C0017DF25 /* src */,
|
||||
@ -507,6 +518,7 @@
|
||||
4CE4471918BC605C0017DF25 /* memory_align.cpp in Sources */,
|
||||
4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */,
|
||||
4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */,
|
||||
4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */,
|
||||
4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */,
|
||||
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */,
|
||||
4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */,
|
||||
@ -648,6 +660,7 @@
|
||||
"$(SRCROOT)/../../../../processing/interface",
|
||||
"$(SRCROOT)/../../../../api/svc",
|
||||
"$(SRCROOT)/../../../../common/arm",
|
||||
"$(SRCROOT)/../../../../common/arm64",
|
||||
);
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 6.1;
|
||||
ONLY_ACTIVE_ARCH = NO;
|
||||
@ -686,6 +699,7 @@
|
||||
"$(SRCROOT)/../../../../processing/interface",
|
||||
"$(SRCROOT)/../../../../api/svc",
|
||||
"$(SRCROOT)/../../../../common/arm",
|
||||
"$(SRCROOT)/../../../../common/arm64",
|
||||
);
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 6.1;
|
||||
OTHER_LDFLAGS = "-ObjC";
|
||||
|
@ -89,6 +89,19 @@ void WelsSampleSadFour4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
|
||||
#endif
|
||||
|
||||
// AArch64 NEON SAD (sum of absolute differences) kernels, implemented in
// core/arm64/pixel_neon_aarch64.S.
#if defined (HAVE_NEON_AARCH64)
|
||||
// Single-block SAD. Arguments, in order: first block pointer, first block row
// stride, second block pointer, second block row stride. Returns the SAD.
int32_t WelsSampleSad4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
int32_t WelsSampleSad8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t);
|
||||
|
||||
// Four-neighbour SAD. Same first four arguments; the last argument receives four
// int32 results for the second block shifted by one row up, one row down,
// one byte left and one byte right, in that order.
void WelsSampleSadFour16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
void WelsSampleSadFour4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
|
||||
#endif
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif//__cplusplus
|
||||
|
398
codec/encoder/core/arm64/pixel_neon_aarch64.S
Executable file
398
codec/encoder/core/arm64/pixel_neon_aarch64.S
Executable file
@ -0,0 +1,398 @@
|
||||
/*!
|
||||
* \copy
|
||||
* Copyright (c) 2013, Cisco Systems
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
.text
|
||||
#include "arm_arch64_common_macro.S"
|
||||
|
||||
// Reduce the eight 16-bit partial sums held in v2 to one 32-bit total
// and leave it in w0 (the integer return register of the callers below).
.macro CALC_AND_STORE_SAD
|
||||
// Unsigned pairwise widen: eight 16-bit lanes become four 32-bit lanes.
uaddlp v2.4s, v2.8h
|
||||
addp v2.4s, v2.4s, v2.4s
|
||||
addp v2.2s, v2.2s, v2.2s
|
||||
umov w0, v2.s[0]
|
||||
.endm
|
||||
|
||||
// Reduce the four SAD accumulators v28, v29, v30 and v31 (eight 16-bit
// partial sums each) to four 32-bit totals and store them, in that order,
// as four consecutive words at [x4]. Clobbers v0 and v1.
.macro CALC_AND_STORE_SAD_FOUR
|
||||
// Three rounds of 16-bit pairwise adds leave
// v0.4h = { sum(v28), sum(v29), sum(v30), sum(v31) }.
addp v0.8h, v28.8h, v29.8h
|
||||
addp v1.8h, v30.8h, v31.8h
|
||||
addp v0.8h, v0.8h, v1.8h
|
||||
addp v0.8h, v0.8h, v0.8h
|
||||
eor v1.8b, v1.8b, v1.8b
|
||||
// Widen as UNSIGNED. A 16x16 SAD can reach 256 * 255 = 65280, which fits an
// unsigned 16-bit lane but has bit 15 set; a signed widen (saddl) would turn
// every total above 32767 into a negative 32-bit result.
uaddl v0.4s, v0.4h, v1.4h
|
||||
st1 {v0.4s}, [x4]
|
||||
.endm
|
||||
|
||||
// Load eight rows of eight bytes from [x0] into v0..v7 (one row per register),
// post-incrementing x0 by x1 after every row.
.macro LOAD_8X8_1
|
||||
ld1 {v0.8b}, [x0], x1
|
||||
ld1 {v1.8b}, [x0], x1
|
||||
ld1 {v2.8b}, [x0], x1
|
||||
ld1 {v3.8b}, [x0], x1
|
||||
ld1 {v4.8b}, [x0], x1
|
||||
ld1 {v5.8b}, [x0], x1
|
||||
ld1 {v6.8b}, [x0], x1
|
||||
ld1 {v7.8b}, [x0], x1
|
||||
.endm
|
||||
|
||||
// Load eight rows of eight bytes into v16..v23 (one row per register).
// Argument 0 is the address register to read from; it is post-incremented
// by x3 after every row, so it ends eight rows further on.
.macro LOAD_8X8_2
|
||||
ld1 {v16.8b}, [$0], x3
|
||||
ld1 {v17.8b}, [$0], x3
|
||||
ld1 {v18.8b}, [$0], x3
|
||||
ld1 {v19.8b}, [$0], x3
|
||||
ld1 {v20.8b}, [$0], x3
|
||||
ld1 {v21.8b}, [$0], x3
|
||||
ld1 {v22.8b}, [$0], x3
|
||||
ld1 {v23.8b}, [$0], x3
|
||||
.endm
|
||||
|
||||
// Sum |v0..v7 - v16..v23| row by row (low eight bytes of each register) into
// 16-bit lanes.
// Argument 0: destination accumulator, written as an .8h vector.
// Argument 1: letter spliced into the first mnemonic: d gives uabdl (start a
//             new sum), a gives uabal (add to the existing accumulator).
.macro CALC_ABS_8X8_1
|
||||
uab$1l $0, v0.8b, v16.8b
|
||||
uabal $0, v1.8b, v17.8b
|
||||
uabal $0, v2.8b, v18.8b
|
||||
uabal $0, v3.8b, v19.8b
|
||||
uabal $0, v4.8b, v20.8b
|
||||
uabal $0, v5.8b, v21.8b
|
||||
uabal $0, v6.8b, v22.8b
|
||||
uabal $0, v7.8b, v23.8b
|
||||
.endm
|
||||
|
||||
// Sum |v0..v7 - v18..v25| row by row (low eight bytes of each register) into
// v29.8h, i.e. the same rows as CALC_ABS_8X8_1 but with the second operand
// taken two registers further on.
// Argument 0: letter spliced into the first mnemonic: d gives uabdl (start a
//             new sum), a gives uabal (add to the existing v29).
.macro CALC_ABS_8X8_2
|
||||
uab$0l v29.8h, v0.8b, v18.8b
|
||||
uabal v29.8h, v1.8b, v19.8b
|
||||
uabal v29.8h, v2.8b, v20.8b
|
||||
uabal v29.8h, v3.8b, v21.8b
|
||||
uabal v29.8h, v4.8b, v22.8b
|
||||
uabal v29.8h, v5.8b, v23.8b
|
||||
uabal v29.8h, v6.8b, v24.8b
|
||||
uabal v29.8h, v7.8b, v25.8b
|
||||
.endm
|
||||
|
||||
// Load eight rows of sixteen bytes from [x0] into v0..v7 (one row per
// register), post-incrementing x0 by x1 after every row.
.macro LOAD_16X8_1
|
||||
ld1 {v0.16b}, [x0], x1
|
||||
ld1 {v1.16b}, [x0], x1
|
||||
ld1 {v2.16b}, [x0], x1
|
||||
ld1 {v3.16b}, [x0], x1
|
||||
ld1 {v4.16b}, [x0], x1
|
||||
ld1 {v5.16b}, [x0], x1
|
||||
ld1 {v6.16b}, [x0], x1
|
||||
ld1 {v7.16b}, [x0], x1
|
||||
.endm
|
||||
|
||||
// Load eight rows of sixteen bytes into v16..v23 (one row per register).
// Argument 0 is the address register to read from; it is post-incremented
// by x3 after every row, so it ends eight rows further on.
.macro LOAD_16X8_2
|
||||
ld1 {v16.16b}, [$0], x3
|
||||
ld1 {v17.16b}, [$0], x3
|
||||
ld1 {v18.16b}, [$0], x3
|
||||
ld1 {v19.16b}, [$0], x3
|
||||
ld1 {v20.16b}, [$0], x3
|
||||
ld1 {v21.16b}, [$0], x3
|
||||
ld1 {v22.16b}, [$0], x3
|
||||
ld1 {v23.16b}, [$0], x3
|
||||
.endm
|
||||
|
||||
// Sum |v0..v7 - v16..v23| over eight 16-byte rows into 16-bit lanes.
// For every row the plain instruction handles the low eight bytes and the
// uabal2 form adds the high eight bytes into the same lanes.
// Argument 0: destination accumulator, written as an .8h vector.
// Argument 1: letter spliced into the first mnemonic: d gives uabdl (start a
//             new sum), a gives uabal (add to the existing accumulator).
.macro CALC_ABS_16X8_1
|
||||
uab$1l $0, v0.8b, v16.8b
|
||||
uabal2 $0, v0.16b,v16.16b
|
||||
uabal $0, v1.8b, v17.8b
|
||||
uabal2 $0, v1.16b,v17.16b
|
||||
uabal $0, v2.8b, v18.8b
|
||||
uabal2 $0, v2.16b,v18.16b
|
||||
uabal $0, v3.8b, v19.8b
|
||||
uabal2 $0, v3.16b,v19.16b
|
||||
uabal $0, v4.8b, v20.8b
|
||||
uabal2 $0, v4.16b,v20.16b
|
||||
uabal $0, v5.8b, v21.8b
|
||||
uabal2 $0, v5.16b,v21.16b
|
||||
uabal $0, v6.8b, v22.8b
|
||||
uabal2 $0, v6.16b,v22.16b
|
||||
uabal $0, v7.8b, v23.8b
|
||||
uabal2 $0, v7.16b,v23.16b
|
||||
.endm
|
||||
|
||||
// Sum |v0..v7 - v18..v25| over eight 16-byte rows into v29.8h, i.e. the same
// rows as CALC_ABS_16X8_1 but with the second operand taken two registers
// further on. The uabal2 form adds the high eight bytes of each row.
// Argument 0: letter spliced into the first mnemonic: d gives uabdl (start a
//             new sum), a gives uabal (add to the existing v29).
.macro CALC_ABS_16X8_2
|
||||
uab$0l v29.8h, v0.8b, v18.8b
|
||||
uabal2 v29.8h, v0.16b,v18.16b
|
||||
uabal v29.8h, v1.8b, v19.8b
|
||||
uabal2 v29.8h, v1.16b,v19.16b
|
||||
uabal v29.8h, v2.8b, v20.8b
|
||||
uabal2 v29.8h, v2.16b,v20.16b
|
||||
uabal v29.8h, v3.8b, v21.8b
|
||||
uabal2 v29.8h, v3.16b,v21.16b
|
||||
uabal v29.8h, v4.8b, v22.8b
|
||||
uabal2 v29.8h, v4.16b,v22.16b
|
||||
uabal v29.8h, v5.8b, v23.8b
|
||||
uabal2 v29.8h, v5.16b,v23.16b
|
||||
uabal v29.8h, v6.8b, v24.8b
|
||||
uabal2 v29.8h, v6.16b,v24.16b
|
||||
uabal v29.8h, v7.8b, v25.8b
|
||||
uabal2 v29.8h, v7.16b,v25.16b
|
||||
.endm
|
||||
|
||||
// SAD of two 4x4 byte blocks.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row stride.
// Out: w0 = sum of absolute differences over the 4 rows of 4 bytes.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
// Each row is loaded into 32-bit lane 0 only (four bytes).
ld1 {v0.s}[0], [x0], x1
|
||||
ld1 {v1.s}[0], [x2], x3
|
||||
uabdl v2.8h, v0.8b, v1.8b
|
||||
.rept 3
|
||||
ld1 {v0.s}[0], [x0], x1
|
||||
ld1 {v1.s}[0], [x2], x3
|
||||
uabal v2.8h, v0.8b, v1.8b
|
||||
.endr
|
||||
// Only the low four 16-bit lanes correspond to the loaded bytes, so the
// reduction reads v2.4h and ignores the upper lanes.
uaddlp v2.2s, v2.4h
|
||||
addp v2.2s, v2.2s, v2.2s
|
||||
umov w0, v2.s[0]
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// SAD of two blocks of 8 rows by 8 bytes.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row stride.
// Out: w0 = sum of absolute differences.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
// First row starts the accumulator (uabdl); the remaining 7 rows add to it.
ld1 {v0.8b}, [x0], x1
|
||||
ld1 {v1.8b}, [x2], x3
|
||||
uabdl v2.8h, v0.8b, v1.8b
|
||||
.rept 7
|
||||
ld1 {v0.8b}, [x0], x1
|
||||
ld1 {v1.8b}, [x2], x3
|
||||
uabal v2.8h, v0.8b, v1.8b
|
||||
.endr
|
||||
CALC_AND_STORE_SAD
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// SAD of two blocks of 16 rows by 8 bytes.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row stride.
// Out: w0 = sum of absolute differences.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
// First row starts the accumulator (uabdl); the remaining 15 rows add to it.
ld1 {v0.8b}, [x0], x1
|
||||
ld1 {v1.8b}, [x2], x3
|
||||
uabdl v2.8h, v0.8b, v1.8b
|
||||
.rept 15
|
||||
ld1 {v0.8b}, [x0], x1
|
||||
ld1 {v1.8b}, [x2], x3
|
||||
uabal v2.8h, v0.8b, v1.8b
|
||||
.endr
|
||||
CALC_AND_STORE_SAD
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// SAD of two blocks of 8 rows by 16 bytes.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row stride.
// Out: w0 = sum of absolute differences.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
ld1 {v0.16b}, [x0], x1
|
||||
ld1 {v1.16b}, [x2], x3
|
||||
// Low eight bytes start the accumulator; uabal2 adds the high eight bytes.
uabdl v2.8h, v0.8b, v1.8b
|
||||
uabal2 v2.8h, v0.16b, v1.16b
|
||||
.rept 7
|
||||
ld1 {v0.16b}, [x0], x1
|
||||
ld1 {v1.16b}, [x2], x3
|
||||
uabal v2.8h, v0.8b, v1.8b
|
||||
uabal2 v2.8h, v0.16b, v1.16b
|
||||
.endr
|
||||
CALC_AND_STORE_SAD
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// SAD of two blocks of 16 rows by 16 bytes.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row stride.
// Out: w0 = sum of absolute differences.
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
ld1 {v0.16b}, [x0], x1
|
||||
ld1 {v1.16b}, [x2], x3
|
||||
// Low eight bytes start the accumulator; uabal2 adds the high eight bytes.
uabdl v2.8h, v0.8b, v1.8b
|
||||
uabal2 v2.8h, v0.16b, v1.16b
|
||||
.rept 15
|
||||
ld1 {v0.16b}, [x0], x1
|
||||
ld1 {v1.16b}, [x2], x3
|
||||
uabal v2.8h, v0.8b, v1.8b
|
||||
uabal2 v2.8h, v0.16b, v1.16b
|
||||
.endr
|
||||
CALC_AND_STORE_SAD
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// Four SADs of the 4x4 block at x0 against the 4x4 block at x2 shifted by one
// row up, one row down, one byte left and one byte right.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row
//      stride, x4 = output array of four 32-bit words.
// Out: [x4] = { up, down, left, right } (accumulated in v28, v29, v30, v31).
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
// First block: rows 0,1 packed into v0 and rows 2,3 into v1 (two 4-byte rows
// per 64-bit register).
ld1 {v0.s}[0], [x0], x1
|
||||
ld1 {v0.s}[1], [x0], x1
|
||||
ld1 {v1.s}[0], [x0], x1
|
||||
ld1 {v1.s}[1], [x0]
|
||||
// Second block, six rows starting one row above x2:
// v2 = rows -1,0   v3 = rows 1,2   v4 = rows 3,4.
sub x0, x2, x3
|
||||
ld1 {v2.s}[0], [x0], x3
|
||||
ld1 {v2.s}[1], [x0], x3
|
||||
ld1 {v3.s}[0], [x0], x3
|
||||
ld1 {v3.s}[1], [x0], x3
|
||||
ld1 {v4.s}[0], [x0], x3
|
||||
ld1 {v4.s}[1], [x0], x3
|
||||
|
||||
// v28: against rows -1..2 (shifted up).
uabdl v28.8h, v0.8b, v2.8b
|
||||
uabal v28.8h, v1.8b, v3.8b
|
||||
|
||||
// v29: against rows 1..4 (shifted down).
uabdl v29.8h, v0.8b, v3.8b
|
||||
uabal v29.8h, v1.8b, v4.8b
|
||||
|
||||
// v30: against the block starting one byte to the left.
sub x0, x2, #1
|
||||
ld1 {v2.s}[0], [x0], x3
|
||||
ld1 {v2.s}[1], [x0], x3
|
||||
ld1 {v3.s}[0], [x0], x3
|
||||
ld1 {v3.s}[1], [x0]
|
||||
uabdl v30.8h, v0.8b, v2.8b
|
||||
uabal v30.8h, v1.8b, v3.8b
|
||||
|
||||
// v31: against the block starting one byte to the right.
add x0, x2, #1
|
||||
ld1 {v2.s}[0], [x0], x3
|
||||
ld1 {v2.s}[1], [x0], x3
|
||||
ld1 {v3.s}[0], [x0], x3
|
||||
ld1 {v3.s}[1], [x0]
|
||||
uabdl v31.8h, v0.8b, v2.8b
|
||||
uabal v31.8h, v1.8b, v3.8b
|
||||
|
||||
CALC_AND_STORE_SAD_FOUR
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// Four SADs of the block of 8 rows by 8 bytes at x0 against the block at x2
// shifted by one row up, one row down, one byte left and one byte right.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row
//      stride, x4 = output array of four 32-bit words.
// Out: [x4] = { up, down, left, right } (accumulated in v28, v29, v30, v31).
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
LOAD_8X8_1
|
||||
// Ten rows of the second block starting one row above x2:
// v16..v23 = rows -1..6, v24 = row 7, v25 = row 8.
sub x0, x2, x3
|
||||
LOAD_8X8_2 x0
|
||||
ld1 {v24.8b}, [x0], x3
|
||||
ld1 {v25.8b}, [x0]
|
||||
|
||||
// v28 uses v16..v23 (shifted up); v29 uses v18..v25 (shifted down).
CALC_ABS_8X8_1 v28.8h, d
|
||||
CALC_ABS_8X8_2 d
|
||||
|
||||
// v30: block starting one byte to the left.
sub x0, x2, #1
|
||||
LOAD_8X8_2 x0
|
||||
CALC_ABS_8X8_1 v30.8h, d
|
||||
|
||||
// v31: block starting one byte to the right.
add x0, x2, #1
|
||||
LOAD_8X8_2 x0
|
||||
CALC_ABS_8X8_1 v31.8h, d
|
||||
|
||||
CALC_AND_STORE_SAD_FOUR
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// Four SADs of the block of 16 rows by 8 bytes at x0 against the block at x2
// shifted by one row up, one row down, one byte left and one byte right.
// The work is done as two passes of eight rows; x5, x6 and x7 are running
// pointers into the up/down, left and right neighbours.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row
//      stride, x4 = output array of four 32-bit words.
// Out: [x4] = { up, down, left, right } (accumulated in v28, v29, v30, v31).
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
// Pass 1: rows 0..7 of the first block.
LOAD_8X8_1
|
||||
// Ten rows of the second block starting one row above x2:
// v16..v23 = rows -1..6, v24 = row 7, v25 = row 8.
sub x5, x2, x3
|
||||
LOAD_8X8_2 x5
|
||||
ld1 {v24.8b}, [x5], x3
|
||||
ld1 {v25.8b}, [x5], x3
|
||||
|
||||
// d: start fresh sums. v28 uses v16..v23 (up), v29 uses v18..v25 (down).
CALC_ABS_8X8_1 v28.8h, d
|
||||
CALC_ABS_8X8_2 d
|
||||
|
||||
// v30: block starting one byte to the left.
sub x6, x2, #1
|
||||
LOAD_8X8_2 x6
|
||||
CALC_ABS_8X8_1 v30.8h, d
|
||||
|
||||
// v31: block starting one byte to the right.
add x7, x2, #1
|
||||
LOAD_8X8_2 x7
|
||||
CALC_ABS_8X8_1 v31.8h, d
|
||||
|
||||
// Pass 2: rows 8..15 of the first block (x0 was advanced by LOAD_8X8_1).
LOAD_8X8_1
|
||||
// x5 was advanced ten rows in pass 1; step back two so the next window
// starts at row 7 of the second block.
sub x5, x5, x3
|
||||
sub x5, x5, x3
|
||||
LOAD_8X8_2 x5
|
||||
ld1 {v24.8b}, [x5], x3
|
||||
ld1 {v25.8b}, [x5]
|
||||
|
||||
// a: accumulate onto the pass 1 sums.
CALC_ABS_8X8_1 v28.8h, a
|
||||
CALC_ABS_8X8_2 a
|
||||
|
||||
// x6 and x7 already point eight rows further on after pass 1.
LOAD_8X8_2 x6
|
||||
CALC_ABS_8X8_1 v30.8h, a
|
||||
|
||||
LOAD_8X8_2 x7
|
||||
CALC_ABS_8X8_1 v31.8h, a
|
||||
|
||||
CALC_AND_STORE_SAD_FOUR
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// Four SADs of the block of 8 rows by 16 bytes at x0 against the block at x2
// shifted by one row up, one row down, one byte left and one byte right.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row
//      stride, x4 = output array of four 32-bit words.
// Out: [x4] = { up, down, left, right } (accumulated in v28, v29, v30, v31).
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
LOAD_16X8_1
|
||||
// Ten rows of the second block starting one row above x2:
// v16..v23 = rows -1..6, v24 = row 7, v25 = row 8.
sub x0, x2, x3
|
||||
LOAD_16X8_2 x0
|
||||
ld1 {v24.16b}, [x0], x3
|
||||
ld1 {v25.16b}, [x0]
|
||||
|
||||
// v28 uses v16..v23 (shifted up); v29 uses v18..v25 (shifted down).
CALC_ABS_16X8_1 v28.8h, d
|
||||
CALC_ABS_16X8_2 d
|
||||
|
||||
// v30: block starting one byte to the left.
sub x0, x2, #1
|
||||
LOAD_16X8_2 x0
|
||||
CALC_ABS_16X8_1 v30.8h, d
|
||||
|
||||
// v31: block starting one byte to the right.
add x0, x2, #1
|
||||
LOAD_16X8_2 x0
|
||||
CALC_ABS_16X8_1 v31.8h, d
|
||||
|
||||
CALC_AND_STORE_SAD_FOUR
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
|
||||
// Four SADs of the block of 16 rows by 16 bytes at x0 against the block at x2
// shifted by one row up, one row down, one byte left and one byte right.
// The work is done as two passes of eight rows; x5, x6 and x7 are running
// pointers into the up/down, left and right neighbours.
// In:  x0 = first block, w1 = its row stride, x2 = second block, w3 = its row
//      stride, x4 = output array of four 32-bit words.
// Out: [x4] = { up, down, left, right } (accumulated in v28, v29, v30, v31).
WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon
|
||||
// Sign-extend the 32-bit strides so they can be used as 64-bit post-index offsets.
sxtw x1, w1
|
||||
sxtw x3, w3
|
||||
|
||||
// Pass 1: rows 0..7 of the first block.
LOAD_16X8_1
|
||||
// Ten rows of the second block starting one row above x2:
// v16..v23 = rows -1..6, v24 = row 7, v25 = row 8.
sub x5, x2, x3
|
||||
LOAD_16X8_2 x5
|
||||
ld1 {v24.16b}, [x5], x3
|
||||
ld1 {v25.16b}, [x5], x3
|
||||
|
||||
// d: start fresh sums. v28 uses v16..v23 (up), v29 uses v18..v25 (down).
CALC_ABS_16X8_1 v28.8h, d
|
||||
CALC_ABS_16X8_2 d
|
||||
|
||||
// v30: block starting one byte to the left.
sub x6, x2, #1
|
||||
LOAD_16X8_2 x6
|
||||
CALC_ABS_16X8_1 v30.8h, d
|
||||
|
||||
// v31: block starting one byte to the right.
add x7, x2, #1
|
||||
LOAD_16X8_2 x7
|
||||
CALC_ABS_16X8_1 v31.8h, d
|
||||
|
||||
// Pass 2: rows 8..15 of the first block (x0 was advanced by LOAD_16X8_1).
LOAD_16X8_1
|
||||
// x5 was advanced ten rows in pass 1; step back two so the next window
// starts at row 7 of the second block.
sub x5, x5, x3
|
||||
sub x5, x5, x3
|
||||
LOAD_16X8_2 x5
|
||||
ld1 {v24.16b}, [x5], x3
|
||||
ld1 {v25.16b}, [x5]
|
||||
|
||||
// a: accumulate onto the pass 1 sums.
CALC_ABS_16X8_1 v28.8h, a
|
||||
CALC_ABS_16X8_2 a
|
||||
|
||||
// x6 and x7 already point eight rows further on after pass 1.
LOAD_16X8_2 x6
|
||||
CALC_ABS_16X8_1 v30.8h, a
|
||||
|
||||
LOAD_16X8_2 x7
|
||||
CALC_ABS_16X8_1 v31.8h, a
|
||||
|
||||
CALC_AND_STORE_SAD_FOUR
|
||||
WELS_ASM_ARCH64_FUNC_END
|
||||
#endif
|
@ -413,6 +413,23 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined (HAVE_NEON_AARCH64)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_AArch64_neon;
|
||||
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_AArch64_neon;
|
||||
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_AArch64_neon;
|
||||
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace WelsSVCEnc
|
||||
|
@ -60,6 +60,13 @@ ENCODER_ASM_ARM_SRCS=\
|
||||
ENCODER_OBJS += $(ENCODER_ASM_ARM_SRCS:.S=.$(OBJ))
|
||||
endif
|
||||
|
||||
ifeq ($(ASM_ARCH), arm64)
|
||||
ENCODER_ASM_ARM64_SRCS=\
|
||||
$(ENCODER_SRCDIR)/core/arm64/pixel_neon_aarch64.S\
|
||||
|
||||
ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
|
||||
endif
|
||||
|
||||
OBJS += $(ENCODER_OBJS)
|
||||
$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.cpp
|
||||
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
|
||||
|
Loading…
x
Reference in New Issue
Block a user