From 947e3c64fe52a231ef464b45d1b3c1142eefa1ed Mon Sep 17 00:00:00 2001 From: zhiliang wang Date: Fri, 13 Jun 2014 15:21:48 +0800 Subject: [PATCH] Add arm64 code for intra-pred --- .../welsdec/welsdec.xcodeproj/project.pbxproj | 14 + .../welsenc/welsenc.xcodeproj/project.pbxproj | 12 +- .../core/arm64/intra_pred_aarch64_neon.S | 525 ++++++++++++++++++ codec/decoder/core/inc/get_intra_predictor.h | 25 + codec/decoder/core/src/decoder.cpp | 29 + codec/decoder/targets.mk | 7 + .../core/arm64/intra_pred_aarch64_neon.S | 504 +++++++++++++++++ ...el_neon_aarch64.S => pixel_aarch64_neon.S} | 0 codec/encoder/core/inc/get_intra_predictor.h | 26 + .../encoder/core/src/get_intra_predictor.cpp | 28 + codec/encoder/targets.mk | 3 +- 11 files changed, 1168 insertions(+), 5 deletions(-) create mode 100755 codec/decoder/core/arm64/intra_pred_aarch64_neon.S create mode 100755 codec/encoder/core/arm64/intra_pred_aarch64_neon.S rename codec/encoder/core/arm64/{pixel_neon_aarch64.S => pixel_aarch64_neon.S} (100%) diff --git a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj index 8bfbe4b7..99e7aaa4 100644 --- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj +++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 4CBC1B81194AC4E100214D9E /* intra_pred_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */; }; 4CE4427D18B6FC360017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4427C18B6FC360017DF25 /* Foundation.framework */; }; 4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4466718BC5EAA0017DF25 /* au_parser.cpp */; }; 4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */; }; @@ -48,6 +49,7 @@ /* End 
PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_aarch64_neon.S; path = arm64/intra_pred_aarch64_neon.S; sourceTree = ""; }; 4CE4427918B6FC360017DF25 /* libwelsdec.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsdec.a; sourceTree = BUILT_PRODUCTS_DIR; }; 4CE4427C18B6FC360017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 4CE4428D18B6FC360017DF25 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = Library/Frameworks/UIKit.framework; sourceTree = DEVELOPER_DIR; }; @@ -127,6 +129,14 @@ /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ + 4CBC1B7F194AC4A400214D9E /* arm64 */ = { + isa = PBXGroup; + children = ( + 4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */, + ); + name = arm64; + sourceTree = ""; + }; 4CE4427018B6FC360017DF25 = { isa = PBXGroup; children = ( @@ -166,6 +176,7 @@ 4CE4463F18BC5EAA0017DF25 /* core */ = { isa = PBXGroup; children = ( + 4CBC1B7F194AC4A400214D9E /* arm64 */, 4CE447A518BC6BE90017DF25 /* arm */, 4CE4464418BC5EAA0017DF25 /* inc */, 4CE4466618BC5EAA0017DF25 /* src */, @@ -343,6 +354,7 @@ 4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */, 9AED66561946A1DE009A3567 /* welsCodecTrace.cpp in Sources */, F0B204FC18FD23D8005DA23F /* error_concealment.cpp in Sources */, + 4CBC1B81194AC4E100214D9E /* intra_pred_aarch64_neon.S in Sources */, 4CE4469018BC5EAB0017DF25 /* decoder_core.cpp in Sources */, 4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */, 4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */, @@ -458,6 +470,7 @@ 
"$(SRCROOT)/../../../../common/inc", "$(SRCROOT)/../../../../api/svc", "$(SRCROOT)/../../../../common/arm", + "$(SRCROOT)/../../../../common/arm64", ); IPHONEOS_DEPLOYMENT_TARGET = 6.1; OTHER_LDFLAGS = "-ObjC"; @@ -492,6 +505,7 @@ "$(SRCROOT)/../../../../common/inc", "$(SRCROOT)/../../../../api/svc", "$(SRCROOT)/../../../../common/arm", + "$(SRCROOT)/../../../../common/arm64", ); IPHONEOS_DEPLOYMENT_TARGET = 6.1; OTHER_LDFLAGS = "-ObjC"; diff --git a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj index c675cff1..3a60112d 100644 --- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj +++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj @@ -12,7 +12,8 @@ 4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; }; 4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; }; 4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; }; - 4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */; }; + 4CB8F2B419235FC5005D6386 /* pixel_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */; }; + 4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */; }; 4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; }; 4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446DD18BC605C0017DF25 /* au_set.cpp */; }; 4CE4470F18BC605C0017DF25 /* deblocking.cpp in 
Sources */ = {isa = PBXBuildFile; fileRef = 4CE446DE18BC605C0017DF25 /* deblocking.cpp */; }; @@ -65,7 +66,8 @@ 4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = ""; }; 4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = ""; }; 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = ""; }; - 4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = pixel_neon_aarch64.S; path = arm64/pixel_neon_aarch64.S; sourceTree = ""; }; + 4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = pixel_aarch64_neon.S; path = arm64/pixel_aarch64_neon.S; sourceTree = ""; }; + 4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_aarch64_neon.S; path = arm64/intra_pred_aarch64_neon.S; sourceTree = ""; }; 4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = ""; }; 4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; }; 4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; @@ -180,7 +182,8 @@ 4CB8F2B219235FAC005D6386 /* arm64 */ = { isa = PBXGroup; children = ( - 
4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */, + 4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */, + 4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */, ); name = arm64; sourceTree = ""; @@ -421,6 +424,7 @@ 4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */, 4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */, 4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */, + 4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */, 4CE4471718BC605C0017DF25 /* mc.cpp in Sources */, 4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */, 4CE4471818BC605C0017DF25 /* md.cpp in Sources */, @@ -428,7 +432,7 @@ 4CE4471918BC605C0017DF25 /* memory_align.cpp in Sources */, 4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */, 4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */, - 4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */, + 4CB8F2B419235FC5005D6386 /* pixel_aarch64_neon.S in Sources */, 4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */, 4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */, 4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */, diff --git a/codec/decoder/core/arm64/intra_pred_aarch64_neon.S b/codec/decoder/core/arm64/intra_pred_aarch64_neon.S new file mode 100755 index 00000000..0768f48b --- /dev/null +++ b/codec/decoder/core/arm64/intra_pred_aarch64_neon.S @@ -0,0 +1,525 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+// for Luma 4x4
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredH_AArch64_neon
+    sxtw x1, w1 // x0 = pPred, w1 = kiStride (int32_t): widen the stride for addressing
+    sub x2, x0, #1 // x2 -> pixel to the left of the current row
+.rept 4
+    ld1r {v0.8b}, [x2], x1 // broadcast the left pixel, then step down one row
+    st1 {v0.S}[0], [x0], x1 // store it across the 4-pixel row
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDc_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    sub x3, x0, #1 // x3 -> column left of the block
+    ldr s0, [x2] // v0.b[0..3] = 4 top pixels
+    ld1 {v0.b}[4], [x3], x1 // v0.b[4..7] = 4 left pixels
+    ld1 {v0.b}[5], [x3], x1
+    ld1 {v0.b}[6], [x3], x1
+    ld1 {v0.b}[7], [x3]
+    uaddlv h0, v0.8b // sum of the 8 neighbours
+    uqrshrn b0, h0, #3 // rounded divide by 8
+    dup v0.8b, v0.b[0]
+.rept 4
+    st1 {v0.S}[0], [x0], x1 // fill every row with the DC value
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDcTop_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    sub v0.8b, v0.8b, v0.8b // clear v0 before loading the 4 top pixels
+    ldr s0, [x2]
+    uaddlv h0, v0.8b // bytes 4..7 are zero, so this sums the 4 top pixels
+    uqrshrn v0.8b, v0.8h, #2 // rounded divide by 4
+    dup v0.8b, v0.b[0]
+.rept 4
+    st1 {v0.S}[0], [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.8b}, [x2] // v0 = top[0..7]
+    dup v1.8b, v0.b[7] // top[7] replicated, used as padding past the end of the row
+    ext v2.8b, v0.8b, v1.8b, #1 // top[1..7], top[7]
+    ext v3.8b, v0.8b, v1.8b, #2 // top[2..7], top[7], top[7]
+    ushll v2.8h, v2.8b, #1 // 2 * top[i+1]
+    uaddl v1.8h, v3.8b, v0.8b // top[i] + top[i+2]
+    add v1.8h, v1.8h, v2.8h
+    uqrshrn v1.8b, v1.8h, #2 // (top[i] + 2*top[i+1] + top[i+2] + 2) >> 2
+    st1 {v1.S}[0], [x0], x1 // row 0; each later row starts one byte further into v1
+    ext v0.8b, v1.8b, v2.8b, #1
+    st1 {v0.S}[0], [x0], x1
+    ext v0.8b, v1.8b, v2.8b, #2
+    st1 {v0.S}[0], [x0], x1
+    ext v0.8b, v1.8b, v2.8b, #3
+    st1 {v0.S}[0], [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDLTop_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.8b}, [x2]
+    dup v1.8b, v0.b[3]
+    mov v0.S[1], v1.S[0] // overwrite top[4..7] with top[3]; the rest matches the DDL routine
+    ext v2.8b, v0.8b, v1.8b, #1
+    ext v3.8b, v0.8b, v1.8b, #2
+    ushll v2.8h, v2.8b, #1
+    uaddl v1.8h, v3.8b, v0.8b
+    add v1.8h, v1.8h, v2.8h
+    uqrshrn v1.8b, v1.8h, #2
+    st1 {v1.S}[0], [x0], x1
+    ext v0.8b, v1.8b, v2.8b, #1
+    st1 {v0.S}[0], [x0], x1
+    ext v0.8b, v1.8b, v2.8b, #2
+    st1 {v0.S}[0], [x0], x1
+    ext v0.8b, v1.8b, v2.8b, #3
+    st1 {v0.S}[0], [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.8b}, [x2] // v0 = top[0..7]
+    ext v1.8b, v0.8b, v0.8b, #1 // top rotated left by one pixel
+    uaddl v1.8h, v1.8b, v0.8b // top[i] + top[i+1]
+    uqrshrn v0.8b, v1.8h, #1 // v0.8b is VL0, VL1, VL2, VL3, VL4, ...
+    ext v2.16b, v1.16b, v1.16b, #2
+    add v1.8h, v2.8h, v1.8h
+    uqrshrn v1.8b, v1.8h, #2 // v1.8b is VL5, VL6, VL7, VL8, VL9
+    st1 {v0.s}[0], [x0], x1 // write the first row
+    st1 {v1.s}[0], [x0], x1 // write the second row
+    ext v3.8b, v0.8b, v0.8b, #1
+    ext v2.8b, v1.8b, v1.8b, #1
+    st1 {v3.s}[0], [x0], x1 // write the third row
+    st1 {v2.s}[0], [x0] // write the fourth row
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVLTop_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.8b}, [x2]
+    dup v1.8b, v0.b[3]
+    mov v0.s[1], v1.s[0] // overwrite top[4..7] with top[3]; the rest matches the VL routine
+    ext v1.8b, v0.8b, v0.8b, #1
+    uaddl v1.8h, v1.8b, v0.8b
+    uqrshrn v0.8b, v1.8h, #1 // v0.8b is VL0, VL1, VL2, VL3, VL4, ...
+    ext v2.16b, v1.16b, v1.16b, #2
+    add v1.8h, v2.8h, v1.8h
+    uqrshrn v1.8b, v1.8h, #2 // v1.8b is VL5, VL6, VL7, VL8, VL9
+    st1 {v0.s}[0], [x0], x1 // write the first row
+    st1 {v1.s}[0], [x0], x1 // write the second row
+    ext v3.8b, v0.8b, v0.8b, #1
+    ext v2.8b, v1.8b, v1.8b, #1
+    st1 {v3.s}[0], [x0], x1 // write the third row
+    st1 {v2.s}[0], [x0] // write the fourth row
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.s}[1], [x2] // v0.b[4..7] = t0..t3
+    sub x2, x2, #1 // x2 -> top-left pixel, then walk down the left column
+    ld1 {v0.b}[3], [x2], x1
+    ld1 {v0.b}[2], [x2], x1
+    ld1 {v0.b}[1], [x2], x1
+    ld1 {v0.b}[0], [x2] // v0.8b l2, l1, l0, lt, t0, t1, t2, t3
+
+    ext v1.8b, v0.8b, v0.8b, #7
+    uaddl v2.8h, v1.8b, v0.8b //v2:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+    ext v1.16b, v2.16b, v2.16b, #14
+    add v3.8h, v2.8h, v1.8h //v3:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+
+    uqrshrn v3.8b, v3.8h, #2 // 3-tap filtered values
+    uqrshrn v2.8b, v2.8h, #1 // 2-tap averages
+
+    st1 {v2.s}[1], [x0], x1 // row 0: averages of lt,t0 .. t2,t3
+    st1 {v3.s}[1], [x0], x1 // row 1: filtered values starting at (l0,lt,t0)
+
+    ext v2.8b, v2.8b, v2.8b, #7
+    ins v2.b[4], v3.b[3] // row 2 = filtered (l1,l0,lt) followed by row 0 shifted right
+    st1 {v2.s}[1], [x0], x1
+
+    ext v3.8b, v3.8b, v3.8b, #7
+    ins v3.b[4], v3.b[3] // row 3 = filtered (l2,l1,l0) followed by row 1 shifted right
+    st1 {v3.s}[1], [x0], x1
+
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, #1 // x2 -> left neighbour of row 0 (L0)
+    mov x3, #3
+    mul x3, x3, x1
+    add x3, x3, x2 // x3 -> left neighbour of row 3 (L3)
+    ld1r {v0.8b}, [x3]
+    ld1 {v0.b}[4], [x2], x1
+    ld1 {v0.b}[5], [x2], x1
+    ld1 {v0.b}[6], [x2], x1 //v0.8b:{L3,L3,L3,L3,L0,L1,L2,L3}
+
+    ext v1.8b, v0.8b, v0.8b, #1
+    uaddl v2.8h, v0.8b, v1.8b //v2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+    ext v3.16b, v2.16b, v2.16b, #2
+    add v3.8h, v3.8h, v2.8h //v3:{x, HU1, HU3, HU5, x}
+
+    uqrshrn v2.8b, v2.8h, #1 // HU0, HU2, HU4
+    uqrshrn v3.8b, v3.8h, #2 // HU1, HU3, HU5
+    zip2 v3.8b, v2.8b, v3.8b // HU0, HU1, HU2, HU3, HU4, HU5
+    mov v3.h[3], v0.h[0] // v3.8b is hu0, hu1, hu2, hu3, hu4, hu5, l3, l3
+    ext v2.8b, v3.8b, v0.8b, #2
+    st1 {v3.s}[0], [x0], x1
+    st1 {v2.s}[0], [x0], x1
+    st1 {v3.s}[1], [x0], x1
+    st1 {v0.s}[0], [x0] // last row is L3 repeated
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, #1
+    sub x2, x2, x1 // x2 points to top left
+    ld1 {v0.s}[1], [x2], x1
+    ld1 {v0.b}[3], [x2], x1
+    ld1 {v0.b}[2], [x2], x1
+    ld1 {v0.b}[1], [x2], x1
+    ld1 {v0.b}[0], [x2] // v0.8b: l3, l2, l1, l0, lt, t0, t1, t2
+    ext v1.8b, v0.8b, v0.8b, #1 // v1.8b: l2, l1, l0, lt, t0, t1, t2, l3
+    uaddl v2.8h, v0.8b, v1.8b
+    ext v3.16b, v2.16b, v2.16b, #2
+    add v3.8h, v3.8h, v2.8h
+    uqrshrn v2.8b, v2.8h, #1 // hd8, hd6, hd4, hd0, xxx
+    uqrshrn v3.8b, v3.8h, #2 // hd9, hd7, hd5, hd1, hd2, hd3
+    zip1 v2.8b, v2.8b, v3.8b // hd8, hd9, hd6, hd7, hd4, hd5, hd0, hd1
+    mov v1.h[0], v3.h[2]
+    ext v3.8b, v2.8b, v1.8b, #6
+    st1 {v3.s}[0], [x0], x1
+    st1 {v2.s}[1], [x0], x1
+    ext v3.8b, v2.8b, v1.8b, #2
+    st1 {v3.s}[0], [x0], x1
+    st1 {v2.s}[0], [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+// for Chroma 8x8
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredV_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.8b}, [x2] // 8 top pixels, copied into all 8 rows
+.rept 8
+    st1 {v0.8b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredH_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, #1 // x2 -> pixel to the left of the current row
+.rept 8
+    ld1r {v0.8b}, [x2], x1 // broadcast the left pixel, then step down one row
+    st1 {v0.8b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredDc_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    sub x3, x0, #1 // x3 -> column left of the block
+    ld1 {v0.8b}, [x2] // v0.b[0..7] = 8 top pixels
+    ld1 {v0.b}[8], [x3], x1 // v0.b[8..15] = 8 left pixels
+    ld1 {v0.b}[9], [x3], x1
+    ld1 {v0.b}[10], [x3], x1
+    ld1 {v0.b}[11], [x3], x1
+    ld1 {v0.b}[12], [x3], x1
+    ld1 {v0.b}[13], [x3], x1
+    ld1 {v0.b}[14], [x3], x1
+    ld1 {v0.b}[15], [x3]
+
+    uaddlp v1.8h, v0.16b
+    uaddlp v2.4s, v1.8h // v2.s = sums of {top[0..3], top[4..7], left[0..3], left[4..7]}
+    ins v3.d[0], v2.d[1]
+    add v3.2s, v2.2s, v3.2s // v3.s = {top[0..3]+left[0..3], top[4..7]+left[4..7]}
+    urshr v2.4s, v2.4s, #2 // rounded 4-pixel means
+    urshr v3.2s, v3.2s, #3 // rounded 8-pixel means
+
+    dup v0.8b, v3.b[0] // top-left quadrant: top[0..3] and left[0..3]
+    dup v1.8b, v2.b[4] // top-right quadrant: top[4..7] only
+    dup v2.8b, v2.b[12] // bottom-left quadrant: left[4..7] only
+    dup v3.8b, v3.b[4] // bottom-right quadrant: top[4..7] and left[4..7]
+    ins v0.s[1], v1.s[0] // v0 = one row of the upper half
+    ins v2.s[1], v3.s[0] // v2 = one row of the lower half
+.rept 4
+    st1 {v0.8b}, [x0], x1
+.endr
+.rept 4
+    st1 {v2.8b}, [x0], x1
+.endr
+
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTop_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.8b}, [x2]
+    uaddlp v0.4h, v0.8b
+    addp v0.8h, v0.8h, v0.8h // v0.h[0] = sum top[0..3], v0.h[1] = sum top[4..7]
+    dup v1.8h, v0.h[0]
+    dup v2.8h, v0.h[1]
+    mov v1.D[1], v2.D[0] // v1.8h = left-half sum x4, right-half sum x4
+    uqrshrn v1.8b, v1.8h, #2 // rounded divide by 4
+.rept 8
+    st1 {v1.8b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+.align 4 // the operand is a power-of-two exponent on AArch64: 2^4 = 16 bytes (16 would request 64 KiB)
+intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
+intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1
+    sub x2, x2, #1 // x2 -> top-left pixel, walks along the top row
+    mov x3, x2 // x3 -> top-left pixel, walks down the left column
+    // load pTop[2-i] and pLeft[(2-i)*kiStride]
+    ld1 {v1.b}[3], [x2], #1
+    ld1 {v1.b}[2], [x2], #1
+    ld1 {v1.b}[1], [x2], #1
+    ld1 {v1.b}[0], [x2], #1
+    ld1 {v1.b}[7], [x3], x1
+    ld1 {v1.b}[6], [x3], x1
+    ld1 {v1.b}[5], [x3], x1
+    ld1 {v1.b}[4], [x3], x1
+    add x2, x2, #1 // skip pTop[3]
+    add x3, x3, x1 // skip pLeft[3*kiStride]
+    // load pTop[4+i] and pLeft[(4+i)*kiStride]
+    ld1 {v0.b}[0], [x2], #1
+    ld1 {v0.b}[1], [x2], #1
+    ld1 {v0.b}[2], [x2], #1
+    ld1 {v0.b}[3], [x2], #1
+    ld1 {v0.b}[4], [x3], x1
+    ld1 {v0.b}[5], [x3], x1
+    ld1 {v0.b}[6], [x3], x1
+    ld1 {v0.b}[7], [x3], x1
+
+    uxtl v1.8h, v1.8b
+    uxtl v0.8h, v0.8b
+    ldr q2, intra_1_to_4
+    ldr q3, intra_m3_to_p4
+    dup v4.8h, v0.h[3] // pTop[7]
+    dup v5.8h, v0.h[7] // pLeft[7*kiStride]
+    add v4.8h, v4.8h, v5.8h
+    sub v0.8h, v0.8h, v1.8h // pairwise differences around the centre
+    shl v4.8h, v4.8h, #4 // v4.8h is a
+    mul v0.8h, v0.8h, v2.8h // v0.h[0-3] is H, v0.h[4-7] is V
+    saddlp v0.4s, v0.8h
+    addp v0.4s, v0.4s, v0.4s // v0.s[0] is H, v0.s[1] is V
+    sqrshrn v0.4h, v0.4s, #5
+    dup v1.8h, v0.h[0] // v1.8h is b
+    dup v0.8h, v0.h[1] // v0.8h is c
+    mla v4.8h, v1.8h, v3.8h // a + b * (x - 3)
+    mla v4.8h, v0.8h, v3.h[0] // + c * (-3): value for the first row
+    sqrshrun v1.8b, v4.8h, #5
+    st1 {v1.8b}, [x0], x1
+.rept 7
+    add v4.8h, v4.8h, v0.8h // next row: add c
+    sqrshrun v1.8b, v4.8h, #5
+    st1 {v1.8b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+//for Luma 16x16
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredV_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.16b}, [x2], x1 // 16 top pixels, copied into all 16 rows
+.rept 16
+    st1 {v0.16b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredH_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, #1 // x2 -> pixel to the left of the current row
+.rept 16
+    ld1r {v0.16b}, [x2], x1 // broadcast the left pixel, then step down one row
+    st1 {v0.16b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1 // x2 -> row above the block
+    sub x3, x0, #1 // x3 -> column left of the block
+    ld1 {v0.16b}, [x2] // v0 = 16 top pixels
+    ld1 {v1.b}[0], [x3], x1 // v1 = 16 left pixels, one byte per row
+    ld1 {v1.b}[1], [x3], x1
+    ld1 {v1.b}[2], [x3], x1
+    ld1 {v1.b}[3], [x3], x1
+    ld1 {v1.b}[4], [x3], x1
+    ld1 {v1.b}[5], [x3], x1
+    ld1 {v1.b}[6], [x3], x1
+    ld1 {v1.b}[7], [x3], x1
+    ld1 {v1.b}[8], [x3], x1
+    ld1 {v1.b}[9], [x3], x1
+    ld1 {v1.b}[10], [x3], x1
+    ld1 {v1.b}[11], [x3], x1
+    ld1 {v1.b}[12], [x3], x1
+    ld1 {v1.b}[13], [x3], x1
+    ld1 {v1.b}[14], [x3], x1
+    ld1 {v1.b}[15], [x3]
+    // reduce instruction
+    uaddlv h0, v0.16b
+    uaddlv h1, v1.16b
+    add v0.8h, v0.8h, v1.8h // sum of all 32 neighbours in v0.h[0]
+    uqrshrn b0, h0, #5 // rounded divide by 32
+    dup v0.16b, v0.b[0]
+.rept 16
+    st1 {v0.16b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcTop_AArch64_neon
+    sxtw x1, w1 // x0 = pPred, w1 = kiStride (int32_t): widen the stride for addressing
+    sub x2, x0, x1 // x2 -> row above the block
+    ld1 {v0.16b}, [x2] // v0 = 16 top pixels
+    // reduce instruction
+    uaddlv h0, v0.16b
+    uqrshrn v0.8b, v0.8h, 4 // rounded divide by 16
+    dup v0.16b, v0.b[0]
+.rept 16
+    st1 {v0.16b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcLeft_AArch64_neon
+    sxtw x1, w1
+    sub x3, x0, #1 // x3 -> column left of the block
+    ld1 {v1.b}[0], [x3], x1 // v1 = 16 left pixels, one byte per row
+    ld1 {v1.b}[1], [x3], x1
+    ld1 {v1.b}[2], [x3], x1
+    ld1 {v1.b}[3], [x3], x1
+    ld1 {v1.b}[4], [x3], x1
+    ld1 {v1.b}[5], [x3], x1
+    ld1 {v1.b}[6], [x3], x1
+    ld1 {v1.b}[7], [x3], x1
+    ld1 {v1.b}[8], [x3], x1
+    ld1 {v1.b}[9], [x3], x1
+    ld1 {v1.b}[10], [x3], x1
+    ld1 {v1.b}[11], [x3], x1
+    ld1 {v1.b}[12], [x3], x1
+    ld1 {v1.b}[13], [x3], x1
+    ld1 {v1.b}[14], [x3], x1
+    ld1 {v1.b}[15], [x3]
+    // reduce instruction
+    uaddlv h1, v1.16b
+    uqrshrn v0.8b, v1.8h, #4 // rounded divide by 16
+    dup v0.16b, v0.b[0]
+.rept 16
+    st1 {v0.16b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+
+.align 4 // the operand is a power-of-two exponent on AArch64: 2^4 = 16 bytes (16 would request 64 KiB)
+intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
+intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
+    sxtw x1, w1
+    sub x2, x0, x1
+    sub x2, x2, #1 // x2 -> top-left pixel, walks down the left column
+    mov x3, x2 // x3 -> top-left pixel, used for the top row
+    ld1 {v0.8b}, [x3] // v0 low 8 bit in top(reverse order)
+    add x3, x3, #9
+    rev64 v0.8b, v0.8b // reverse v0
+    ld1 {v1.8b}, [x3] // v1 high 8 bit in top
+    uxtl v0.8h, v0.8b // extend to 16 bit integer
+    uxtl v1.8h, v1.8b // extend to 16 bit integer
+    ld1 {v2.b}[7], [x2], x1
+    ld1 {v2.b}[6], [x2], x1
+    ld1 {v2.b}[5], [x2], x1
+    ld1 {v2.b}[4], [x2], x1
+    ld1 {v2.b}[3], [x2], x1
+    ld1 {v2.b}[2], [x2], x1
+    ld1 {v2.b}[1], [x2], x1
+    ld1 {v2.b}[0], [x2], x1 // v2.8b low 8 bit in left
+    add x2, x2, x1 // skip the middle row of the left column
+    ld1 {v3.b}[0], [x2], x1
+    ld1 {v3.b}[1], [x2], x1
+    ld1 {v3.b}[2], [x2], x1
+    ld1 {v3.b}[3], [x2], x1
+    ld1 {v3.b}[4], [x2], x1
+    ld1 {v3.b}[5], [x2], x1
+    ld1 {v3.b}[6], [x2], x1
+    ld1 {v3.b}[7], [x2] // v3.8b high 8bit in left
+    uxtl v2.8h, v2.8b
+    uxtl v3.8h, v3.8b
+    sub v0.8h, v1.8h, v0.8h // top differences around the centre
+    sub v2.8h, v3.8h, v2.8h // left differences around the centre
+    ldr q4, intra_1_to_8
+    mul v0.8h, v0.8h, v4.8h
+    mul v2.8h, v2.8h, v4.8h
+    saddlv s0, v0.8h
+    saddlv s2, v2.8h
+    add v1.8h, v1.8h, v3.8h
+    sqrshrn v0.4h, v0.4S, #6 // b is in v0.h[0]
+    sqrshrn v2.4h, v2.4S, #6 // c is in v2.h[0]
+    shl v1.8h, v1.8h, #4 // a is in v1.h[7]
+    ldr q4, intra_m7_to_p8
+    ldr q5, intra_m7_to_p8 + 16
+    dup v1.8h, v1.h[7]
+    dup v3.8h, v1.h[7]
+    mla v1.8h, v4.8h, v0.h[0] // a + b * (x - 7) for x = 0..7
+    mla v3.8h, v5.8h, v0.h[0] // a + b * (x - 7) for x = 8..15
+    dup v2.8h, v2.h[0] // v2.8h is [cccccccc]
+    mla v1.8h, v2.8h, v4.h[0] // + c * (-7): values for the first row
+    mla v3.8h, v2.8h, v4.h[0]
+    sqrshrun v4.8b, v1.8h, #5
+    sqrshrun2 v4.16b, v3.8h, #5
+    st1 {v4.16b}, [x0], x1
+.rept 15
+    add v1.8h, v1.8h, v2.8h // next row: add c
+    add v3.8h, v3.8h, v2.8h
+    sqrshrun v4.8b, v1.8h, #5
+    sqrshrun2 v4.16b, v3.8h, #5
+    st1 {v4.16b}, [x0], x1
+.endr
+WELS_ASM_ARCH64_FUNC_END
+#endif
\ No newline at end of file
diff --git a/codec/decoder/core/inc/get_intra_predictor.h b/codec/decoder/core/inc/get_intra_predictor.h
index 1c528cb2..2da61036 100644
--- a/codec/decoder/core/inc/get_intra_predictor.h
+++ b/codec/decoder/core/inc/get_intra_predictor.h
@@ -128,6 +128,31 @@ void WelsDecoderIChromaPredDc_neon (uint8_t* pPred, const int32_t kiStride);
 void WelsDecoderIChromaPredPlane_neon (uint8_t* pPred, const int32_t kiStride);
 #endif//HAVE_NEON
 
+#if defined(HAVE_NEON_AARCH64)
+void WelsDecoderI16x16LumaPredV_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredH_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDc_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDcTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDcLeft_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+
+void WelsDecoderI4x4LumaPredH_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredDDL_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredDDLTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredVL_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredVLTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredVR_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredHU_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredHD_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredDc_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredDcTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+
+void WelsDecoderIChromaPredV_AArch64_neon (uint8_t* pPred, const int32_t kiStride); // 8x8 chroma predictors
+void WelsDecoderIChromaPredH_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredDc_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredPlane_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredDcTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+#endif//HAVE_NEON_AARCH64
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
diff --git a/codec/decoder/core/src/decoder.cpp b/codec/decoder/core/src/decoder.cpp
index ad9351da..4c56cf2b 100644
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -669,6 +669,35 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
   }
 #endif//HAVE_NEON
 
+#if defined(HAVE_NEON_AARCH64)
+  if (pCtx->uiCpuFlag & WELS_CPU_NEON) { // install the AArch64 predictors only when the NEON capability bit is set
+    //pCtx->pIdctResAddPredFunc = IdctResAddPred_neon; // NOTE(review): left disabled; presumably no AArch64 IDCT routine exists yet - confirm
+
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_AArch64_neon; // 16x16 luma predictors
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_L] = WelsDecoderI16x16LumaPredDcLeft_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T] = WelsDecoderI16x16LumaPredDcTop_AArch64_neon;
+
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_AArch64_neon; // 4x4 luma predictors
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL_TOP] = WelsDecoderI4x4LumaPredDDLTop_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL_TOP ] = WelsDecoderI4x4LumaPredVLTop_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC ] = WelsDecoderI4x4LumaPredDc_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_T ] = WelsDecoderI4x4LumaPredDcTop_AArch64_neon;
+
+    pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_AArch64_neon; // 8x8 chroma predictors
+    pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_AArch64_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_AArch64_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDc_AArch64_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC_T] = WelsDecoderIChromaPredDcTop_AArch64_neon;
+  }
+#endif//HAVE_NEON_AARCH64
 
 #if defined(X86_ASM)
   if (pCtx->uiCpuFlag & WELS_CPU_MMXEXT) {
diff --git a/codec/decoder/targets.mk b/codec/decoder/targets.mk
index 3e105221..03e58673 100644
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -39,6 +39,13 @@ DECODER_ASM_ARM_SRCS=\
 DECODER_OBJS += $(DECODER_ASM_ARM_SRCS:.S=.$(OBJ))
 endif
 
+ifeq ($(ASM_ARCH), arm64)
+DECODER_ASM_ARM64_SRCS=\
+	$(DECODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
+
+DECODER_OBJS += $(DECODER_ASM_ARM64_SRCS:.S=.$(OBJ))
+endif
+
 OBJS += $(DECODER_OBJS)
 $(DECODER_SRCDIR)/%.$(OBJ): $(DECODER_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(DECODER_CFLAGS) $(DECODER_INCLUDES) -c $(CXX_O) $<
diff --git a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
new file mode 100755
index 00000000..cc4db47e
--- /dev/null
+++ b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
@@ -0,0 +1,504 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+// for Luma 4x4
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredH_AArch64_neon
+    sub x3, x1, #1 // x0: output, written as packed 4-byte rows; x1: source pixels; x2: source stride
+.rept 4
+    ld1r {v0.8b}, [x3], x2 // NOTE(review): x2 is used as a 64-bit stride without the sxtw the decoder variants apply - confirm the prototype
+    st1 {v0.S}[0], [x0], 4 // output advances by 4 bytes per row, not by the stride
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDc_AArch64_neon
+    sub x3, x1, x2 // x3 -> row above the source block
+    sub x4, x1, #1 // x4 -> column left of the source block
+    ldr s0, [x3] // v0.b[0..3] = 4 top pixels
+    ld1 {v0.b}[4], [x4], x2 // v0.b[4..7] = 4 left pixels
+    ld1 {v0.b}[5], [x4], x2
+    ld1 {v0.b}[6], [x4], x2
+    ld1 {v0.b}[7], [x4]
+    uaddlv h0, v0.8b // sum of the 8 neighbours
+    uqrshrn b0, h0, #3 // rounded divide by 8
+    dup v0.8b, v0.b[0]
+.rept 4
+    st1 {v0.S}[0], [x0], 4
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDcTop_AArch64_neon
+    sub x3, x1, x2 // x3 -> row above the source block
+    sub v0.8b, v0.8b, v0.8b // clear v0 before loading the 4 top pixels
+    ldr s0, [x3]
+    uaddlv h0, v0.8b
+    uqrshrn v0.8b, v0.8h, #2 // rounded divide by 4
+    dup v0.8b, v0.b[0]
+.rept 4
+    st1 {v0.S}[0], [x0], 4
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDDL_AArch64_neon
+    sub x3, x1, x2 // x3 -> row above the source block
+    ld1 {v0.8b}, [x3] // v0 = top[0..7]
+    dup v1.8b, v0.b[7] // top[7] replicated, used as padding past the end of the row
+    ext v2.8b, v0.8b, v1.8b, #1
+    ext v3.8b, v0.8b, v1.8b, #2
+    ushll v2.8h, v2.8b, #1 // 2 * top[i+1]
+    uaddl v1.8h, v3.8b, v0.8b // top[i] + top[i+2]
+    add v1.8h, v1.8h, v2.8h
+    uqrshrn v1.8b, v1.8h, #2 // (top[i] + 2*top[i+1] + top[i+2] + 2) >> 2
+    st1 {v1.S}[0], [x0], 4 // row 0; each later row starts one byte further into v1
+    ext v0.8b, v1.8b, v2.8b, #1
+    st1 {v0.S}[0], [x0], 4
+    ext v0.8b, v1.8b, v2.8b, #2
+    st1 {v0.S}[0], [x0], 4
+    ext v0.8b, v1.8b, v2.8b, #3
+    st1 {v0.S}[0], [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDDLTop_AArch64_neon
+    sub x3, x1, x2 // x3 -> row above the source block
+    ld1 {v0.8b}, [x3]
+    dup v1.8b, v0.b[3]
+    mov v0.S[1], v1.S[0] // overwrite top[4..7] with top[3]; the rest matches the DDL routine
+    ext v2.8b, v0.8b, v1.8b, #1
+    ext v3.8b, v0.8b, v1.8b, #2
+    ushll v2.8h, v2.8b, #1
+    uaddl v1.8h, v3.8b, v0.8b
+    add v1.8h, v1.8h, v2.8h
+    uqrshrn v1.8b, v1.8h, #2
+    st1 {v1.S}[0], [x0], 4
+    ext v0.8b, v1.8b, v2.8b, #1
+    st1 {v0.S}[0], [x0], 4
+    ext v0.8b, v1.8b, v2.8b, #2
+    st1 {v0.S}[0], [x0], 4
+    ext v0.8b, v1.8b, v2.8b, #3
+    st1 {v0.S}[0], [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVL_AArch64_neon
+    sub x3, x1, x2 // x3 -> row above the source block
+    ld1 {v0.8b}, [x3] // v0 = top[0..7]
+    ext v1.8b, v0.8b, v0.8b, #1 // top rotated left by one pixel
+    uaddl v1.8h, v1.8b, v0.8b // top[i] + top[i+1]
+    uqrshrn v0.8b, v1.8h, #1 // v0.8b is VL0, VL1, VL2, VL3, VL4, ...
+    ext v2.16b, v1.16b, v1.16b, #2
+    add v1.8h, v2.8h, v1.8h
+    uqrshrn v1.8b, v1.8h, #2 // v1.8b is VL5, VL6, VL7, VL8, VL9
+    st1 {v0.s}[0], [x0], 4 // write the first row
+    st1 {v1.s}[0], [x0], 4 // write the second row
+    ext v3.8b, v0.8b, v0.8b, #1
+    ext v2.8b, v1.8b, v1.8b, #1
+    st1 {v3.s}[0], [x0], 4 // write the third row
+    st1 {v2.s}[0], [x0] // write the fourth row
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVLTop_AArch64_neon
+    sub x3, x1, x2 // x3 -> row above the source block
+    ld1 {v0.8b}, [x3]
+    dup v1.8b, v0.b[3]
+    mov v0.s[1], v1.s[0] // overwrite top[4..7] with top[3]; the rest matches the VL routine
+    ext v1.8b, v0.8b, v0.8b, #1
+    uaddl v1.8h, v1.8b, v0.8b
+    uqrshrn v0.8b, v1.8h, #1 // v0.8b is VL0, VL1, VL2, VL3, VL4, ...
+    ext v2.16b, v1.16b, v1.16b, #2
+    add v1.8h, v2.8h, v1.8h
+    uqrshrn v1.8b, v1.8h, #2 // v1.8b is VL5, VL6, VL7, VL8, VL9
+    st1 {v0.s}[0], [x0], 4 // write the first row
+    st1 {v1.s}[0], [x0], 4 // write the second row
+    ext v3.8b, v0.8b, v0.8b, #1
+    ext v2.8b, v1.8b, v1.8b, #1
+    st1 {v3.s}[0], [x0], 4 // write the third row
+    st1 {v2.s}[0], [x0] // write the fourth row
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVR_AArch64_neon
+    sub x3, x1, x2 // x3 -> row above the source block
+    ld1 {v0.s}[1], [x3] // v0.b[4..7] = t0..t3
+    sub x3, x3, #1 // x3 -> top-left pixel, then walk down the left column
+    ld1 {v0.b}[3], [x3], x2
+    ld1 {v0.b}[2], [x3], x2
+    ld1 {v0.b}[1], [x3], x2
+    ld1 {v0.b}[0], [x3] // v0.8b l2, l1, l0, lt, t0, t1, t2, t3
+
+    ext v1.8b, v0.8b, v0.8b, #7
+    uaddl v2.8h, v1.8b, v0.8b //v2:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+    ext v1.16b, v2.16b, v2.16b, #14
+    add v3.8h, v2.8h, v1.8h //v3:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+
+    uqrshrn v3.8b, v3.8h, #2 // 3-tap filtered values
+    uqrshrn v2.8b, v2.8h, #1 // 2-tap averages
+
+    st1 {v2.s}[1], [x0], 4 // row 0
+    st1 {v3.s}[1], [x0], 4 // row 1
+
+    ext v2.8b, v2.8b, v2.8b, #7
+    ins v2.b[4], v3.b[3] // row 2 = filtered (l1,l0,lt) followed by row 0 shifted right
+    st1 {v2.s}[1], [x0], 4
+
+    ext v3.8b, v3.8b, v3.8b, #7
+    ins v3.b[4], v3.b[3] // row 3 = filtered (l2,l1,l0) followed by row 1 shifted right
+    st1 {v3.s}[1], [x0]
+
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredHU_AArch64_neon
+    sub x3, x1, #1 // x3 -> left neighbour of row 0 (L0)
+    mov x4, #3
+    mul x4, x4, x2
+    add x4, x4, x3 // x4 -> left neighbour of row 3 (L3)
+    ld1r {v0.8b}, [x4]
+    ld1 {v0.b}[4], [x3], x2
+    ld1 {v0.b}[5], [x3], x2
+    ld1 {v0.b}[6], [x3], x2 //v0.8b:{L3,L3,L3,L3,L0,L1,L2,L3}
+
+    ext v1.8b, v0.8b, v0.8b, #1
+    uaddl v2.8h, v0.8b, v1.8b //v2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+    ext v3.16b, v2.16b, v2.16b, #2
+    add v3.8h, v3.8h, v2.8h //v3:{x, HU1, HU3, HU5, x}
+
+    uqrshrn v2.8b, v2.8h, #1 // HU0, HU2, HU4
+    uqrshrn v3.8b, v3.8h, #2 // HU1, HU3, HU5
+    zip2 v3.8b, v2.8b, v3.8b // HU0, HU1, HU2, HU3, HU4, HU5
+    mov v3.h[3], v0.h[0] // v3.8b is hu0, hu1, hu2, hu3, hu4, hu5, l3, l3
+    ext v2.8b, v3.8b, v0.8b, #2
+    st1 {v3.s}[0], [x0], 4
+    st1 {v2.s}[0], [x0], 4
+    st1 {v3.s}[1], [x0], 4
+    st1 {v0.s}[0], [x0] // last row is L3 repeated
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredHD_AArch64_neon
+    sub x3, x1, #1
+    sub x3, x3, x2 // x3 points to top left
+    ld1 {v0.s}[1], [x3], x2
+    ld1 {v0.b}[3], [x3], x2
+    ld1 {v0.b}[2], [x3], x2
+    ld1 {v0.b}[1], [x3], x2
+    ld1 {v0.b}[0], [x3] // v0.8b: l3, l2, l1, l0, lt, t0, t1, t2
+    ext v1.8b, v0.8b, v0.8b, #1 // v1.8b: l2, l1, l0, lt, t0, t1, t2, l3
+    uaddl v2.8h, v0.8b, v1.8b
+    ext v3.16b, v2.16b, v2.16b, #2
+    add v3.8h, v3.8h, v2.8h
+    uqrshrn v2.8b, v2.8h, #1 // hd8, hd6, hd4, hd0, xxx
+    uqrshrn v3.8b, v3.8h, #2 // hd9, hd7, hd5, hd1, hd2, hd3
+    zip1 v2.8b, v2.8b, v3.8b // hd8, hd9, hd6, hd7, hd4, hd5, hd0, hd1
+    mov v1.h[0], v3.h[2]
+    ext v3.8b, v2.8b, v1.8b, #6
+    st1 {v3.s}[0], [x0], 4
+    st1 {v2.s}[1], [x0], 4
+    ext v3.8b, v2.8b, v1.8b, #2
+    st1 {v3.s}[0], [x0], 4
+    st1 {v2.s}[0], [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+// for Chroma 8x8
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredV_AArch64_neon
+    sub x3, x1, x2 // x3 -> row above the source block
+    ld1 {v0.8b}, [x3] // 8 top pixels, copied into all 8 rows
+.rept 8
+    st1 {v0.8b}, [x0], 8 // output rows are packed 8 bytes apart
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredH_AArch64_neon
+    sub x3, x1, #1 // x3 -> pixel to the left of the current source row
+.rept 8
+    ld1r {v0.8b}, [x3], x2 // broadcast the left pixel, then step down one source row
+    st1 {v0.8b}, [x0], 8
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredDc_AArch64_neon + sub x3, x1, x2 + sub x4, x1, #1 + ld1 {v0.8b}, [x3] + ld1 {v0.b}[8], [x4], x2 + ld1 {v0.b}[9], [x4], x2 + ld1 {v0.b}[10], [x4], x2 + ld1 {v0.b}[11], [x4], x2 + ld1 {v0.b}[12], [x4], x2 + ld1 {v0.b}[13], [x4], x2 + ld1 {v0.b}[14], [x4], x2 + ld1 {v0.b}[15], [x4] + + uaddlp v1.8h, v0.16b + uaddlp v2.4s, v1.8h + ins v3.d[0], v2.d[1] + add v3.2s, v2.2s, v3.2s + urshr v2.4s, v2.4s, #2 + urshr v3.2s, v3.2s, #3 + + dup v0.8b, v3.b[0] + dup v1.8b, v2.b[4] + dup v2.8b, v2.b[12] + dup v3.8b, v3.b[4] + ins v0.s[1], v1.s[0] + ins v2.s[1], v3.s[0] +.rept 4 + st1 {v0.8b}, [x0], 8 +.endr +.rept 4 + st1 {v2.8b}, [x0], 8 +.endr + +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch64_neon + sub x3, x1, x2 + ld1 {v0.8b}, [x3] + uaddlp v0.4h, v0.8b + addp v0.8h, v0.8h, v0.8h + dup v1.8h, v0.h[0] + dup v2.8h, v0.h[1] + mov v1.D[1], v2.D[0] + uqrshrn v1.8b, v1.8h, #2 +.rept 8 + st1 {v1.8b}, [x0], 8 +.endr +WELS_ASM_ARCH64_FUNC_END + +.align 16 +intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4 +intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4 + +WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon + sub x3, x1, x2 + sub x3, x3, #1 + mov x4, x3 + // load pTop[2-i] and pLeft[(2-i)*kiStride] + ld1 {v1.b}[3], [x3], #1 + ld1 {v1.b}[2], [x3], #1 + ld1 {v1.b}[1], [x3], #1 + ld1 {v1.b}[0], [x3], #1 + ld1 {v1.b}[7], [x4], x2 + ld1 {v1.b}[6], [x4], x2 + ld1 {v1.b}[5], [x4], x2 + ld1 {v1.b}[4], [x4], x2 + add x3, x3, #1 + add x4, x4, x2 + // load pTop[4+i] and pLeft[(4+i)*kiStride] + ld1 {v0.b}[0], [x3], #1 + ld1 {v0.b}[1], [x3], #1 + ld1 {v0.b}[2], [x3], #1 + ld1 {v0.b}[3], [x3], #1 + ld1 {v0.b}[4], [x4], x2 + ld1 {v0.b}[5], [x4], x2 + ld1 {v0.b}[6], [x4], x2 + ld1 {v0.b}[7], [x4], x2 + + uxtl v1.8h, v1.8b + uxtl v0.8h, v0.8b + ldr q2, intra_1_to_4 + ldr q3, intra_m3_to_p4 + dup v4.8h, v0.h[3] + dup v5.8h, v0.h[7] + add v4.8h, v4.8h, v5.8h + sub v0.8h, v0.8h, v1.8h + shl v4.8h, 
v4.8h, #4 // v4.8h is a + mul v0.8h, v0.8h, v2.8h // v0.h[0-3] is H, v0.h[4-7] is V + saddlp v0.4s, v0.8h + addp v0.4s, v0.4s, v0.4s // v0.s[0] is H, v0.s[1] is V + sqrshrn v0.4h, v0.4s, #5 + dup v1.8h, v0.h[0] // v1.8h is b + dup v0.8h, v0.h[1] // v0.8h is c + mla v4.8h, v1.8h, v3.8h + mla v4.8h, v0.8h, v3.h[0] + sqrshrun v1.8b, v4.8h, #5 + st1 {v1.8b}, [x0], 8 +.rept 7 + add v4.8h, v4.8h, v0.8h + sqrshrun v1.8b, v4.8h, #5 + st1 {v1.8b}, [x0], 8 +.endr +WELS_ASM_ARCH64_FUNC_END + +//for Luma 16x16 +WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon + sub x3, x1, x2 + ld1 {v0.16b}, [x3], x1 +.rept 16 + st1 {v0.16b}, [x0], 16 +.endr +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon + sub x3, x1, #1 +.rept 16 + ld1r {v0.16b}, [x3], x2 + st1 {v0.16b}, [x0], 16 +.endr +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDc_AArch64_neon + sub x3, x1, x2 + sub x4, x1, #1 + ld1 {v0.16b}, [x3] + ld1 {v1.b}[0], [x4], x2 + ld1 {v1.b}[1], [x4], x2 + ld1 {v1.b}[2], [x4], x2 + ld1 {v1.b}[3], [x4], x2 + ld1 {v1.b}[4], [x4], x2 + ld1 {v1.b}[5], [x4], x2 + ld1 {v1.b}[6], [x4], x2 + ld1 {v1.b}[7], [x4], x2 + ld1 {v1.b}[8], [x4], x2 + ld1 {v1.b}[9], [x4], x2 + ld1 {v1.b}[10], [x4], x2 + ld1 {v1.b}[11], [x4], x2 + ld1 {v1.b}[12], [x4], x2 + ld1 {v1.b}[13], [x4], x2 + ld1 {v1.b}[14], [x4], x2 + ld1 {v1.b}[15], [x4] + // reduce instruction + uaddlv h0, v0.16b + uaddlv h1, v1.16b + add v0.8h, v0.8h, v1.8h + uqrshrn b0, h0, #5 + dup v0.16b, v0.b[0] +.rept 16 + st1 {v0.16b}, [x0], 16 +.endr +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDcTop_AArch64_neon + sub x3, x1, x2 + ld1 {v0.16b}, [x3] + // reduce instruction + uaddlv h0, v0.16b + uqrshrn v0.8b, v0.8h, 4 + dup v0.16b, v0.b[0] +.rept 16 + st1 {v0.16b}, [x0], 16 +.endr +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_AArch64_neon + sub x3, x1, #1 + ld1 {v1.b}[0], [x3], x2 + ld1 {v1.b}[1], [x3], x2 + ld1 
{v1.b}[2], [x3], x2 + ld1 {v1.b}[3], [x3], x2 + ld1 {v1.b}[4], [x3], x2 + ld1 {v1.b}[5], [x3], x2 + ld1 {v1.b}[6], [x3], x2 + ld1 {v1.b}[7], [x3], x2 + ld1 {v1.b}[8], [x3], x2 + ld1 {v1.b}[9], [x3], x2 + ld1 {v1.b}[10], [x3], x2 + ld1 {v1.b}[11], [x3], x2 + ld1 {v1.b}[12], [x3], x2 + ld1 {v1.b}[13], [x3], x2 + ld1 {v1.b}[14], [x3], x2 + ld1 {v1.b}[15], [x3] + // reduce instruction + uaddlv h1, v1.16b + uqrshrn v0.8b, v1.8h, #4 + dup v0.16b, v0.b[0] +.rept 16 + st1 {v0.16b}, [x0], 16 +.endr +WELS_ASM_ARCH64_FUNC_END + + +.align 16 +intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40 +intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8 + +WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon + sub x3, x1, x2 + sub x3, x3, #1 + mov x4, x3 + ld1 {v0.8b}, [x4] // v0 low 8 bit in top(reverse order) + add x4, x4, #9 + rev64 v0.8b, v0.8b // reverse v0 + ld1 {v1.8b}, [x4] // v1 high 8 bit in top + uxtl v0.8h, v0.8b // extend to 16 bit integer + uxtl v1.8h, v1.8b // extend to 16 bit integer + ld1 {v2.b}[7], [x3], x2 + ld1 {v2.b}[6], [x3], x2 + ld1 {v2.b}[5], [x3], x2 + ld1 {v2.b}[4], [x3], x2 + ld1 {v2.b}[3], [x3], x2 + ld1 {v2.b}[2], [x3], x2 + ld1 {v2.b}[1], [x3], x2 + ld1 {v2.b}[0], [x3], x2 // v2.8b low 8 bit in left + add x3, x3, x2 + ld1 {v3.b}[0], [x3], x2 + ld1 {v3.b}[1], [x3], x2 + ld1 {v3.b}[2], [x3], x2 + ld1 {v3.b}[3], [x3], x2 + ld1 {v3.b}[4], [x3], x2 + ld1 {v3.b}[5], [x3], x2 + ld1 {v3.b}[6], [x3], x2 + ld1 {v3.b}[7], [x3] // v3.8b high 8bit in left + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + sub v0.8h, v1.8h, v0.8h + sub v2.8h, v3.8h, v2.8h + ldr q4, intra_1_to_8 + mul v0.8h, v0.8h, v4.8h + mul v2.8h, v2.8h, v4.8h + saddlv s0, v0.8h + saddlv s2, v2.8h + add v1.8h, v1.8h, v3.8h + sqrshrn v0.4h, v0.4S, #6 // b is in v0.h[0] + sqrshrn v2.4h, v2.4S, #6 // c is in v2.h[0] + shl v1.8h, v1.8h, #4 // a is in v1.h[7] + ldr q4, intra_m7_to_p8 + ldr q5, intra_m7_to_p8 + 16 + dup v1.8h, v1.h[7] + dup v3.8h, v1.h[7] + mla v1.8h, v4.8h, 
v0.h[0] + mla v3.8h, v5.8h, v0.h[0] + dup v2.8h, v2.h[0] // v2.8h is [cccccccc] + mla v1.8h, v2.8h, v4.h[0] + mla v3.8h, v2.8h, v4.h[0] + sqrshrun v4.8b, v1.8h, #5 + sqrshrun2 v4.16b, v3.8h, #5 + st1 {v4.16b}, [x0], 16 +.rept 15 + add v1.8h, v1.8h, v2.8h + add v3.8h, v3.8h, v2.8h + sqrshrun v4.8b, v1.8h, #5 + sqrshrun2 v4.16b, v3.8h, #5 + st1 {v4.16b}, [x0], 16 +.endr +WELS_ASM_ARCH64_FUNC_END +#endif \ No newline at end of file diff --git a/codec/encoder/core/arm64/pixel_neon_aarch64.S b/codec/encoder/core/arm64/pixel_aarch64_neon.S similarity index 100% rename from codec/encoder/core/arm64/pixel_neon_aarch64.S rename to codec/encoder/core/arm64/pixel_aarch64_neon.S diff --git a/codec/encoder/core/inc/get_intra_predictor.h b/codec/encoder/core/inc/get_intra_predictor.h index e295c7b4..5b1c1593 100644 --- a/codec/encoder/core/inc/get_intra_predictor.h +++ b/codec/encoder/core/inc/get_intra_predictor.h @@ -135,6 +135,32 @@ void WelsIChromaPredH_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStrid void WelsIChromaPredDc_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); void WelsIChromaPredPlane_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); #endif//HAVE_NEON + +#if defined(HAVE_NEON_AARCH64) +void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI16x16LumaPredDc_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI16x16LumaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI16x16LumaPredDcLeft_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); + +void WelsI4x4LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI4x4LumaPredDDL_AArch64_neon (uint8_t* pPred, uint8_t* pRef, 
const int32_t kiStride); +void WelsI4x4LumaPredDDLTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI4x4LumaPredVL_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI4x4LumaPredVLTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI4x4LumaPredVR_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI4x4LumaPredHU_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI4x4LumaPredHD_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI4x4LumaPredDc_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsI4x4LumaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); + +void WelsIChromaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsIChromaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsIChromaPredDc_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsIChromaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +void WelsIChromaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); +#endif//HAVE_NEON_AARCH64 #if defined(__cplusplus) } #endif//__cplusplus diff --git a/codec/encoder/core/src/get_intra_predictor.cpp b/codec/encoder/core/src/get_intra_predictor.cpp index 91142aea..03103769 100644 --- a/codec/encoder/core/src/get_intra_predictor.cpp +++ b/codec/encoder/core/src/get_intra_predictor.cpp @@ -732,6 +732,34 @@ void WelsInitIntraPredFuncs (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuF } #endif +#if defined(HAVE_NEON_AARCH64) + if (kuiCpuFlag & WELS_CPU_NEON) { + pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_AArch64_neon; + pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_AArch64_neon; + pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = 
WelsI16x16LumaPredH_AArch64_neon; + pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_AArch64_neon; + pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L] = WelsI16x16LumaPredDcLeft_AArch64_neon; + pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T] = WelsI16x16LumaPredDcTop_AArch64_neon; + + pFuncList->pfGetLumaI4x4Pred[I4_PRED_H ] = WelsI4x4LumaPredH_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL ] = WelsI4x4LumaPredDDL_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL ] = WelsI4x4LumaPredVL_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP ] = WelsI4x4LumaPredVLTop_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR ] = WelsI4x4LumaPredVR_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU ] = WelsI4x4LumaPredHU_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD ] = WelsI4x4LumaPredHD_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC ] = WelsI4x4LumaPredDc_AArch64_neon; + pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T ] = WelsI4x4LumaPredDcTop_AArch64_neon; + + pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_AArch64_neon; + pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_AArch64_neon; + pFuncList->pfGetChromaPred[C_PRED_P ] = WelsIChromaPredPlane_AArch64_neon; + pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_AArch64_neon; + pFuncList->pfGetChromaPred[C_PRED_DC_T] = WelsIChromaPredDcTop_AArch64_neon; + } +#endif//HAVE_NEON_AARCH64 + #ifdef X86_ASM if (kuiCpuFlag & WELS_CPU_MMXEXT) { pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_mmx; diff --git a/codec/encoder/targets.mk b/codec/encoder/targets.mk index 99c39205..82c5cd9c 100644 --- a/codec/encoder/targets.mk +++ b/codec/encoder/targets.mk @@ -59,7 +59,8 @@ endif ifeq ($(ASM_ARCH), arm64) ENCODER_ASM_ARM64_SRCS=\ - $(ENCODER_SRCDIR)/core/arm64/pixel_neon_aarch64.S\ + $(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\ + 
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\ ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ)) endif