diff --git a/codec/build/iOS/common/common.xcodeproj/project.pbxproj b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
index 3aff240a..22cfb0ed 100644
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -22,6 +22,7 @@
 F0B204F918FD23BF005DA23F /* copy_mb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F0B204F818FD23BF005DA23F /* copy_mb.cpp */; };
 F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8221906673900E156A8 /* arm_arch64_common_macro.S */; };
 F556A8251906673900E156A8 /* expand_picture_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */; };
+F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */; };
 F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; };
 FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; };
 /* End PBXBuildFile section */
@@ -69,6 +70,7 @@
 F0B204F818FD23BF005DA23F /* copy_mb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = copy_mb.cpp; sourceTree = "<group>"; };
 F556A8221906673900E156A8 /* arm_arch64_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = arm_arch64_common_macro.S; path = arm64/arm_arch64_common_macro.S; sourceTree = "<group>"; };
 F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = expand_picture_aarch64_neon.S; path = arm64/expand_picture_aarch64_neon.S; sourceTree = "<group>"; };
+F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = deblocking_aarch64_neon.S; path = arm64/deblocking_aarch64_neon.S; sourceTree = "<group>"; };
 F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = "<group>"; };
 FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = "<group>"; };
 FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = "<group>"; };
@@ -175,6 +177,7 @@
 F556A81D1906669F00E156A8 /* arm64 */ = {
 isa = PBXGroup;
 children = (
+F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */,
 F5B8D82C190757290037849A /* mc_aarch64_neon.S */,
 F556A8221906673900E156A8 /* arm_arch64_common_macro.S */,
 F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */,
@@ -236,6 +239,7 @@
 F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */,
 4C3406C918D96EA600DFA14A /* arm_arch_common_macro.S in Sources */,
 F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */,
+F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */,
 4C3406CE18D96EA600DFA14A /* crt_util_safe_x.cpp in Sources */,
 4C3406CF18D96EA600DFA14A /* deblocking_common.cpp in Sources */,
 4C3406D018D96EA600DFA14A /* logging.cpp in Sources */,
diff --git a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
index 8bfbe4b7..286b51f5 100644
--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -460,6 +460,7 @@
 "$(SRCROOT)/../../../../common/arm",
 );
 IPHONEOS_DEPLOYMENT_TARGET = 6.1;
+ONLY_ACTIVE_ARCH = NO;
 OTHER_LDFLAGS = "-ObjC";
 PRODUCT_NAME = "$(TARGET_NAME)";
 SKIP_INSTALL = YES;
@@ -494,6 +495,7 @@
 "$(SRCROOT)/../../../../common/arm",
 );
 IPHONEOS_DEPLOYMENT_TARGET = 6.1;
+ONLY_ACTIVE_ARCH = NO;
 OTHER_LDFLAGS = "-ObjC";
 PRODUCT_NAME = "$(TARGET_NAME)";
 SKIP_INSTALL = YES;
diff --git a/codec/common/arm64/deblocking_aarch64_neon.S b/codec/common/arm64/deblocking_aarch64_neon.S
new file mode 100644
index 00000000..cfa9405c
--- /dev/null
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -0,0 +1,1116 @@
+/*!
+* \copy
+* Copyright (c) 2013, Cisco Systems
+* All rights reserved.
+
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+
+*   * Redistributions of source code must retain the above copyright
+*     notice, this list of conditions and the following disclaimer.
+
+*   * Redistributions in binary form must reproduce the above copyright
+*     notice, this list of conditions and the following disclaimer in
+*     the documentation and/or other materials provided with the
+*     distribution.
+
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#ifdef HAVE_NEON_AARCH64 +.text + +#include "arm_arch64_common_macro.S" +#ifdef __APPLE__ + +.macro MASK_MATRIX + uabd $6.16b, $1.16b, $2.16b + cmhi $6.16b, $4.16b, $6.16b + + uabd $4.16b, $0.16b, $1.16b + cmhi $4.16b, $5.16b, $4.16b + and $6.16b, $6.16b, $4.16b + + uabd $4.16b, $3.16b, $2.16b + cmhi $4.16b, $5.16b, $4.16b + and $6.16b, $6.16b, $4.16b +.endm + +.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24) + //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20 + urhadd $8.16b, $2.16b, $3.16b + uhadd $8.16b, $0.16b, $8.16b + usubl $9.8h, $8.8b, $1.8b + sqxtn $9.8b, $9.8h + usubl2 $8.8h, $8.16b, $1.16b + sqxtn2 $9.16b, $8.8h + smax $8.16b, $9.16b, $5.16b +// + smin $8.16b, $8.16b, $6.16b + uabd $9.16b, $0.16b, $2.16b + cmhi $9.16b, $4.16b, $9.16b + and $8.16b, $8.16b, $9.16b + and $8.16b, $8.16b, $7.16b + add $8.16b, $1.16b, $8.16b + abs $9.16b, $9.16b +.endm + +.macro DIFF_LUMA_LT4_P0_Q0_1 + usubl $5.8h, $0.8b, $3.8b + usubl $6.8h, $2.8b, $1.8b + shl $6.8h, $6.8h, #2 + add $5.8h, $5.8h, $6.8h + sqrshrn $4.8b, $5.8h, #3 +.endm + +.macro DIFF_LUMA_LT4_P0_Q0_2 + usubl2 $5.8h, $0.16b, $3.16b + usubl2 $6.8h, $2.16b, $1.16b + shl $6.8h, $6.8h, #2 + add $5.8h, $5.8h, $6.8h + sqrshrn2 $4.16b, $5.8h, #3 +.endm + +.macro EXTRACT_DELTA_INTO_TWO_PART + cmge $1.16b, $0.16b, #0 + and $1.16b, $0.16b, $1.16b + sub $0.16b, $1.16b, $0.16b +.endm + +.macro DIFF_LUMA_EQ4_P2P1P0_1 + uaddl $8.8h, $1.8b, $2.8b + uaddl $9.8h, $3.8b, $4.8b + add $9.8h, $9.8h, $8.8h + + uaddl $8.8h, $0.8b, $1.8b + shl $8.8h, $8.8h, #1 + add $8.8h, $9.8h, $8.8h + + rshrn $0.8b, $9.8h, #2 + rshrn $7.8b, $8.8h, #3 + shl $9.8h, $9.8h, #1 + usubl $8.8h, $5.8b, $1.8b + add $9.8h, $8.8h, $9.8h + + uaddl $8.8h, $2.8b, $5.8b + uaddw $8.8h, $8.8h, $2.8b + uaddw $8.8h, $8.8h, $3.8b + + rshrn $9.8b, $9.8h, #3 + rshrn $8.8b, $8.8h, #2 + bsl $6.8b, $9.8b, $8.8b +.endm + +.macro DIFF_LUMA_EQ4_P2P1P0_2 + uaddl2 $8.8h, $1.16b, $2.16b + uaddl2 $9.8h, $3.16b, $4.16b + add $9.8h, $9.8h, $8.8h + + uaddl2 $8.8h, $0.16b, $1.16b + shl $8.8h, $8.8h, #1 + add $8.8h, $9.8h, $8.8h + + rshrn2 $0.16b, $9.8h, #2 + rshrn2 $7.16b, $8.8h, #3 + shl $9.8h, $9.8h, #1 + usubl2 $8.8h, $5.16b, $1.16b + add $9.8h, $8.8h, $9.8h + + uaddl2 $8.8h, $2.16b, $5.16b + uaddw2 $8.8h, $8.8h, $2.16b + uaddw2 $8.8h, $8.8h, $3.16b + + rshrn2 $9.16b, $9.8h, #3 + rshrn2 $8.16b, $8.8h, #2 + bsl $6.16b, $9.16b, $8.16b +.endm + + +.macro DIFF_CHROMA_EQ4_P0Q0_1 + uaddl $4.8h, $0.8b, $3.8b + shl $4.8h, $4.8h, #1 + usubl $5.8h, $1.8b, $3.8b + add $5.8h, $5.8h, $4.8h + rshrn $6.8b, $5.8h, #2 + usubl $5.8h, $2.8b, $0.8b + add $5.8h, $5.8h, $4.8h + rshrn $7.8b, $5.8h, #2 +.endm + +.macro DIFF_CHROMA_EQ4_P0Q0_2 + uaddl2 $4.8h, $0.16b, $3.16b + shl $4.8h, $4.8h, #1 + usubl2 $5.8h, $1.16b, $3.16b + add $5.8h, $5.8h, $4.8h + rshrn2 $6.16b, $5.8h, #2 + usubl2 $5.8h, $2.16b, $0.16b + add $5.8h, $5.8h, $4.8h + rshrn2 $7.16b, $5.8h, #2 +.endm + +.macro DIFF_LUMA_EQ4_MASK + mov.16b $3, $2 + bsl $3.16b, $0.16b, $1.16b +.endm + +.macro LOAD_LUMA_DATA_3 + ld3 {$0.b, $1.b, $2.b} [$6], [x2], x1 + ld3 {$3.b, $4.b, $5.b} [$6], [x0], x1 +.endm + +.macro LOAD_LUMA_DATA_4 + ld4 {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1 + ld4 {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1 +.endm + +.macro STORE_LUMA_DATA_4 + st4 {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1 + st4 {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1 +.endm + +.macro STORE_LUMA_DATA_3 + st3 {$0.b, $1.b, $2.b} [$6], [x3], x1 + st3 {$3.b, $4.b, $5.b} [$6], [x0], x1 +.endm + +.macro LOAD_CHROMA_DATA_4 + ld4 {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2 +.endm + 
+.macro STORE_CHROMA_DATA_2 + st2 {$0.b, $1.b} [$3], [$2], x2 +.endm + +.macro ZERO_JUMP_END + mov $1, $0.d[0] + mov $2, $0.d[1] + orr $1, $1, $2 + cbz $1, $3 +.endm + +.macro BS_NZC_CHECK + ld1 {v0.16b}, [$0] + //Arrange the input data --- TOP + ands x6, $1, #2 + cbz x6, bs_nzc_check_jump0 + sub x6, $0, $2, lsl #4 + sub x6, x6, $2, lsl #3 + add x6, x6, #12 + ld1 {v1.s} [3], [x6] + + bs_nzc_check_jump0: + ext.16b v1, v1, v0, #12 + add $3.16b, v0.16b, v1.16b + + // Arrange the input data --- LEFT + ands x6, $1, #1 + cbz x6, bs_nzc_check_jump1 + + sub x6, $0, #21 + add x7, x6, #4 + ld1 {v1.b} [12], [x6] + add x6, x7, #4 + ld1 {v1.b} [13], [x7] + add x7, x6, #4 + ld1 {v1.b} [14], [x6] + ld1 {v1.b} [15], [x7] + +bs_nzc_check_jump1: + ins v2.d[0], v0.d[1] + zip1 v0.16b, v0.16b, v2.16b + ins v2.d[0], v0.d[1] + zip1 v0.16b, v0.16b, v2.16b + ext.16b v1, v1, v0, #12 + add $4.16b, v0.16b, v1.16b +.endm + +.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5 + mov w6, #4 + sabd v20.8h, $0.8h, $1.8h + sabd v21.8h, $1.8h, $2.8h + dup $0.8h, w6 + sabd v22.8h, $2.8h, $3.8h + sabd v23.8h, $3.8h, $4.8h + + cmge v20.8h, v20.8h, $0.8h + cmge v21.8h, v21.8h, $0.8h + cmge v22.8h, v22.8h, $0.8h + cmge v23.8h, v23.8h, $0.8h + + addp v20.8h, v20.8h, v21.8h + addp v21.8h, v22.8h, v23.8h + + addhn $5.8b, v20.8h, v20.8h + addhn2 $5.16b, v21.8h, v21.8h +.endm + +.macro BS_MV_CHECK + ldp q0, q1, [$0], #32 + ldp q2, q3, [$0] + sub $0, $0, #32 + // Arrenge the input data --- TOP + ands x6, $1, #2 + cbz x6, bs_mv_check_jump0 + sub x6, $0, $2, lsl #6 + add x6, x6, #48 + ld1 {v4.16b}, [x6] +bs_mv_check_jump0: + BS_COMPARE_MV v4, v0, v1, v2, v3, $3 + // Arrange the input data --- LEFT + ands x6, $1, #1 + cbz x6, bs_mv_check_jump1 + sub x6, $0, #52 + add x7, x6, #16 + ld1 {v4.s} [0], [x6] + add x6, x7, #16 + ld1 {v4.s} [1], [x7] + add x7, x6, #16 + ld1 {v4.s} [2], [x6] + ld1 {v4.s} [3], [x7] +bs_mv_check_jump1: + zip1 $5.4s, v0.4s, v2.4s + zip2 $6.4s, v0.4s, v2.4s + zip1 v0.4s, v1.4s, v3.4s + zip2 v2.4s, v1.4s, v3.4s + zip2 v1.4s, $5.4s, v0.4s + zip1 v0.4s, $5.4s, v0.4s + zip2 v3.4s, $6.4s, v2.4s + zip1 v2.4s, $6.4s, v2.4s + BS_COMPARE_MV v4, v0, v1, v2, v3, $4 +.endm + +#else + +.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6 + uabd \arg6.16b, \arg1.16b, \arg2.16b + cmhi \arg6.16b, \arg4.16b, \arg6.16b + + uabd \arg4.16b, \arg0.16b, \arg1.16b + cmhi \arg4.16b, \arg5.16b, \arg4.16b + and \arg6.16b, \arg6.16b, \arg4.16b + + uabd \arg4.16b, \arg3.16b, \arg2.16b + cmhi \arg4.16b, \arg5.16b, \arg4.16b + and \arg6.16b, \arg6.16b, \arg4.16b +.endm + +.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 + //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20 + urhadd \arg8.16b, \arg2.16b, \arg3.16b + uhadd \arg8.16b, \arg0.16b, \arg8.16b + usubl \arg9.8h, \arg8.8b, \arg1.8b + sqxtn \arg9.8b, \arg9.8h + usubl2 \arg8.8h, \arg8.16b, \arg1.16b + sqxtn2 \arg9.16b, \arg8.8h + smax \arg8.16b, \arg9.16b, \arg5.16b + // + smin \arg8.16b, \arg8.16b, \arg6.16b + uabd \arg9.16b, \arg0.16b, \arg2.16b + cmhi \arg9.16b, \arg4.16b, \arg9.16b + and \arg8.16b, \arg8.16b, \arg9.16b + and \arg8.16b, \arg8.16b, \arg7.16b + add \arg8.16b, \arg1.16b, \arg8.16b + abs \arg9.16b, \arg9.16b +.endm + +.macro DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6 + usubl \arg5.8h, \arg0.8b, \arg3.8b + usubl \arg6.8h, \arg2.8b, \arg1.8b + shl \arg6.8h, \arg6.8h, #2 + add \arg5.8h, \arg5.8h, \arg6.8h + sqrshrn \arg4.8b, \arg5.8h, #3 +.endm + +.macro DIFF_LUMA_LT4_P0_Q0_2 
arg0, arg1, arg2, arg3, arg4, arg5, arg6 + usubl2 \arg5.8h, \arg0.16b, \arg3.16b + usubl2 \arg6.8h, \arg2.16b, \arg1.16b + shl \arg6.8h, \arg6.8h, #2 + add \arg5.8h, \arg5.8h, \arg6.8h + sqrshrn2 \arg4.16b, \arg5.8h, #3 +.endm + +.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1 + cmge \arg1.16b, \arg0.16b, #0 + and \arg1.16b, \arg0.16b, \arg1.16b + sub \arg0.16b, \arg1.16b, \arg0.16b +.endm + +.macro DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 + uaddl \arg8.8h, \arg1.8b, \arg2.8b + uaddl \arg9.8h, \arg3.8b, \arg4.8b + add \arg9.8h, \arg9.8h, \arg8.8h + + uaddl \arg8.8h, \arg0.8b, \arg1.8b + shl \arg8.8h, \arg8.8h, #1 + add \arg8.8h, \arg9.8h, \arg8.8h + + rshrn \arg0.8b, \arg9.8h, #2 + rshrn \arg7.8b, \arg8.8h, #3 + shl \arg9.8h, \arg9.8h, #1 + usubl \arg8.8h, \arg5.8b, \arg1.8b + add \arg9.8h, \arg8.8h, \arg9.8h + + uaddl \arg8.8h, \arg2.8b, \arg5.8b + uaddw \arg8.8h, \arg8.8h, \arg2.8b + uaddw \arg8.8h, \arg8.8h, \arg3.8b + + rshrn \arg9.8b, \arg9.8h, #3 + rshrn \arg8.8b, \arg8.8h, #2 + bsl \arg6.8b, \arg9.8b, \arg8.8b +.endm + +.macro DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 + uaddl2 \arg8.8h, \arg1.16b, \arg2.16b + uaddl2 \arg9.8h, \arg3.16b, \arg4.16b + add \arg9.8h, \arg9.8h, \arg8.8h + + uaddl2 \arg8.8h, \arg0.16b, \arg1.16b + shl \arg8.8h, \arg8.8h, #1 + add \arg8.8h, \arg9.8h, \arg8.8h + + rshrn2 \arg0.16b, \arg9.8h, #2 + rshrn2 \arg7.16b, \arg8.8h, #3 + shl \arg9.8h, \arg9.8h, #1 + usubl2 \arg8.8h, \arg5.16b, \arg1.16b + add \arg9.8h, \arg8.8h, \arg9.8h + + uaddl2 \arg8.8h, \arg2.16b, \arg5.16b + uaddw2 \arg8.8h, \arg8.8h, \arg2.16b + uaddw2 \arg8.8h, \arg8.8h, \arg3.16b + + rshrn2 \arg9.16b, \arg9.8h, #3 + rshrn2 \arg8.16b, \arg8.8h, #2 + bsl \arg6.16b, \arg9.16b, \arg8.16b +.endm + + +.macro DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 + uaddl \arg4.8h, \arg0.8b, \arg3.8b + shl \arg4.8h, \arg4.8h, #1 + usubl \arg5.8h, \arg1.8b, \arg3.8b + add \arg5.8h, \arg5.8h, \arg4.8h + rshrn \arg6.8b, \arg5.8h, #2 + usubl \arg5.8h, \arg2.8b, \arg0.8b + add \arg5.8h, \arg5.8h, \arg4.8h + rshrn \arg7.8b, \arg5.8h, #2 +.endm + +.macro DIFF_CHROMA_EQ4_P0Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 + uaddl2 \arg4.8h, \arg0.16b, \arg3.16b + shl \arg4.8h, \arg4.8h, #1 + usubl2 \arg5.8h, \arg1.16b, \arg3.16b + add \arg5.8h, \arg5.8h, \arg4.8h + rshrn2 \arg6.16b, \arg5.8h, #2 + usubl2 \arg5.8h, \arg2.16b, \arg0.16b + add \arg5.8h, \arg5.8h, \arg4.8h + rshrn2 \arg7.16b, \arg5.8h, #2 +.endm + +.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3 + mov.16b \arg3, \arg2 + bsl \arg3.16b, \arg0.16b, \arg1.16b +.endm + +.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6 + ld3 {\arg0.b, \arg1.b, \arg2.b} [\arg6], [x2], x1 + ld3 {\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1 +.endm + +.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 + ld4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg8], [x3], x1 + ld4 {\arg4.b, \arg5.b, \arg6.b, \arg7.b} [\arg8], [x0], x1 +.endm + +.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5 + st4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg4], [x0], x1 + st4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [x2], x1 +.endm + +.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6 + st3 {\arg0.b, \arg1.b, \arg2.b} [\arg6], [x3], x1 + st3 {\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1 +.endm + +.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5 + ld4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [\arg4], x2 +.endm + +.macro 
STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3 + st2 {\arg0.b, \arg1.b} [\arg3], [\arg2], x2 +.endm + +.macro ZERO_JUMP_END arg0, arg1, arg2, arg3 + mov \arg1, \arg0.d[0] + mov \arg2, \arg0.d[1] + orr \arg1, \arg1, \arg2 + cbz \arg1, \arg3 +.endm + +.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4 + ld1 {v0.16b}, [\arg0] + //Arrange the input data --- TOP + ands x6, \arg1, #2 + cbz x6, bs_nzc_check_jump0 + sub x6, \arg0, \arg2, lsl #4 + sub x6, x6, \arg2, lsl #3 + add x6, x6, #12 + ld1 {v1.s} [3], [x6] + +bs_nzc_check_jump0: + ext.16b v1, v1, v0, #12 + add \arg3.16b, v0.16b, v1.16b + + // Arrange the input data --- LEFT + ands x6, \arg1, #1 + cbz x6, bs_nzc_check_jump1 + + sub x6, \arg0, #21 + add x7, x6, #4 + ld1 {v1.b} [12], [x6] + add x6, x7, #4 + ld1 {v1.b} [13], [x7] + add x7, x6, #4 + ld1 {v1.b} [14], [x6] + ld1 {v1.b} [15], [x7] + +bs_nzc_check_jump1: + ins v2.d[0], v0.d[1] + zip1 v0.16b, v0.16b, v2.16b + ins v2.d[0], v0.d[1] + zip1 v0.16b, v0.16b, v2.16b + ext.16b v1, v1, v0, #12 + add \arg4.16b, v0.16b, v1.16b +.endm + +.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5 + //in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5 + mov w6, #4 + sabd v20.8h, \arg0.8h, \arg1.8h + sabd v21.8h, \arg1.8h, \arg2.8h + dup \arg0.8h, w6 + sabd v22.8h, \arg2.8h, \arg3.8h + sabd v23.8h, \arg3.8h, \arg4.8h + + cmge v20.8h, v20.8h, \arg0.8h + cmge v21.8h, v21.8h, \arg0.8h + cmge v22.8h, v22.8h, \arg0.8h + cmge v23.8h, v23.8h, \arg0.8h + + addp v20.8h, v20.8h, v21.8h + addp v21.8h, v22.8h, v23.8h + + addhn \arg5.8b, v20.8h, v20.8h + addhn2 \arg5.16b, v21.8h, v21.8h +.endm + +.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6 + ldp q0, q1, [\arg0], #32 + ldp q2, q3, [\arg0] + sub \arg0, \arg0, #32 + // Arrenge the input data --- TOP + ands x6, \arg1, #2 + cbz x6, bs_mv_check_jump0 + sub x6, \arg0, \arg2, lsl #6 + add x6, x6, #48 + ld1 {v4.16b}, [x6] +bs_mv_check_jump0: + BS_COMPARE_MV v4, v0, v1, v2, v3, \arg3 + // Arrange the input data --- LEFT + ands x6, \arg1, #1 + cbz x6, bs_mv_check_jump1 + sub x6, \arg0, #52 + add x7, x6, #16 + ld1 {v4.s} [0], [x6] + add x6, x7, #16 + ld1 {v4.s} [1], [x7] + add x7, x6, #16 + ld1 {v4.s} [2], [x6] + ld1 {v4.s} [3], [x7] +bs_mv_check_jump1: + zip1 \arg5.4s, v0.4s, v2.4s + zip2 \arg6.4s, v0.4s, v2.4s + zip1 v0.4s, v1.4s, v3.4s + zip2 v2.4s, v1.4s, v3.4s + zip2 v1.4s, \arg5.4s, v0.4s + zip1 v0.4s, \arg5.4s, v0.4s + zip2 v3.4s, \arg6.4s, v2.4s + zip1 v2.4s, \arg6.4s, v2.4s + BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4 +.endm +#endif + +WELS_ASM_ARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon + ld1 {v0.8b, v1.8b, v2.8b}, [x0] + ins v0.d[1], v1.d[0] + uzp1 v0.2d, v0.2d, v1.2d + cmeq v0.16b, v0.16b, #0 + cmeq v2.8b, v2.8b, #0 + mvn v0.16b, v0.16b + mvn v2.8b, v2.8b + abs v0.16b, v0.16b + abs v2.8b, v2.8b + ins v1.d[0], v0.d[1] + st1 {v0.8b, v1.8b, v2.8b}, [x0] +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc + dup v16.16b, w2 //alpha + dup v17.16b, w3 //beta + add x2, x1, x1, lsl #1 + sub x2, x0, x2 + movi v23.16b, #128 + ld1 {v0.16b}, [x2], x1 + ld1 {v1.16b}, [x2], x1 + ld1 {v2.16b}, [x2] + ld1 {v3.16b}, [x0], x1 + ld1 {v4.16b}, [x0], x1 + ld1 {v5.16b}, [x0] + sub x2, x2, x1 + ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4] + trn1 v18.2s, v18.2s, v19.2s + trn1 v20.2s, v20.2s, v21.2s + trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333 + cmge v7.16b, v6.16b, #0 // iTc0 Flag + + MASK_MATRIX v1, v2, v3, v4, v16, v17, v18 + and 
v7.16b, v7.16b, v18.16b // need filter flag + + ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end + + eor v18.16b, v18.16b, v18.16b + sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333 + + DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 + st1 {v19.16b}, [x2], x1 + + DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 + + abs v20.16b, v20.16b + abs v22.16b, v22.16b + add v6.16b, v6.16b, v20.16b + add v6.16b, v6.16b, v22.16b + eor v18.16b, v18.16b, v18.16b + sub v18.16b, v18.16b, v6.16b + + DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22 + DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22 + + smax v19.16b, v19.16b, v18.16b + smin v19.16b, v19.16b, v6.16b + and v19.16b, v19.16b, v7.16b + + EXTRACT_DELTA_INTO_TWO_PART v19, v20 + uqadd v2.16b, v2.16b, v20.16b + uqsub v2.16b, v2.16b, v19.16b + st1 {v2.16b}, [x2], x1 + uqsub v3.16b, v3.16b, v20.16b + uqadd v3.16b, v3.16b, v19.16b + st1 {v3.16b}, [x2], x1 + st1 {v21.16b}, [x2] +DeblockLumaLt4V_AArch64_neon_end: +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon + dup v16.16b, w2 //alpha + dup v17.16b, w3 //beta + sub x3, x0, x1, lsl #2 + + ld1 {v0.16b}, [x3], x1 + ld1 {v4.16b}, [x0], x1 + ld1 {v1.16b}, [x3], x1 + ld1 {v5.16b}, [x0], x1 + ld1 {v2.16b}, [x3], x1 + ld1 {v6.16b}, [x0], x1 + ld1 {v3.16b}, [x3] + ld1 {v7.16b}, [x0] + + sub x3, x3, x1, lsl #1 + MASK_MATRIX v2, v3, v4, v5, v16, v17, v18 + lsr w2, w2, #2 + add w2, w2, #2 + dup v16.16b, w2 //((alpha >> 2) + 2) + uabd v19.16b, v3.16b, v4.16b + cmhi v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2) + + uabd v21.16b, v1.16b, v3.16b + cmhi v21.16b, v17.16b, v21.16b //bDetaP2P0 + and v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0 + + uabd v22.16b, v6.16b, v4.16b + cmhi v22.16b, v17.16b, v22.16b //bDetaQ2Q0 + and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0 + and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2)) + + mov.16b v23, v21 + mov.16b v24, v21 + + mov.16b v25, v0 + DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16 + DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16 + ins v0.d[1], v25.d[1] + ins v23.d[1], v24.d[1] + and v21.16b, v20.16b, v21.16b + DIFF_LUMA_EQ4_MASK v19, v1, v21, v17 + st1 {v17.16b}, [x3], x1 + DIFF_LUMA_EQ4_MASK v0, v2, v21, v17 + st1 {v17.16b}, [x3], x1 + DIFF_LUMA_EQ4_MASK v23, v3, v18, v17 + st1 {v17.16b}, [x3], x1 + + + mov.16b v23, v22 + mov.16b v24, v22 + mov.16b v25, v7 + DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16 + DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16 + ins v7.d[1], v25.d[1] + ins v23.d[1], v24.d[1] + and v22.16b, v20.16b, v22.16b + DIFF_LUMA_EQ4_MASK v23, v4, v18, v17 + st1 {v17.16b}, [x3], x1 + DIFF_LUMA_EQ4_MASK v7, v5, v22, v17 + st1 {v17.16b}, [x3], x1 + DIFF_LUMA_EQ4_MASK v19, v6, v22, v17 + st1 {v17.16b}, [x3], x1 +DeblockLumaEq4V_AArch64_neon_end: +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc + dup v16.16b, w2 //alpha + dup v17.16b, w3 //beta + sub x2, x0, #3 + movi v23.16b, #128 + + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 0 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 1 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 2 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 3 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 4 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 5 + 
LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 6 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 7 + + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 8 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 9 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 10 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 11 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 12 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 13 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 14 + LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 15 + + sub x0, x0, x1, lsl #4 + + ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4] + trn1 v18.2s, v18.2s, v19.2s + trn1 v20.2s, v20.2s, v21.2s + trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333 + cmge v7.16b, v6.16b, #0 // iTc0 Flag + + MASK_MATRIX v1, v2, v3, v4, v16, v17, v18 + and v7.16b, v7.16b, v18.16b // need filter flag + + ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end + + eor v18.16b, v18.16b, v18.16b + sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333 + + DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24 + mov.16b v25, v19 + + DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24 + + abs v20.16b, v20.16b + abs v22.16b, v22.16b + add v6.16b, v6.16b, v20.16b + add v6.16b, v6.16b, v22.16b + eor v18.16b, v18.16b, v18.16b + sub v18.16b, v18.16b, v6.16b + + DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22 + DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22 + + smax v19.16b, v19.16b, v18.16b + smin v19.16b, v19.16b, v6.16b + and v19.16b, v19.16b, v7.16b + + EXTRACT_DELTA_INTO_TWO_PART v19, v20 + uqadd v2.16b, v2.16b, v20.16b + uqsub v2.16b, v2.16b, v19.16b + mov.16b v26, v2 + uqsub v3.16b, v3.16b, v20.16b + uqadd v3.16b, v3.16b, v19.16b + mov.16b v27, v3 + mov.16b v28, v21 + + sub x0, x0, #2 + add x2, x0, x1 + lsl x1, x1, #1 + + STORE_LUMA_DATA_4 v25, v26, v27, v28, 0, 1 + STORE_LUMA_DATA_4 v25, v26, v27, v28, 2, 3 + STORE_LUMA_DATA_4 v25, v26, v27, v28, 4, 5 + STORE_LUMA_DATA_4 v25, v26, v27, v28, 6, 7 + + STORE_LUMA_DATA_4 v25, v26, v27, v28, 8, 9 + STORE_LUMA_DATA_4 v25, v26, v27, v28, 10, 11 + STORE_LUMA_DATA_4 v25, v26, v27, v28, 12, 13 + STORE_LUMA_DATA_4 v25, v26, v27, v28, 14, 15 +DeblockLumaLt4H_AArch64_neon_end: +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon + dup v16.16b, w2 //alpha + dup v17.16b, w3 //beta + sub x3, x0, #4 + + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 0 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 1 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 2 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 3 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 4 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 5 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 6 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 7 + + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 8 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 9 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 10 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 11 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 12 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 13 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 14 + LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 15 + + sub x0, x0, x1, lsl #4 + sub x3, x0, #3 + MASK_MATRIX v2, v3, v4, v5, v16, v17, v18 + + ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end + + lsr w2, w2, #2 + add w2, w2, #2 + dup v16.16b, w2 //((alpha >> 2) + 2) + uabd v19.16b, v3.16b, v4.16b + cmhi v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2) + + uabd v21.16b, 
v1.16b, v3.16b + cmhi v21.16b, v17.16b, v21.16b //bDetaP2P0 + and v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0 + + uabd v22.16b, v6.16b, v4.16b + cmhi v22.16b, v17.16b, v22.16b //bDetaQ2Q0 + and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0 + and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2)) + + mov.16b v23, v21 + mov.16b v24, v21 + + mov.16b v25, v0 + DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16 + DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16 + ins v0.d[1], v25.d[1] + ins v23.d[1], v24.d[1] + and v21.16b, v20.16b, v21.16b + DIFF_LUMA_EQ4_MASK v19, v1, v21, v17 + mov.16b v26, v17 + DIFF_LUMA_EQ4_MASK v0, v2, v21, v17 + mov.16b v27, v17 + DIFF_LUMA_EQ4_MASK v23, v3, v18, v17 + mov.16b v28, v17 + + + mov.16b v23, v22 + mov.16b v24, v22 + mov.16b v25, v7 + DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16 + DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16 + ins v7.d[1], v25.d[1] + ins v23.d[1], v24.d[1] + and v22.16b, v20.16b, v22.16b + DIFF_LUMA_EQ4_MASK v23, v4, v18, v17 + mov.16b v29, v17 + DIFF_LUMA_EQ4_MASK v7, v5, v22, v17 + mov.16b v30, v17 + DIFF_LUMA_EQ4_MASK v19, v6, v22, v17 + mov.16b v31, v17 + + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 0 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 1 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 2 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 3 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 4 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 5 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 6 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 7 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 8 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 9 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 10 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 11 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 12 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 13 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 14 + STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 15 +DeblockLumaEq4H_AArch64_neon_end: +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc + dup v16.16b, w3 //alpha + dup v17.16b, w4 //beta + lsl x3, x2, #1 + sub x6, x0, x3 //pPixCb-2*Stride + sub x7, x1, x3 //pPixCr-2*Stride + + ld1 {v0.d} [0], [x6], x2 + ld1 {v1.d} [0], [x6] + ld1 {v2.d} [0], [x0], x2 + ld1 {v3.d} [0], [x0] + ld1 {v0.d} [1], [x7], x2 + ld1 {v1.d} [1], [x7] + ld1 {v2.d} [1], [x1], x2 + ld1 {v3.d} [1], [x1] + + ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5] + trn1 v18.4h, v18.4h, v19.4h //0011,0011, + trn1 v20.4h, v20.4h, v21.4h //2233,2233 + zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233 + cmgt v7.16b, v6.16b, #0 // iTc0 Flag + + MASK_MATRIX v0, v1, v2, v3, v16, v17, v18 + and v7.16b, v7.16b, v18.16b // need filter flag + + ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end + + eor v18.16b, v18.16b, v18.16b + sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233 + + DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22 + DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22 + + smax v19.16b, v19.16b, v18.16b + smin v19.16b, v19.16b, v6.16b + and v19.16b, v19.16b, v7.16b + + EXTRACT_DELTA_INTO_TWO_PART v19, v20 + uqadd v1.16b, v1.16b, v20.16b + uqsub v1.16b, v1.16b, v19.16b + st1 {v1.d} [0], [x6], x2 + st1 {v1.d} 
[1], [x7], x2 + uqsub v2.16b, v2.16b, v20.16b + uqadd v2.16b, v2.16b, v19.16b + st1 {v2.d} [0], [x6] + st1 {v2.d} [1], [x7] +DeblockChromaLt4V_AArch64_neon_end: +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc + dup v16.16b, w3 //alpha + dup v17.16b, w4 //beta + sub x6, x0, #2 //pPixCb-2 + sub x7, x1, #2 //pPixCr-2 + + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7 + + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15 + + sub x0, x0, #1 + sub x1, x1, #1 + + ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5] + trn1 v18.4h, v18.4h, v19.4h //0011,0011, + trn1 v20.4h, v20.4h, v21.4h //2233,2233 + zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233 + cmgt v7.16b, v6.16b, #0 // iTc0 Flag + + MASK_MATRIX v0, v1, v2, v3, v16, v17, v18 + and v7.16b, v7.16b, v18.16b // need filter flag + + ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end + eor v18.16b, v18.16b, v18.16b + sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233 + + DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22 + DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22 + + smax v19.16b, v19.16b, v18.16b + smin v19.16b, v19.16b, v6.16b + and v19.16b, v19.16b, v7.16b + + EXTRACT_DELTA_INTO_TWO_PART v19, v20 + uqadd v1.16b, v1.16b, v20.16b + uqsub v1.16b, v1.16b, v19.16b + uqsub v2.16b, v2.16b, v20.16b + uqadd v2.16b, v2.16b, v19.16b + + STORE_CHROMA_DATA_2 v1, v2, x0, 0 + STORE_CHROMA_DATA_2 v1, v2, x0, 1 + STORE_CHROMA_DATA_2 v1, v2, x0, 2 + STORE_CHROMA_DATA_2 v1, v2, x0, 3 + STORE_CHROMA_DATA_2 v1, v2, x0, 4 + STORE_CHROMA_DATA_2 v1, v2, x0, 5 + STORE_CHROMA_DATA_2 v1, v2, x0, 6 + STORE_CHROMA_DATA_2 v1, v2, x0, 7 + + STORE_CHROMA_DATA_2 v1, v2, x1, 8 + STORE_CHROMA_DATA_2 v1, v2, x1, 9 + STORE_CHROMA_DATA_2 v1, v2, x1, 10 + STORE_CHROMA_DATA_2 v1, v2, x1, 11 + STORE_CHROMA_DATA_2 v1, v2, x1, 12 + STORE_CHROMA_DATA_2 v1, v2, x1, 13 + STORE_CHROMA_DATA_2 v1, v2, x1, 14 + STORE_CHROMA_DATA_2 v1, v2, x1, 15 +DeblockChromaLt4H_AArch64_neon_end: +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta + dup v16.16b, w3 //alpha + dup v17.16b, w4 //beta + lsl x3, x2, #1 + sub x6, x0, x3 //pPixCb-2*Stride + sub x7, x1, x3 //pPixCr-2*Stride + + ld1 {v0.d} [0], [x6], x2 + ld1 {v1.d} [0], [x6] + ld1 {v2.d} [0], [x0], x2 + ld1 {v3.d} [0], [x0] + ld1 {v0.d} [1], [x7], x2 + ld1 {v1.d} [1], [x7] + ld1 {v2.d} [1], [x1], x2 + ld1 {v3.d} [1], [x1] + + MASK_MATRIX v0, v1, v2, v3, v16, v17, v7 + + ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end + + DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21 + DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21 + + mov.16b v6, v7 + bsl v6.16b, v20.16b, v1.16b + bsl v7.16b, v21.16b, v2.16b + + st1 {v6.d} [0], [x6], x2 + st1 {v6.d} [1], [x7], x2 + + st1 
{v7.d} [0], [x6] + st1 {v7.d} [1], [x7] +DeblockChromaEq4V_AArch64_neon_end: +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta + dup v16.16b, w3 //alpha + dup v17.16b, w4 //beta + + sub x6, x0, #2 //pPixCb-2 + sub x7, x1, #2 //pPixCr-2 + + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7 + + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14 + LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15 + sub x0, x0, #1 + sub x1, x1, #1 + + MASK_MATRIX v0, v1, v2, v3, v16, v17, v7 + + ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end + + DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21 + DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21 + + mov.16b v6, v7 + bsl v6.16b, v20.16b, v1.16b + bsl v7.16b, v21.16b, v2.16b + + STORE_CHROMA_DATA_2 v6, v7, x0, 0 + STORE_CHROMA_DATA_2 v6, v7, x0, 1 + STORE_CHROMA_DATA_2 v6, v7, x0, 2 + STORE_CHROMA_DATA_2 v6, v7, x0, 3 + STORE_CHROMA_DATA_2 v6, v7, x0, 4 + STORE_CHROMA_DATA_2 v6, v7, x0, 5 + STORE_CHROMA_DATA_2 v6, v7, x0, 6 + STORE_CHROMA_DATA_2 v6, v7, x0, 7 + + STORE_CHROMA_DATA_2 v6, v7, x1, 8 + STORE_CHROMA_DATA_2 v6, v7, x1, 9 + STORE_CHROMA_DATA_2 v6, v7, x1, 10 + STORE_CHROMA_DATA_2 v6, v7, x1, 11 + STORE_CHROMA_DATA_2 v6, v7, x1, 12 + STORE_CHROMA_DATA_2 v6, v7, x1, 13 + STORE_CHROMA_DATA_2 v6, v7, x1, 14 + STORE_CHROMA_DATA_2 v6, v7, x1, 15 + DeblockChromaEq4H_AArch64_neon_end: +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon + // Checking the nzc status + BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status + // For checking bS[I] = 2 + movi v0.16b, #0 + cmgt v16.16b, v16.16b, v0.16b + cmgt v17.16b, v17.16b, v0.16b + movi v0.16b, #2 + + and v16.16b, v16.16b, v0.16b //v16 save the nzc check result all the time --- for dir is top + and v17.16b, v17.16b, v0.16b //v17 save the nzc check result all the time --- for dir is left + + // Checking the mv status + BS_MV_CHECK x1, x2, x3, v18, v19, v5 , v6 //v18, v19 save the mv status + // For checking bS[I] = 1 + movi v0.16b, #1 + and v18.16b, v18.16b, v0.16b //v18 save the nzc check result all the time --- for dir is top + and v19.16b, v19.16b, v0.16b //v19 save the nzc check result all the time --- for dir is left + // Check bS[I] is '1' or '2' + umax v1.16b, v18.16b, v16.16b + umax v0.16b, v19.16b, v17.16b + st1 {v0.16b, v1.16b}, [x4] +WELS_ASM_ARCH64_FUNC_END + + +#endif diff --git a/codec/common/inc/deblocking_common.h b/codec/common/inc/deblocking_common.h index d706b745..0bf64710 100644 --- a/codec/common/inc/deblocking_common.h +++ b/codec/common/inc/deblocking_common.h @@ -50,6 +50,18 @@ void DeblockChromaLt4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, void DeblockChromaEq4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta); #endif +#if defined(HAVE_NEON_AARCH64) +void DeblockLumaLt4V_AArch64_neon (uint8_t* pPixY, int32_t 
iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_AArch64_neon (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaLt4H_AArch64_neon (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_AArch64_neon (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+    int8_t* pTC);
+void DeblockChromaEq4V_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+    int8_t* pTC);
+void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+#endif
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
diff --git a/codec/common/targets.mk b/codec/common/targets.mk
index ed57a40e..9be6e6ef 100644
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -39,6 +39,7 @@
 endif
 ifeq ($(ASM_ARCH), arm64)
 COMMON_ASM_ARM64_SRCS=\
+	$(COMMON_SRCDIR)/arm64/deblocking_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/expand_picture_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/mc_aarch64_neon.S\
diff --git a/codec/decoder/core/src/deblocking.cpp b/codec/decoder/core/src/deblocking.cpp
index 3b153ce2..f1009aa4 100644
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -732,6 +732,20 @@ void DeblockingInit (SDeblockingFunc* pFunc, int32_t iCpu) {
     pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_neon;
   }
 #endif
+
+#if defined(HAVE_NEON_AARCH64)
+  if (iCpu & WELS_CPU_NEON) {
+    pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_AArch64_neon;
+    pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_AArch64_neon;
+    pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_AArch64_neon;
+    pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_AArch64_neon;
+
+    pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_AArch64_neon;
+    pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_AArch64_neon;
+    pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_AArch64_neon;
+    pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_AArch64_neon;
+  }
+#endif
 }
 } // namespace WelsDec
diff --git a/codec/encoder/core/inc/deblocking.h b/codec/encoder/core/inc/deblocking.h
index eacf19c0..9285398d 100644
--- a/codec/encoder/core/inc/deblocking.h
+++ b/codec/encoder/core/inc/deblocking.h
@@ -69,6 +69,10 @@ void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
 void DeblockingBSCalcEnc_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride, uint8_t (*pBS)[4][4]);
 #endif
+#if defined(HAVE_NEON_AARCH64)
+void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
+void DeblockingBSCalcEnc_AArch64_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride, uint8_t (*pBS)[4][4]);
+#endif
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
diff --git a/codec/encoder/core/src/deblocking.cpp b/codec/encoder/core/src/deblocking.cpp
index da993dd0..294e0bf9 100644
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -573,6 +573,27 @@ void DeblockingBSCalc_neon (SWelsFuncPtrList* pFunc, SMB* pCurMb, uint8_t uiBS[2
 }
 #endif
+
+#if defined(HAVE_NEON_AARCH64) && defined(SINGLE_REF_FRAME)
+void DeblockingBSCalc_AArch64_neon (SWelsFuncPtrList* pFunc, SMB* pCurMb, uint8_t uiBS[2][4][4], Mb_Type uiCurMbType,
+    int32_t iMbStride, int32_t iLeftFlag, int32_t iTopFlag) {
+  DeblockingBSCalcEnc_AArch64_neon (pCurMb->pNonZeroCount, pCurMb->sMv, pCurMb->uiNeighborAvail, iMbStride, uiBS);
+  if (iLeftFlag) {
+    if (IS_INTRA ((pCurMb - 1)->uiMbType)) {
+      * (uint32_t*)uiBS[0][0] = 0x04040404;
+    }
+  } else {
+    * (uint32_t*)uiBS[0][0] = 0;
+  }
+  if (iTopFlag) {
+    if (IS_INTRA ((pCurMb - iMbStride)->uiMbType)) {
+      * (uint32_t*)uiBS[1][0] = 0x04040404;
+    }
+  } else {
+    * (uint32_t*)uiBS[1][0] = 0;
+  }
+}
+#endif
+
 void DeblockingBSCalc_c (SWelsFuncPtrList* pFunc, SMB* pCurMb, uint8_t uiBS[2][4][4], Mb_Type uiCurMbType,
     int32_t iMbStride, int32_t iLeftFlag, int32_t iTopFlag) {
   if (iLeftFlag) {
@@ -765,6 +786,11 @@ void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu)
     *pfSetNZCZero = WelsNonZeroCount_neon;
   }
 #endif
+#ifdef HAVE_NEON_AARCH64
+  if (iCpu & WELS_CPU_NEON) {
+    *pfSetNZCZero = WelsNonZeroCount_AArch64_neon;
+  }
+#endif
 }
 void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
@@ -811,6 +837,24 @@
 #endif
   }
 #endif
+
+#if defined(HAVE_NEON_AARCH64)
+  if (iCpu & WELS_CPU_NEON) {
+    pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_AArch64_neon;
+    pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_AArch64_neon;
+    pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_AArch64_neon;
+    pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_AArch64_neon;
+
+    pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_AArch64_neon;
+    pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_AArch64_neon;
+    pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_AArch64_neon;
+    pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_AArch64_neon;
+
+#if defined(SINGLE_REF_FRAME)
+    pFunc->pfDeblockingBSCalc = DeblockingBSCalc_AArch64_neon;
+#endif
+  }
+#endif
 }
diff --git a/test/decoder/DecUT_Deblock.cpp b/test/decoder/DecUT_Deblock.cpp
index 3f2abf78..3ca62f56 100644
--- a/test/decoder/DecUT_Deblock.cpp
+++ b/test/decoder/DecUT_Deblock.cpp
@@ -127,3 +127,20 @@ GENERATE_CHROMA_UT (ChromaLt4H_neon, DeblockChromaLt4H_neon, DeblockChromaLt4H_c
 GENERATE_CHROMA_UT (ChromaEq4V_neon, DeblockChromaEq4V_neon_wrap, DeblockChromaEq4V_c_wrap, WELS_CPU_NEON, 0)
 GENERATE_CHROMA_UT (ChromaEq4H_neon, DeblockChromaEq4H_neon_wrap, DeblockChromaEq4H_c_wrap, WELS_CPU_NEON, 1)
 #endif
+
+#if defined(HAVE_NEON_AARCH64)
+WRAP_LUMA_FUNC (DeblockLumaEq4V_AArch64_neon)
+WRAP_LUMA_FUNC (DeblockLumaEq4H_AArch64_neon)
+WRAP_CHROMA_FUNC (DeblockChromaEq4V_AArch64_neon)
+WRAP_CHROMA_FUNC (DeblockChromaEq4H_AArch64_neon)
+
+GENERATE_LUMA_UT (LumaLt4V_AArch64_neon, DeblockLumaLt4V_AArch64_neon, DeblockLumaLt4V_c, WELS_CPU_NEON, 0)
+GENERATE_LUMA_UT (LumaLt4H_AArch64_neon, DeblockLumaLt4H_AArch64_neon, DeblockLumaLt4H_c, WELS_CPU_NEON, 1)
+GENERATE_LUMA_UT (LumaEq4V_AArch64_neon, DeblockLumaEq4V_AArch64_neon_wrap, DeblockLumaEq4V_c_wrap, WELS_CPU_NEON, 0)
+GENERATE_LUMA_UT (LumaEq4H_AArch64_neon, DeblockLumaEq4H_AArch64_neon_wrap, DeblockLumaEq4H_c_wrap, WELS_CPU_NEON, 1)
+
+GENERATE_CHROMA_UT (ChromaLt4V_AArch64_neon, DeblockChromaLt4V_AArch64_neon, DeblockChromaLt4V_c, WELS_CPU_NEON, 0)
+GENERATE_CHROMA_UT (ChromaLt4H_AArch64_neon, DeblockChromaLt4H_AArch64_neon, DeblockChromaLt4H_c, WELS_CPU_NEON, 1)
+GENERATE_CHROMA_UT (ChromaEq4V_AArch64_neon, DeblockChromaEq4V_AArch64_neon_wrap, DeblockChromaEq4V_c_wrap, WELS_CPU_NEON, 0)
+GENERATE_CHROMA_UT (ChromaEq4H_AArch64_neon, DeblockChromaEq4H_AArch64_neon_wrap, DeblockChromaEq4H_c_wrap, WELS_CPU_NEON, 1)
+#endif