From ad9e2dab4f1e829fecdf412c036dcf2e9c45635c Mon Sep 17 00:00:00 2001 From: dongzhang Date: Wed, 23 Apr 2014 10:42:57 +0800 Subject: [PATCH] Add Motion Compehension ARM64 Neon Code --- .../common/common.xcodeproj/project.pbxproj | 4 + codec/common/arm64/mc_aarch64_neon.S | 2274 +++++++++++++++++ codec/common/inc/mc_common.h | 202 +- codec/common/inc/mc_common.h.orig | 204 ++ codec/decoder/core/src/mc.cpp | 941 ++++--- codec/decoder/core/src/mc.cpp.orig | 1305 ++++++++++ codec/encoder/core/src/mc.cpp | 265 +- codec/encoder/core/src/mc.cpp.orig | 762 ++++++ 8 files changed, 5476 insertions(+), 481 deletions(-) create mode 100755 codec/common/arm64/mc_aarch64_neon.S create mode 100644 codec/common/inc/mc_common.h.orig create mode 100644 codec/decoder/core/src/mc.cpp.orig create mode 100644 codec/encoder/core/src/mc.cpp.orig diff --git a/codec/build/iOS/common/common.xcodeproj/project.pbxproj b/codec/build/iOS/common/common.xcodeproj/project.pbxproj index 82373c38..20b494b3 100644 --- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj +++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj @@ -27,6 +27,7 @@ F0B204F918FD23BF005DA23F /* copy_mb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F0B204F818FD23BF005DA23F /* copy_mb.cpp */; }; F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8221906673900E156A8 /* arm_arch64_common_macro.S */; }; F556A8251906673900E156A8 /* expand_picture_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */; }; + F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; }; FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; }; /* End PBXBuildFile section */ @@ -87,6 +88,7 @@ F0B204F818FD23BF005DA23F /* copy_mb.cpp */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = copy_mb.cpp; sourceTree = ""; }; F556A8221906673900E156A8 /* arm_arch64_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = arm_arch64_common_macro.S; path = arm64/arm_arch64_common_macro.S; sourceTree = ""; }; F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = expand_picture_aarch64_neon.S; path = arm64/expand_picture_aarch64_neon.S; sourceTree = ""; }; + F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = ""; }; FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = ""; }; FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = ""; }; /* End PBXFileReference section */ @@ -223,6 +225,7 @@ F556A81D1906669F00E156A8 /* arm64 */ = { isa = PBXGroup; children = ( + F5B8D82C190757290037849A /* mc_aarch64_neon.S */, F556A8221906673900E156A8 /* arm_arch64_common_macro.S */, F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */, ); @@ -310,6 +313,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */, 4C3406C918D96EA600DFA14A /* arm_arch_common_macro.S in Sources */, F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */, 4C3406CE18D96EA600DFA14A /* crt_util_safe_x.cpp in Sources */, diff --git a/codec/common/arm64/mc_aarch64_neon.S b/codec/common/arm64/mc_aarch64_neon.S new file mode 100755 index 00000000..77642e40 --- /dev/null +++ b/codec/common/arm64/mc_aarch64_neon.S @@ -0,0 +1,2274 
@@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+// 16-byte alignment is sufficient for this 8 x int16 table. The .align operand
+// is a power-of-two exponent on AArch64, so the former ".align 16" requested a
+// 64 KiB boundary.
+.align 4
+// 0, 1, -5, 20 followed by zero padding; matches the "0+1*[50]-5*[41]+20[32]"
+// weighting used by the *_SINGLE_TAG macros (presumably their "para" operand).
+filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
+
+#ifdef __APPLE__
+// Apple assembler flavour: macro arguments are positional ($0, $1, ...).
+
+// Low 8 lanes: dst = sat_u8(round((src[-2]+src[3] + a*(src[0]+src[1]) - b*(src[-1]+src[2])) >> 5)).
+// Callers in this file pass a = 20 ($7) and b = 5 ($8). Clobbers v18, v19.
+.macro FILTER_6TAG_8BITS1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
+    uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun $6.8b, v18.8h, #5
+// }
+.endm
+
+// Same filter on the high 8 lanes of the 16-byte inputs; writes the high half of dst.
+.macro FILTER_6TAG_8BITS2
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 $6.16b, v18.8h, #5
+// }
+.endm
+
+// FILTER_6TAG_8BITS1 followed by a rounding average of the result with src[0] ($2).
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
+    uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun $6.8b, v18.8h, #5
+    uaddl v19.8h, $2.8b, $6.8b //src[0] + filtered
+    rshrn $6.8b, v19.8h, #1 //(+1)>>1
+// }
+.endm
+
+// High-lane counterpart of FILTER_6TAG_8BITS1_AVERAGE_WITH_0.
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 $6.16b, v18.8h, #5
+    uaddl2 v19.8h, $2.16b, $6.16b //src[0] + filtered
+    rshrn2 $6.16b, v19.8h, #1 //(+1)>>1
+// }
+.endm
+
+// FILTER_6TAG_8BITS1 followed by a rounding average of the result with src[1] ($3).
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
+    uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun $6.8b, v18.8h, #5
+    uaddl v19.8h, $3.8b, $6.8b //src[1] + filtered
+    rshrn $6.8b, v19.8h, #1 //(+1)>>1
+// }
+.endm
+
+// High-lane counterpart of FILTER_6TAG_8BITS1_AVERAGE_WITH_1.
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
+    mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
+    mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 $6.16b, v18.8h, #5
+    uaddl2 v19.8h, $3.16b, $6.16b //src[1] + filtered
+    rshrn2 $6.16b, v19.8h, #1 //(+1)>>1
+// }
+.endm
+
+// Un-narrowed 16-bit filter sum of the low 8 lanes, left in dst_q ($6). Clobbers v31.
+.macro FILTER_6TAG_8BITS_TO_16BITS1
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
+    uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
+    mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
+    mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+// Un-narrowed 16-bit filter sum of the high 8 lanes, left in dst_q ($6). Clobbers v31.
+.macro FILTER_6TAG_8BITS_TO_16BITS2
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
+    uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
+    mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
+    mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+// Combines three 16-bit partial sums a, b, c into an 8-bit result (low half of dst).
+// Overwrites a ($0).
+.macro FILTER_3_IN_16BITS_TO_8BITS1
+// { // input:a, b, c, dst_d;
+    sub $0.8h, $0.8h, $1.8h //a-b
+    sshr $0.8h, $0.8h, #2 //(a-b)/4
+    sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
+    add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
+    sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
+    add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
+// }
+.endm
+
+// Same as FILTER_3_IN_16BITS_TO_8BITS1 but writes the high half of dst.
+.macro FILTER_3_IN_16BITS_TO_8BITS2
+// { // input:a, b, c, dst_d;
+    sub $0.8h, $0.8h, $1.8h //a-b
+    sshr $0.8h, $0.8h, #2 //(a-b)/4
+    sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
+    add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
+    sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
+    add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
+// }
+.endm
+
+// Builds the three symmetric pair sums a, b, c from two consecutive 8 x int16 vectors.
+.macro UNPACK_2_16BITS_TO_ABC
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    ext $4.16b, $0.16b, $1.16b, #4 //src[0]
+    ext $3.16b, $0.16b, $1.16b, #6 //src[1]
+    add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
+
+    ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
+    ext $2.16b, $0.16b, $1.16b, #8 //src[2]
+    add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
+
+    ext $2.16b, $0.16b, $1.16b, #10 //src[3]
+    add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
+// }
+.endm
+
+// Rounding average (A + B + 1) >> 1 of the low 8 bytes.
+.macro AVERAGE_TWO_8BITS1
+// { // input:dst_d, src_d A and B; working: v30
+    uaddl v30.8h, $2.8b, $1.8b
+    rshrn $0.8b, v30.8h, #1
+// }
+.endm
+
+// Rounding average (A + B + 1) >> 1 of the high 8 bytes; writes the high half of dst.
+.macro AVERAGE_TWO_8BITS2
+// { // input:dst_d, src_d A and B; working: v30
+    uaddl2 v30.8h, $2.16b, $1.16b
+    rshrn2 $0.16b, v30.8h, #1
+// }
+.endm
+
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X},
+    rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
+    uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
+    mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
+    addv $3, $2.4h
+    sqrshrun $0.8b, $0.8h, #5
+// }
+.endm
+
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1,
v22, v23
+// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+    ext.16b $3, $1, $1, #14 // X[0][1][2][3][4][5]O
+    ext.16b $4, $3, $3, #8 // [3][4][5]OX[0][1][2]
+    rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
+    add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
+    smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
+    saddlv $5, $3.4s
+    //sshr $0.2d, $0.2d, #4
+    sqrshrun $0.2s, $0.2d, #10
+    uqxtn $0.4h, $0.4s
+    uqxtn $0.8b, $0.8h
+    // }
+.endm
+
+#else
+// Non-Apple flavour: named macro parameters. Every parameter that is directly
+// followed by a ".<arrangement>" suffix is terminated with "\()"; without it
+// the assembler treats the '.' and the suffix as part of the parameter name
+// and no substitution takes place.
+
+// Low 8 lanes: dst = sat_u8(round((src[-2]+src[3] + a*(src[0]+src[1]) - b*(src[-1]+src[2])) >> 5)).
+// Clobbers v18, v19.
+.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun \arg6\().8b, v18.8h, #5
+// }
+.endm
+
+// Same filter on the high 8 lanes; writes the high half of dst.
+.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 \arg6\().16b, v18.8h, #5
+// }
+.endm
+
+// Filter (low lanes), then rounding average of the result with src[0] (arg2).
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun \arg6\().8b, v18.8h, #5
+    uaddl v19.8h, \arg2\().8b, \arg6\().8b
+    rshrn \arg6\().8b, v19.8h, #1
+// }
+.endm
+
+// High-lane counterpart of FILTER_6TAG_8BITS1_AVERAGE_WITH_0.
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 \arg6\().16b, v18.8h, #5
+    uaddl2 v19.8h, \arg2\().16b, \arg6\().16b
+    rshrn2 \arg6\().16b, v19.8h, #1
+// }
+.endm
+
+// Filter (low lanes), then rounding average of the result with src[1] (arg3).
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun \arg6\().8b, v18.8h, #5
+    uaddl v19.8h, \arg3\().8b, \arg6\().8b
+    rshrn \arg6\().8b, v19.8h, #1
+// }
+.endm
+
+// High-lane counterpart of FILTER_6TAG_8BITS1_AVERAGE_WITH_1.
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+    sqrshrun2 \arg6\().16b, v18.8h, #5
+    uaddl2 v19.8h, \arg3\().16b, \arg6\().16b
+    rshrn2 \arg6\().16b, v19.8h, #1
+// }
+.endm
+
+// Un-narrowed 16-bit filter sum of the low 8 lanes, left in dst_q (arg6). Clobbers v31.
+.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3]
+    uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+    mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+    mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+// Un-narrowed 16-bit filter sum of the high 8 lanes, left in dst_q (arg6). Clobbers v31.
+.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3]
+    uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+    mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+    mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+// Combines three 16-bit partial sums a, b, c into an 8-bit result (low half of dst).
+// Overwrites a (arg0).
+.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
+// { // input:a, b, c, dst_d;
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
+    sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
+    sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6
+// }
+.endm
+
+// Same as FILTER_3_IN_16BITS_TO_8BITS1 but writes the high half of dst.
+.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
+// { // input:a, b, c, dst_d;
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
+    sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
+    sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6
+// }
+.endm
+
+// Builds the three symmetric pair sums a, b, c from two consecutive 8 x int16 vectors.
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]
+    ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]
+    add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1]
+
+    ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1]
+    ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2]
+    add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2]
+
+    ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3]
+    add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3]
+// }
+.endm
+
+// Rounding average (A + B + 1) >> 1 of the low 8 bytes.
+.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
+// { // input:dst_d, src_d A and B; working: v30
+    uaddl v30.8h, \arg2\().8b, \arg1\().8b
+    rshrn \arg0\().8b, v30.8h, #1
+// }
+.endm
+
+// Rounding average (A + B + 1) >> 1 of the high 8 bytes; writes the high half of dst.
+.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
+// { // input:dst_d, src_d A and B; working: v30
+    uaddl2 v30.8h, \arg2\().16b, \arg1\().16b
+    rshrn2 \arg0\().16b, v30.8h, #1
+// }
+.endm
+
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
+// when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X},
+    rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
+    uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]*
+    mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32]
+    addv \arg3, \arg2\().4h
+    sqrshrun \arg0\().8b, \arg0\().8h, #5
+// }
+.endm
+
+// The parameter list was missing although the body refers to \arg0..\arg5, and
+// the two ext instructions used the Apple-only "ext.16b" spelling.
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
+// v0, v1, v22, v23
+// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+    ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14 // X[0][1][2][3][4][5]O
+    ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8 // [3][4][5]OX[0][1][2]
+    rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O
+    add \arg3\().8h, \arg3\().8h, \arg4\().8h // each 16bits, *[50][41][32][23][14][05]*
+    smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32]
+    saddlv \arg5, \arg3\().4s
+    //sshr \arg0\().2d, \arg0\().2d, #4
+    sqrshrun \arg0\().2s, \arg0\().2d, #10
+    uqxtn \arg0\().4h, \arg0\().4s
+    uqxtn \arg0\().8b, \arg0\().8h
+    // }
+.endm
+#endif
+
+//(const
uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4})
+// The signature above applies to every McHorVerXY function below. The three
+// int32_t arguments are consumed as 64-bit registers (post-index strides and
+// the loop counter), so each function sign-extends w1/w3/w4 on entry: the
+// upper 32 bits of an argument register holding a 32-bit value are unspecified.
+//
+// McHorVer20*: per row, dst[x] = 6-tap filter over src[x-2..x+3] along the row.
+// McHorVer10*: that result rounding-averaged with src[x].
+// McHorVer30*: that result rounding-averaged with src[x+1].
+
+// 16 pixels per row, one row per iteration.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2 // start at src[-2]
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+w16_h_mc_luma_loop:
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
+    trn1 v2.2d, v2.2d, v3.2d // v2 = first 16 bytes, v4 = following 8
+    //prfm pldl1strm, [x0]
+    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]
+
+    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
+
+    sub x4, x4, #1
+    st1 {v20.16b}, [x2], x3 //write 16Byte
+    cbnz x4, w16_h_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// 8 pixels per row. v4 is not loaded here: only bytes 0-7 of each ext result
+// are consumed, and those come entirely from v2.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+w8_h_mc_luma_loop:
+    ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]
+    trn1 v2.2d, v2.2d, v3.2d
+    //prfm pldl1strm, [x0]
+    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]
+
+    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+
+    sub x4, x4, #1
+    st1 {v20.8b}, [x2], x3 //write 8Byte
+    cbnz x4, w8_h_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// 4 pixels per row, two rows per iteration: the rows are interleaved 4 bytes at
+// a time so one 8-lane filter pass yields both output rows.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+    asr x4, x4, #1 // loop count = iHeight / 2
+w4_h_mc_luma_loop:
+    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
+    //prfm pldl1strm, [x0]
+    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]
+    //prfm pldl1strm, [x0]
+
+    zip1 v4.4s, v2.4s, v3.4s // v4=src[-2] 1st:2nd
+    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[-1:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[-1:6]
+    zip1 v5.4s, v2.4s, v3.4s // v5=src[-1:2] 1st:2nd
+    ext v7.16b, v5.16b, v4.16b, #8 //v7=src[3:6] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[0:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[0:6]
+    zip1 v6.4s, v2.4s, v3.4s // v6=src[0:3] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[1:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[1:6]
+    zip1 v16.4s, v2.4s, v3.4s // v16=src[1:4] 1st:2nd
+
+    FILTER_6TAG_8BITS1 v4, v5, v6, v16, v17, v7, v20, v0, v1
+
+    st1 {v20.s}[0], [x2], x3 //write 4Byte
+    st1 {v20.s}[1], [x2], x3 //write 4Byte
+    sub x4, x4, #1
+    cbnz x4, w4_h_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// 16 pixels per row; filtered value averaged with src[x] (v6).
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+w16_xy_10_mc_luma_loop:
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
+    trn1 v2.2d, v2.2d, v3.2d
+    //prfm pldl1strm, [x0]
+    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]
+
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1
+
+    sub x4, x4, #1
+    st1 {v20.16b}, [x2], x3 //write 16Byte
+    cbnz x4, w16_xy_10_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// 8 pixels per row; v4 is unread in the consumed lanes (see McHorVer20WidthEq8).
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq8_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+w8_xy_10_mc_luma_loop:
+    ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]
+    trn1 v2.2d, v2.2d, v3.2d
+    //prfm pldl1strm, [x0]
+    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]
+
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1
+
+    sub x4, x4, #1
+    st1 {v20.8b}, [x2], x3 //write 8Byte
+    cbnz x4, w8_xy_10_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// 4 pixels per row, two interleaved rows per iteration.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+    asr x4, x4, #1 // loop count = iHeight / 2
+w4_xy_10_mc_luma_loop:
+    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
+    //prfm pldl1strm, [x0]
+    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]
+    //prfm pldl1strm, [x0]
+
+    zip1 v4.4s, v2.4s, v3.4s // v4=src[-2] 1st:2nd
+    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[-1:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[-1:6]
+    zip1 v5.4s, v2.4s, v3.4s // v5=src[-1:2] 1st:2nd
+    ext v7.16b, v5.16b, v4.16b, #8 //v7=src[3:6] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[0:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[0:6]
+    zip1 v6.4s, v2.4s, v3.4s // v6=src[0:3] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[1:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[1:6]
+    zip1 v16.4s, v2.4s, v3.4s // v16=src[1:4] 1st:2nd
+
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v16, v17, v7, v20, v0, v1
+
+    st1 {v20.s}[0], [x2], x3 //write 4Byte
+    st1 {v20.s}[1], [x2], x3 //write 4Byte
+    sub x4, x4, #1
+    cbnz x4, w4_xy_10_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// 16 pixels per row; filtered value averaged with src[x+1] (v7).
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq16_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+w16_xy_30_mc_luma_loop:
+    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
+    trn1 v2.2d, v2.2d, v3.2d
+    //prfm pldl1strm, [x0]
+    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]
+
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+
FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+
+    sub x4, x4, #1
+    st1 {v20.16b}, [x2], x3 //write 16Byte
+    cbnz x4, w16_xy_30_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Arguments for every function below: pSrc{x0}, iSrcStride{x1}, pDst{x2},
+// iDstStride{x3}, iHeight{x4}. The int32_t values are used as 64-bit
+// registers, so w1/w3/w4 are sign-extended on entry (their upper 32 bits are
+// unspecified when the function is called).
+
+// Row filter, 8 pixels per row, result averaged with src[x+1] (v7). v4 is not
+// loaded: only bytes 0-7 of each ext result are consumed and they come from v2.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq8_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2 // start at src[-2]
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+w8_xy_30_mc_luma_loop:
+    ld1 {v2.8b, v3.8b}, [x0], x1 //only use 13(8+5); v2=src[-2]
+    trn1 v2.2d, v2.2d, v3.2d
+    //prfm pldl1strm, [x0]
+    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
+    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
+    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
+    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
+    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]
+
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1
+
+    sub x4, x4, #1
+    st1 {v20.8b}, [x2], x3 //write 8Byte
+    cbnz x4, w8_xy_30_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+// Row filter, 4 pixels per row, two interleaved rows per iteration, result
+// averaged with src[x+1] (v16).
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
+    sxtw x1, w1 // iSrcStride
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, #2
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+    asr x4, x4, #1 // loop count = iHeight / 2
+w4_xy_30_mc_luma_loop:
+    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
+    //prfm pldl1strm, [x0]
+    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]
+    //prfm pldl1strm, [x0]
+
+    zip1 v4.4s, v2.4s, v3.4s // v4=src[-2] 1st:2nd
+    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[-1:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[-1:6]
+    zip1 v5.4s, v2.4s, v3.4s // v5=src[-1:2] 1st:2nd
+    ext v7.16b, v5.16b, v4.16b, #8 //v7=src[3:6] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[0:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[0:6]
+    zip1 v6.4s, v2.4s, v3.4s // v6=src[0:3] 1st:2nd
+
+    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[1:6]
+    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[1:6]
+    zip1 v16.4s, v2.4s, v3.4s // v16=src[1:4] 1st:2nd
+
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v16, v17, v7, v20, v0, v1
+
+    st1 {v20.s}[0], [x2], x3 //write 4Byte
+    st1 {v20.s}[1], [x2], x3 //write 4Byte
+    sub x4, x4, #1
+    cbnz x4, w4_xy_30_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Column filter over six consecutive rows, 16 pixels wide, result averaged
+// with the src[0*stride] row. Eight output rows per iteration (iHeight is
+// decremented by 8), keeping a sliding window of rows in v2..v7.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq16_AArch64_neon
+    sxtw x1, w1 // iSrcStride (must precede the address computation below)
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, x1, lsl #1 // start two rows above
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+
+w16_xy_01_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[7*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[8*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[9*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[10*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
+
+    // Re-seat the five most recent rows into v2..v6 for the next iteration.
+    // Written as "mov vD.16b, vS.16b": the "mov.16b" spelling is Apple-only.
+    mov v3.16b, v5.16b
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
+    sub x4, x4, #8
+    cbnz x4, w16_xy_01_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Column filter, 8 pixels wide, result averaged with the src[0*stride] row.
+// Four output rows per iteration.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq8_AArch64_neon
+    sxtw x1, w1 // iSrcStride (must precede the address computation below)
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, x1, lsl #1
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.8b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.8b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.8b}, [x0], x1 // v6=src[2*stride]
+
+
+w8_xy_01_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 1 line
+
+    //prfm
pldl1strm, [x0, x1]
+    ld1 {v3.8b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 2 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.8b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line
+
+    // Re-seat the five most recent rows into v2..v6 for the next iteration.
+    // Written as "mov vD.16b, vS.16b": the "mov.16b" spelling is Apple-only.
+    mov v5.16b, v3.16b
+    mov v3.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v6.16b
+    mov v6.16b, v4.16b
+    mov v4.16b, v7.16b
+    sub x4, x4, #4
+    cbnz x4, w8_xy_01_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Arguments for the functions below: pSrc{x0}, iSrcStride{x1}, pDst{x2},
+// iDstStride{x3}, iHeight{x4}. The int32_t values are used as 64-bit
+// registers, so w1/w3/w4 are sign-extended on entry (their upper 32 bits are
+// unspecified when the function is called).
+
+// Column filter, 4 pixels wide, result averaged with the src[0*stride] row.
+// Each register holds two consecutive rows (lane s[0] = row k, s[1] = row k+1),
+// so one 8-lane filter pass produces two output rows; four rows per iteration.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq4_AArch64_neon
+    sxtw x1, w1 // iSrcStride (must precede the address computation below)
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, x1, lsl #1 // start two rows above
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride]
+    mov v2.s[1], v3.s[0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride]
+    mov v3.s[1], v4.s[0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride]
+    mov v4.s[1], v5.s[0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride]
+    mov v5.s[1], v6.s[0]
+
+w4_xy_01_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.s}[0], [x0], x1 // v7=src[3*stride]
+    mov v6.s[1], v7.s[0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line
+    st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line
+    mov v2.s[0], v7.s[1]
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.s}[1], [x0], x1 // v3=src[6*stride]
+    mov v3.s[0], v2.s[1]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line
+    st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
+    mov v4.s[0], v3.s[1]
+
+    // Rotate the row pairs back into v2..v6 (v21 is scratch).
+    // Written as "mov vD.8b, vS.8b": the "mov.8b" spelling is Apple-only.
+    mov v21.8b, v6.8b
+    mov v6.8b, v4.8b
+    mov v4.8b, v2.8b
+    mov v2.8b, v21.8b
+    mov v21.8b, v3.8b
+    mov v3.8b, v7.8b
+    mov v7.8b, v5.8b
+    mov v5.8b, v21.8b
+
+    sub x4, x4, #4
+    cbnz x4, w4_xy_01_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Column filter over six consecutive rows, 16 pixels wide, result averaged
+// with the src[1*stride] row. Eight output rows per iteration.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq16_AArch64_neon
+    sxtw x1, w1 // iSrcStride (must precede the address computation below)
+    sxtw x3, w3 // iDstStride
+    sxtw x4, w4 // iHeight
+    sub x0, x0, x1, lsl #1
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
+
+
+w16_xy_03_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.16b}, [x0], x1 // v5=src[7*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v6, v7, v2, v3, v4, v5, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line
+
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.16b}, [x0], x1 // v6=src[8*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v7, v2, v3, v4, v5, v6, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.16b}, [x0], x1 // v7=src[9*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.16b}, [x0], x1 // v2=src[10*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
+    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
+
+    // Re-seat the five most recent rows into v2..v6 for the next iteration.
+    mov v3.16b, v5.16b
+    mov v5.16b, v7.16b
+    mov v7.16b, v2.16b
+    mov v2.16b, v4.16b
+    mov v4.16b, v6.16b
+    mov v6.16b, v7.16b
+    sub x4, x4, #8
+    cbnz x4, w16_xy_03_mc_luma_loop
+WELS_ASM_ARCH64_FUNC_END
+
+
+// NOTE(review): this function continues past the reviewed span, so its
+// instructions are left untouched. x1 and x3 are consumed as 64-bit registers
+// here as well; confirm whether the same sign-extension is wanted.
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq8_AArch64_neon
+    sub x0, x0, x1, lsl #1
+    movi v0.8h, #20, lsl #0
+    movi v1.8h, #5, lsl #0
+
+    //prfm pldl1strm, [x0]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v4.8b}, [x0], x1 // v4=src[0*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v5.8b}, [x0], x1 // v5=src[1*stride]
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v6.8b}, [x0], x1 // v6=src[2*stride]
+
+
+w8_xy_03_mc_luma_loop:
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
+    st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line
+
+    //prfm pldl1strm, [x0, x1]
+    ld1 {v2.8b}, [x0], x1 // v2=src[3*stride]
+    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7,
v2, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 1 line + + //prfm pldl1strm, [x0, x1] + ld1 {v3.8b}, [x0], x1 // v3=src[3*stride] + FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 2 line + + //prfm pldl1strm, [x0, x1] + ld1 {v4.8b}, [x0], x1 // v4=src[3*stride] + FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line + + mov.16b v5, v3 + mov.16b v3, v7 + mov.16b v7, v2 + mov.16b v2, v6 + mov.16b v6, v4 + mov.16b v4, v7 + sub x4, x4, #4 + cbnz x4, w8_xy_03_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq4_AArch64_neon + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride] + mov v2.s[1], v3.s[0] + //prfm pldl1strm, [x0, x1] + ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride] + mov v3.s[1], v4.s[0] + //prfm pldl1strm, [x0, x1] + ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride] + mov v4.s[1], v5.s[0] + //prfm pldl1strm, [x0, x1] + ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride] + mov v5.s[1], v6.s[0] + +w4_xy_03_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.s}[0], [x0], x1 // v7=src[3*stride] + mov v6.s[1], v7.s[0] + //prfm pldl1strm, [x0, x1] + ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride] + FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line + st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line + mov v2.s[0], v7.s[1] + + //prfm pldl1strm, [x0, x1] + ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.s}[1], [x0], x1 // v2=src[6*stride] + mov v3.s[0], v2.s[1] + FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line + st1 {v20.s}[1], [x2], x3 //write 
4Byte : 3 line + mov v4.s[0], v3.s[1] + + mov.8b v21, v6 + mov.8b v6, v4 + mov.8b v4, v2 + mov.8b v2, v21 + mov.8b v21, v3 + mov.8b v3, v7 + mov.8b v7, v5 + mov.8b v5, v21 + + sub x4, x4, #4 + cbnz x4, w4_xy_03_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq16_AArch64_neon + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.16b}, [x0], x1 // v5=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v6.16b}, [x0], x1 // v6=src[2*stride] + + +w16_xy_02_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] + FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 + FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] + FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + FILTER_6TAG_8BITS2 v4, v5, v6, v7, v2, v3, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] + FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 + FILTER_6TAG_8BITS2 v5, v6, v7, v2, v3, v4, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v5.16b}, [x0], x1 // v5=src[3*stride] + FILTER_6TAG_8BITS1 v6, v7, v2, v3, v4, v5, v20, v0, v1 + FILTER_6TAG_8BITS2 v6, v7, v2, v3, v4, v5, v20, v0, v1 + st1 {v20.16b}, [x2], 
x3 //write 16Byte : 4 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v6.16b}, [x0], x1 // v6=src[3*stride] + FILTER_6TAG_8BITS1 v7, v2, v3, v4, v5, v6, v20, v0, v1 + FILTER_6TAG_8BITS2 v7, v2, v3, v4, v5, v6, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line + + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] + FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 + FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line + + mov.16b v3, v5 + mov.16b v5, v7 + mov.16b v7, v2 + mov.16b v2, v4 + mov.16b v4, v6 + mov.16b v6, v7 + sub x4, x4, #8 + cbnz x4, w16_xy_02_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq8_AArch64_neon + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v4.8b}, [x0], x1 // v4=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.8b}, [x0], x1 // v5=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v6.8b}, [x0], x1 // v6=src[2*stride] + + +w8_xy_02_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b}, [x0], x1 // v2=src[3*stride] + FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 1 line + + //prfm pldl1strm, [x0, x1] + ld1 {v3.8b}, [x0], x1 // v3=src[3*stride] + FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + st1 {v20.8b}, 
[x2], x3 //write 8Byte : 2 line + + //prfm pldl1strm, [x0, x1] + ld1 {v4.8b}, [x0], x1 // v4=src[3*stride] + FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line + + mov.16b v5, v3 + mov.16b v3, v7 + mov.16b v7, v2 + mov.16b v2, v6 + mov.16b v6, v4 + mov.16b v4, v7 + sub x4, x4, #4 + cbnz x4, w8_xy_02_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq4_AArch64_neon + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride] + mov v2.s[1], v3.s[0] + //prfm pldl1strm, [x0, x1] + ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride] + mov v3.s[1], v4.s[0] + //prfm pldl1strm, [x0, x1] + ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride] + mov v4.s[1], v5.s[0] + //prfm pldl1strm, [x0, x1] + ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride] + mov v5.s[1], v6.s[0] + +w4_xy_02_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.s}[0], [x0], x1 // v7=src[3*stride] + mov v6.s[1], v7.s[0] + //prfm pldl1strm, [x0, x1] + ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line + st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line + mov v2.s[0], v7.s[1] + + //prfm pldl1strm, [x0, x1] + ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.s}[1], [x0], x1 // v2=src[6*stride] + mov v3.s[0], v2.s[1] + FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line + st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line + mov v4.s[0], v3.s[1] + + mov.8b v21, v6 + mov.8b v6, v4 + mov.8b v4, v2 + mov.8b v2, v21 + mov.8b v21, v3 + mov.8b v3, v7 + mov.8b v7, v5 + mov.8b v5, v21 + + sub x4, x4, #4 + cbnz x4, w4_xy_02_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + + 
+WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq16_AArch64_neon + stp d8, d9, [sp,#-16]! + stp d10, d11, [sp,#-16]! + stp d12, d13, [sp,#-16]! + stp d14, d15, [sp,#-16]! + sub x0, x0, #2 + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14=src[2*stride] + +w16_hv_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 
+ FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line + + //prfm pldl1strm, [x0, x1] + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v2=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line + + //prfm pldl1strm, [x0, x1] + ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v2=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line + + //prfm pldl1strm, [x0, x1] + ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v2=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1 + 
UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line + + //prfm pldl1strm, [x0, x1] + ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v2=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line + + //prfm pldl1strm, [x0, x1] + ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v2=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + 
FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line + + mov.16b v5, v11 + mov.16b v11, v17 + mov.16b v30, v2 + mov.16b v2, v8 + mov.16b v8, v14 + mov.16b v14, v30 + + mov.16b v6, v12 + mov.16b v12, v18 + mov.16b v30, v3 + mov.16b v3, v9 + mov.16b v9, v15 + mov.16b v15, v30 + + mov.16b v7, v13 + mov.16b v13, v19 + mov.16b v30, v4 + mov.16b v4, v10 + mov.16b v10, v16 + mov.16b v16, v30 + + sub x4, x4, #8 + cbnz x4, w16_hv_mc_luma_loop + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq8_AArch64_neon + sub x0, x0, #2 + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v5=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v8=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.16b}, [x0], x1 // v11=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v6.16b}, [x0], x1 // v14=src[2*stride] + +w8_hv_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, 
v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line + + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line + + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line + + + mov.16b v5, v3 + mov.16b v3, v7 + mov.16b v30, v2 + mov.16b v2, v6 + mov.16b v6, v4 + mov.16b v4, v30 + + sub x4, x4, #4 + cbnz x4, w8_hv_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq4_AArch64_neon + sub x0, x0, #2 + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.16b}, [x0], x1 // v5=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v6.16b}, [x0], x1 // v6=src[2*stride] + +w4_hv_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] + // vertical filtered into v20/v21 1st line + FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, 
v7, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1 + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v16=src[4*stride] + // vertical filtered into v22/v23 2nd line + FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26 + UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30 + zip1 v24.2d, v24.2d, v28.2d + zip1 v25.2d, v25.2d, v29.2d + zip1 v26.2d, v26.2d, v30.2d + FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0] + st1 {v27.s}[0], [x2], x3 //write 4Byte : 0 line + st1 {v27.s}[1], [x2], x3 //write 4Byte : 1 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[5*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1 + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[6*stride] + FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26 + UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30 + zip1 v24.2d, v24.2d, v28.2d + zip1 v25.2d, v25.2d, v29.2d + zip1 v26.2d, v26.2d, v30.2d + FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0] + st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line + st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line + + mov.16b v5, v3 + mov.16b v3, v7 + mov.16b v30, v2 + mov.16b v2, v6 + mov.16b v6, v4 + mov.16b v4, v30 + + sub x4, x4, #4 + cbnz x4, w4_hv_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon + //prfm pldl1strm, [x0] +w16_copy_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v0.16b}, [x0], x1 //read 16Byte : 0 line + st1 {v0.16b}, [x2], x3 //write 16Byte : 0 line + //prfm pldl1strm, [x0, x1] + ld1 
{v1.16b}, [x0], x1 //read 16Byte : 1 line + st1 {v1.16b}, [x2], x3 //write 16Byte : 1 line + + sub x4, x4, #2 + cbnz x4, w16_copy_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon + //prfm pldl1strm, [x0] +w8_copy_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v0.8b}, [x0], x1 //read 16Byte : 0 line + st1 {v0.8b}, [x2], x3 //write 16Byte : 0 line + //prfm pldl1strm, [x0, x1] + ld1 {v1.8b}, [x0], x1 //read 16Byte : 1 line + st1 {v1.8b}, [x2], x3 //write 16Byte : 1 line + + sub x4, x4, #2 + cbnz x4, w8_copy_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon + //prfm pldl1strm, [x0] +w4_copy_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v0.s}[0], [x0], x1 //read 16Byte : 0 line + st1 {v0.s}[0], [x2], x3 //write 16Byte : 0 line + //prfm pldl1strm, [x0, x1] + ld1 {v1.s}[0], [x0], x1 //read 16Byte : 1 line + st1 {v1.s}[0], [x2], x3 //write 16Byte : 1 line + + sub x4, x4, #2 + cbnz x4, w4_copy_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon + +enc_w16_pix_avg_loop: + ld1 {v0.16b}, [x2], x3 //read 16Byte : src0: 0 line + ld1 {v1.16b}, [x4], x5 //read 16Byte : src1: 0 line + ld1 {v2.16b}, [x2], x3 //read 16Byte : src0: 1 line + ld1 {v3.16b}, [x4], x5 //read 16Byte : src1: 1 line + ld1 {v4.16b}, [x2], x3 //read 16Byte : src0: 2 line + ld1 {v5.16b}, [x4], x5 //read 16Byte : src1: 2 line + ld1 {v6.16b}, [x2], x3 //read 16Byte : src0: 3 line + ld1 {v7.16b}, [x4], x5 //read 16Byte : src1: 3 line + AVERAGE_TWO_8BITS1 v16, v0, v1 + AVERAGE_TWO_8BITS2 v16, v0, v1 + st1 {v16.16b}, [x0], x1 //write 16Byte : 0 line + + + AVERAGE_TWO_8BITS1 v16, v2, v3 + AVERAGE_TWO_8BITS2 v16, v2, v3 + st1 {v16.16b}, [x0], x1 //write 16Byte : 1 line + + + AVERAGE_TWO_8BITS1 v16, v4, v5 + AVERAGE_TWO_8BITS2 v16, v4, v5 + st1 {v16.16b}, [x0], x1 //write 16Byte : 2 line + + AVERAGE_TWO_8BITS1 v16, v6, v7 + AVERAGE_TWO_8BITS2 v16, v6, v7 + st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line + + 
sub x6, x6, #4 + cbnz x6, enc_w16_pix_avg_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon + //prfm pldl1strm, [x2] + //prfm pldl1strm, [x4] +enc_w8_pix_avg_loop: + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v0.8b}, [x2], x3 //read 8Byte : src0: 0 line + ld1 {v1.8b}, [x4], x5 //read 8Byte : src1: 0 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v2.8b}, [x2], x3 //read 8Byte : src0: 1 line + ld1 {v3.8b}, [x4], x5 //read 8Byte : src1: 1 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v4.8b}, [x2], x3 //read 8Byte : src0: 2 line + ld1 {v5.8b}, [x4], x5 //read 8Byte : src1: 2 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v6.8b}, [x2], x3 //read 8Byte : src0: 3 line + ld1 {v7.8b}, [x4], x5 //read 8Byte : src1: 3 line + AVERAGE_TWO_8BITS1 v16, v0, v1 + st1 {v16.8b}, [x0], x1 //write 8Byte : 0 line + + AVERAGE_TWO_8BITS1 v16, v2, v3 + st1 {v16.8b}, [x0], x1 //write 8Byte : 1 line + + + AVERAGE_TWO_8BITS1 v16, v4, v5 + st1 {v16.8b}, [x0], x1 //write 8Byte : 2 line + + AVERAGE_TWO_8BITS1 v16, v6, v7 + st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line + + sub x6, x6, #4 + cbnz x6, enc_w8_pix_avg_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon + //prfm pldl1strm, [x2] + //prfm pldl1strm, [x4] +w16_pix_avg_loop: + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v0.16b}, [x2], x3 //read 16Byte : src0: 0 line + ld1 {v1.16b}, [x4], x5 //read 16Byte : src1: 0 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v2.16b}, [x2], x3 //read 16Byte : src0: 1 line + ld1 {v3.16b}, [x4], x5 //read 16Byte : src1: 1 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v4.16b}, [x2], x3 //read 16Byte : src0: 2 line + ld1 {v5.16b}, [x4], x5 //read 16Byte : src1: 2 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v6.16b}, [x2], x3 //read 16Byte : src0: 3 
line + ld1 {v7.16b}, [x4], x5 //read 16Byte : src1: 3 line + AVERAGE_TWO_8BITS1 v16, v0, v1 + AVERAGE_TWO_8BITS2 v16, v0, v1 + st1 {v16.16b}, [x0], x1 //write 16Byte : 0 line + + + AVERAGE_TWO_8BITS1 v16, v2, v3 + AVERAGE_TWO_8BITS2 v16, v2, v3 + st1 {v16.16b}, [x0], x1 //write 16Byte : 1 line + + + AVERAGE_TWO_8BITS1 v16, v4, v5 + AVERAGE_TWO_8BITS2 v16, v4, v5 + st1 {v16.16b}, [x0], x1 //write 16Byte : 2 line + + AVERAGE_TWO_8BITS1 v16, v6, v7 + AVERAGE_TWO_8BITS2 v16, v6, v7 + st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line + + sub x6, x6, #4 + cbnz x6, w16_pix_avg_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon + //prfm pldl1strm, [x2] + //prfm pldl1strm, [x4] +w8_pix_avg_loop: + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v0.8b}, [x2], x3 //read 8Byte : src0: 0 line + ld1 {v1.8b}, [x4], x5 //read 8Byte : src1: 0 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v2.8b}, [x2], x3 //read 8Byte : src0: 1 line + ld1 {v3.8b}, [x4], x5 //read 8Byte : src1: 1 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v4.8b}, [x2], x3 //read 8Byte : src0: 2 line + ld1 {v5.8b}, [x4], x5 //read 8Byte : src1: 2 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v6.8b}, [x2], x3 //read 8Byte : src0: 3 line + ld1 {v7.8b}, [x4], x5 //read 8Byte : src1: 3 line + AVERAGE_TWO_8BITS1 v16, v0, v1 + st1 {v16.8b}, [x0], x1 //write 8Byte : 0 line + + AVERAGE_TWO_8BITS1 v16, v2, v3 + st1 {v16.8b}, [x0], x1 //write 8Byte : 1 line + + + AVERAGE_TWO_8BITS1 v16, v4, v5 + st1 {v16.8b}, [x0], x1 //write 8Byte : 2 line + + AVERAGE_TWO_8BITS1 v16, v6, v7 + st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line + + sub x6, x6, #4 + cbnz x6, w8_pix_avg_loop +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq4_AArch64_neon + //prfm pldl1strm, [x2] + //prfm pldl1strm, [x4] +w4_pix_avg_loop: + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v0.s}[0], [x2], x3 
//read 4Byte : src0: 0 line + ld1 {v1.s}[0], [x4], x5 //read 4Byte : src1: 0 line + //prfm pldl1strm, [x2, x3] + //prfm pldl1strm, [x4, x5] + ld1 {v0.s}[1], [x2], x3 //read 4Byte : src0: 1 line + ld1 {v1.s}[1], [x4], x5 //read 4Byte : src1: 1 line + AVERAGE_TWO_8BITS1 v2, v0, v1 + st1 {v2.s}[0], [x0], x1 //write 4Byte : 0 line + st1 {v2.s}[1], [x0], x1 //write 4Byte : 1 line + + sub x6, x6, #2 + cbnz x6, w4_pix_avg_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon + // Bilinear chroma interpolation, 8 pixels wide, two output rows per loop pass + // Each output byte is the A/B/C/D-weighted sum of src[x], src[x 1 right], src[x 1 down], src[x 1 down 1 right], narrowed with a rounding right shift by 6 (rshrn) + // x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, x4 = pWeights (four consecutive bytes A,B,C,D are read), w5 = iHeight (positive multiple of 2) + // Strides and height are int32_t in the C prototype; AAPCS64 leaves bits 63:32 unspecified, so widen them before the first post-indexed load + sxtw x1, w1 + sxtw x3, w3 + sxtw x5, w5 + ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D + ld1 {v0.16b}, [x0], x1 // src[x] + ext.16b v1, v0, v0, #1 // src[x+1] +w8_mc_chroma_loop: + ld1 {v2.16b}, [x0], x1 // src[x+stride] + ext.16b v3, v2, v2, #1 // src[x+stride+1] + ld1 {v18.16b}, [x0], x1 // src[x+2*stride] + ext.16b v19, v18, v18, #1 // src[x+2*stride+1] + + umull v16.8h, v0.8b, v4.8b + umlal v16.8h, v1.8b, v5.8b + umlal v16.8h, v2.8b, v6.8b + umlal v16.8h, v3.8b, v7.8b + rshrn v17.8b, v16.8h, #6 + st1 {v17.8b}, [x2], x3 + + + umull v16.8h, v2.8b, v4.8b + umlal v16.8h, v3.8b, v5.8b + umlal v16.8h, v18.8b, v6.8b + umlal v16.8h, v19.8b, v7.8b + rshrn v17.8b, v16.8h, #6 + st1 {v17.8b}, [x2], x3 + + mov.16b v0, v18 // the last row read becomes the top row of the next pass + mov.16b v1, v19 + sub x5, x5, #2 + cbnz x5, w8_mc_chroma_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon + // 4-pixel-wide bilinear chroma interpolation; zip1 packs two 4-byte rows into one 8-byte vector so both output rows come from a single multiply-accumulate chain + // Same argument registers as the 8-wide routine: x0 src, w1 src stride, x2 dst, w3 dst stride, x4 weights, w5 height (positive multiple of 2) + // Widen the int32_t strides and height: their upper register halves are unspecified on entry + sxtw x1, w1 + sxtw x3, w3 + sxtw x5, w5 + ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D + ld1 {v0.8b}, [x0], x1 // src[x] + ext.8b v1, v0, v0, #1 // src[x+1] +w4_mc_chroma_loop: + ld1 {v2.8b}, [x0], x1 // src[x+stride] + ext.8b v3, v2, v2, #1 // src[x+stride+1] + ld1 {v18.8b}, [x0], x1 // src[x+2*stride] + ext.8b v19, v18, v18, #1 // src[x+2*stride+1] + + zip1 v0.4s, v0.4s, v2.4s + zip1 v1.4s, v1.4s, v3.4s + zip1 v2.4s, v2.4s, v18.4s + zip1 v3.4s, v3.4s, v19.4s + + umull v16.8h, v0.8b, v4.8b + umlal v16.8h, v1.8b, v5.8b + umlal v16.8h, v2.8b, v6.8b + umlal v16.8h, v3.8b, v7.8b + rshrn v17.8b, v16.8h, #6 + st1 {v17.s}[0], [x2], x3 + st1 {v17.s}[1], [x2], x3 + + mov.8b
v0, v18 + mov.8b v1, v19 + sub x5, x5, #2 + cbnz x5, w4_mc_chroma_loop +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon + // Horizontal 6-tap luma filter producing 17 output bytes per row: 16 via FILTER_6TAG_8BITS1/2 and the 17th via FILTER_SINGLE_TAG_8BITS (macros and filter_para are defined elsewhere) + // x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight (one row per loop pass) - register roles follow the usage below and the matching _neon prototype; confirm against the AArch64 prototype + // x5 is scratch: it holds the 16-byte step between the two stores of a row, and x3 is reduced by 16 to compensate + // The int32_t strides and height have unspecified bits 63:32 on entry (AAPCS64), so widen them before any 64-bit use + sxtw x1, w1 + sxtw x3, w3 + sxtw x4, w4 + sub x0, x0, #2 + sub x3, x3, #16 + mov x5, #16 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + ldr q22, filter_para +w17_h_mc_luma_loop: + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 22(17+5); v2=src[-2] + trn1 v2.2d, v2.2d, v3.2d + //prfm pldl1strm, [x0] + ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1] + ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0] + ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1] + ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2] + ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3] + + FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1 + FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1 + st1 {v20.16b}, [x2], x5 //write 16Byte + + ext.8b v21, v4, v4, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21 + st1 {v21.b}[0], [x2], x3 //write 17th Byte (index 16) + + sub x4, x4, #1 + cbnz x4, w17_h_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon + // Horizontal 6-tap luma filter producing 9 output bytes per row: 8 via FILTER_6TAG_8BITS1 and the 9th via FILTER_SINGLE_TAG_8BITS + // x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight (one row per loop pass) - confirm against the AArch64 prototype + // x5 is scratch (8-byte step between the two stores of a row); x3 is reduced by 8 to compensate + // Widen the int32_t strides and height: their upper register halves are unspecified on entry + sxtw x1, w1 + sxtw x3, w3 + sxtw x4, w4 + sub x0, x0, #2 + sub x3, x3, #8 + mov x5, #8 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + ldr q22, filter_para +w9_h_mc_luma_loop: + ld1 {v2.8b, v3.8b}, [x0], x1 //only use 14(9+5); v2=src[-2] + trn1 v2.2d, v2.2d, v3.2d + //prfm pldl1strm, [x0] + // NOTE(review): v4 is never loaded in this routine, so the ext results below carry undefined bytes in their upper lanes; harmless only if FILTER_6TAG_8BITS1 reads the low 8 lanes - confirm against the macro + ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1] + ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0] + ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1] + ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2] + ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3] + + FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1 + st1 {v20.8b}, [x2], x5 //write 8Byte + + ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21 + st1 {v21.b}[0], [x2], x3 //write 9th Byte + + sub x4, x4, #1 + cbnz x4, w9_h_mc_luma_loop +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN
McHorVer22Width17_AArch64_neon + // Centre half-sample luma filter for a 17-byte-wide block: each row is filtered vertically into 16-bit intermediates, then horizontally back to 8 bits + // x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight - presumably the same layout as the 32-bit _neon prototype; confirm against the AArch64 prototype + // iHeight must be 8*k plus 1: the loop emits 8 rows per pass and one extra row is produced after the loop + // Per row, st1 {v26.16b} writes bytes 0..15 and st1 {v26.b}[0] writes byte index 16 (called the 16th Byte in the comments below) + // v8-v15 are used as row storage, so their callee-saved low halves d8-d15 are spilled here and restored before return + // The int32_t strides and height have unspecified bits 63:32 on entry (AAPCS64), so widen them before any 64-bit use + sxtw x1, w1 + sxtw x3, w3 + sxtw x4, w4 + stp d8, d9, [sp,#-16]! + stp d10, d11, [sp,#-16]! + stp d12, d13, [sp,#-16]! + stp d14, d15, [sp,#-16]! + sub x0, x0, #2 + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + sub x3, x3, #16 + mov x5, #16 + ldr q29, filter_para + + sub x4, x4, #1 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14=src[2*stride] + +w17_hv_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[4*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output
to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15Byte : 1 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 1 line + + //prfm pldl1strm, [x0, x1] + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5=src[5*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15Byte : 2 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 2 line + + //prfm pldl1strm, [x0, x1] + ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8=src[6*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15Byte : 3 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 3
line + + //prfm pldl1strm, [x0, x1] + ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11=src[7*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15Byte : 4 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 4 line + + //prfm pldl1strm, [x0, x1] + ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14=src[8*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15Byte : 5 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 5 line + + //prfm pldl1strm, [x0, x1] + ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[9*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26
//output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15Byte : 6 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 6 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[10*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15Byte : 7 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line + + // rotate the five newest rows (6..10) back into the v2/v5/v8/v11/v14 column sets expected at the loop head; v30 is the swap temporary + mov.16b v5, v11 + mov.16b v11, v17 + mov.16b v30, v2 + mov.16b v2, v8 + mov.16b v8, v14 + mov.16b v14, v30 + + mov.16b v6, v12 + mov.16b v12, v18 + mov.16b v30, v3 + mov.16b v3, v9 + mov.16b v9, v15 + mov.16b v15, v30 + + mov.16b v7, v13 + mov.16b v13, v19 + mov.16b v30, v4 + mov.16b v4, v10 + mov.16b v10, v16 + mov.16b v16, v30 + + sub x4, x4, #8 + cbnz x4, w17_hv_mc_luma_loop + + //prfm pldl1strm, [x0, x1] + ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to
v26[0] + // vertical filtered into v21/v22 + FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 + UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] + st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 +WELS_ASM_ARCH64_FUNC_END + + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon + // Centre half-sample luma filter for a 9-byte-wide block: each row is filtered vertically into 16-bit intermediates (v20/v21), then horizontally back to 8 bits + // x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight - presumably the same layout as the 32-bit _neon prototype; confirm against the AArch64 prototype + // iHeight must be 4*k plus 1: the loop emits 4 rows per pass and one extra row is produced after the loop + // Only v0-v7 and v20-v30 are touched, so no callee-saved SIMD registers need spilling + // The int32_t strides and height have unspecified bits 63:32 on entry (AAPCS64), so widen them before any 64-bit use + sxtw x1, w1 + sxtw x3, w3 + sxtw x4, w4 + sub x0, x0, #2 + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + sub x3, x3, #8 + mov x5, #8 + ldr q29, filter_para + sub x4, x4, #1 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.16b}, [x0], x1 // v5=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v6.16b}, [x0], x1 // v6=src[2*stride] + +w9_hv_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[4*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1 + // horizon filtered
+ UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x5 //write 0:7Byte : 1 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 8th Byte : 1 line + + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[5*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x5 //write 0:7Byte : 2 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 8th Byte : 2 line + + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[6*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x5 //write 0:7Byte : 3 line + UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 8th Byte : 3 line + + + // rotate the five newest rows (2..6) back into v2..v6 in order; v30 is the swap temporary + mov.16b v5, v3 + mov.16b v3, v7 + mov.16b v30, v2 + mov.16b v2, v6 + mov.16b v6, v4 + mov.16b v4, v30 + + sub x4, x4, #4 + cbnz x4, w9_hv_mc_luma_loop + + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] + // vertical filtered into v20/v21 + FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1 + // horizon filtered + UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 + FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] + st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line +
UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 + st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02Height17_AArch64_neon + // Vertical 6-tap luma filter, 16 bytes wide: six consecutive source rows are kept in v2..v7 and passed to FILTER_6TAG_8BITS1/2 (defined elsewhere) with v0 = 20 and v1 = 5 as coefficient vectors + // x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight - presumably the same layout as the 32-bit _neon prototype; confirm against the AArch64 prototype + // iHeight must be 8*k plus 1: the loop emits 8 rows per pass and one extra row is produced after the loop + // The int32_t strides and height have unspecified bits 63:32 on entry (AAPCS64), so widen them before any 64-bit use + sxtw x1, w1 + sxtw x3, w3 + sxtw x4, w4 + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + sub x4, x4, #1 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.16b}, [x0], x1 // v5=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v6.16b}, [x0], x1 // v6=src[2*stride] + + +w17_v_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[4*stride] + FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 + FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v3.16b}, [x0], x1 // v3=src[5*stride] + FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + FILTER_6TAG_8BITS2 v4, v5, v6, v7, v2, v3, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v4.16b}, [x0], x1 // v4=src[6*stride] + FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 + FILTER_6TAG_8BITS2 v5, v6, v7, v2, v3, v4, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v5.16b}, [x0], x1 // v5=src[7*stride] + FILTER_6TAG_8BITS1 v6, v7, v2, v3, v4, v5, v20, v0, v1 + FILTER_6TAG_8BITS2 v6, v7, v2, v3, v4, v5, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line + + + //prfm pldl1strm, [x0, x1] + ld1 {v6.16b}, [x0], x1 //
v6=src[8*stride] + FILTER_6TAG_8BITS1 v7, v2, v3, v4, v5, v6, v20, v0, v1 + FILTER_6TAG_8BITS2 v7, v2, v3, v4, v5, v6, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line + + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[9*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.16b}, [x0], x1 // v2=src[10*stride] + FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 + FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line + + mov.16b v3, v5 + mov.16b v5, v7 + mov.16b v7, v2 + mov.16b v2, v4 + mov.16b v4, v6 + mov.16b v6, v7 + sub x4, x4, #8 + cbnz x4, w17_v_mc_luma_loop + + //prfm pldl1strm, [x0, x1] + ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.16b}, [x2], x3 //write 16Byte : last line +WELS_ASM_ARCH64_FUNC_END + +WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02Height9_AArch64_neon + // Vertical 6-tap luma filter, 8 bytes wide: six consecutive source rows are kept in v2..v7 and passed to FILTER_6TAG_8BITS1 (defined elsewhere) with v0 = 20 and v1 = 5 as coefficient vectors + // x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight - presumably the same layout as the 32-bit _neon prototype; confirm against the AArch64 prototype + // iHeight must be 4*k plus 1: the loop emits 4 rows per pass and one extra row is produced after the loop + // The int32_t strides and height have unspecified bits 63:32 on entry (AAPCS64), so widen them before any 64-bit use + sxtw x1, w1 + sxtw x3, w3 + sxtw x4, w4 + sub x0, x0, x1, lsl #1 + movi v0.8h, #20, lsl #0 + movi v1.8h, #5, lsl #0 + sub x4, x4, #1 + + //prfm pldl1strm, [x0] + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v4.8b}, [x0], x1 // v4=src[0*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v5.8b}, [x0], x1 // v5=src[1*stride] + //prfm pldl1strm, [x0, x1] + ld1 {v6.8b}, [x0], x1 // v6=src[2*stride] + +w9_v_mc_luma_loop: + //prfm pldl1strm, [x0, x1] + ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line + + //prfm pldl1strm, [x0, x1] + ld1 {v2.8b}, [x0], x1 // v2=src[4*stride] + FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write
8Byte : 1 line + + //prfm pldl1strm, [x0, x1] + ld1 {v3.8b}, [x0], x1 // v3=src[5*stride] + FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 2 line + + //prfm pldl1strm, [x0, x1] + ld1 {v4.8b}, [x0], x1 // v4=src[6*stride] + FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line + + mov.16b v5, v3 + mov.16b v3, v7 + mov.16b v7, v2 + mov.16b v2, v6 + mov.16b v6, v4 + mov.16b v4, v7 + sub x4, x4, #4 + cbnz x4, w9_v_mc_luma_loop + + //prfm pldl1strm, [x0, x1] + ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] + FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 + st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line +WELS_ASM_ARCH64_FUNC_END + +#endif + diff --git a/codec/common/inc/mc_common.h b/codec/common/inc/mc_common.h index 399073d0..29b7ab9c 100644 --- a/codec/common/inc/mc_common.h +++ b/codec/common/inc/mc_common.h @@ -40,60 +40,171 @@ extern "C" { #endif//__cplusplus #if defined(HAVE_NEON) -void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McCopyWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McCopyWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McCopyWidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight); +void McChromaWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t* pWeights, int32_t 
iHeight); -void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight); +void McChromaWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t* pWeights, int32_t iHeight); -void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); -void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); -void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); +void PixelAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); +void PixelAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); +void PixelAvgWidthEq4_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); -void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer01WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer01WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void 
McHorVer01WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer03WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer03WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer03WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); -void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer10WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer10WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer10WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer30WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer30WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer30WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + 
int32_t iHeight); - //horizontal filter to gain half sample, that is (2, 0) location in quarter sample -void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +//horizontal filter to gain half sample, that is (2, 0) location in quarter sample +void McHorVer20WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer20WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer20WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); - //vertical filter to gain half sample, that is (0, 2) location in quarter sample -void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +//vertical filter to gain half sample, that is (0, 2) location in quarter sample +void McHorVer02WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer02WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer02WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); - //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample -void McHorVer22WidthEq16_neon(const 
uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample +void McHorVer22WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer22WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer22WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); -void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); -void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); +void PixStrideAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, + const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); +void PixStrideAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, + const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); -void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1 -void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1 +void McHorVer20Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);// width+1 +void McHorVer20Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, 
int32_t iDstStride, + int32_t iHeight);// width+1 -void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1 -void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1 +void McHorVer02Height17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);// height+1 +void McHorVer02Height9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);// height+1 -void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1 -void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1 +void McHorVer22Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);//width+1&&height+1 +void McHorVer22Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);//width+1&&height+1 +#endif + +#if defined(HAVE_NEON_AARCH64) +void McCopyWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McCopyWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McCopyWidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McChromaWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t* pWeights, int32_t iHeight); +void McChromaWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t* pWeights, int32_t iHeight); +void PixelAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* 
pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); +void PixelAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); +void PixelAvgWidthEq4_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); +void McHorVer01WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer01WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer01WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer03WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer03WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer03WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer10WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer10WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer10WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer30WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer30WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer30WidthEq4_AArch64_neon (const uint8_t* pSrc, 
int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +//horizontal filter to gain half sample, that is (2, 0) location in quarter sample +void McHorVer20WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer20WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer20WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +//vertical filter to gain half sample, that is (0, 2) location in quarter sample +void McHorVer02WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer02WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer02WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample +void McHorVer22WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer22WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer22WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void PixStrideAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, + const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); +void PixStrideAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, + const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); +void McHorVer20Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, 
uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);// width+1 +void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);// width+1 +void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);// height+1 +void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);// height+1 +void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);//width+1&&height+1 +void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight);//width+1&&height+1 #endif #if defined(X86_ASM) @@ -131,18 +242,21 @@ void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight); void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight); + int32_t iWidth, int32_t iHeight); void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); -void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, +void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight); -void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, +void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight); -void 
McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth, +void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, + int32_t iWidth, int32_t iHeight); //***************************************************************************// diff --git a/codec/common/inc/mc_common.h.orig b/codec/common/inc/mc_common.h.orig new file mode 100644 index 00000000..65dfe439 --- /dev/null +++ b/codec/common/inc/mc_common.h.orig @@ -0,0 +1,204 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef MC_COMMON_H +#define MC_COMMON_H + +#include "typedefs.h" + +#if defined(__cplusplus) +extern "C" { +#endif//__cplusplus + +#if defined(HAVE_NEON) +void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + +void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + +void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + +void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight); + +void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight); + +void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); +void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); +void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight); + +void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + +void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, 
int32_t iDstStride, int32_t iHeight); +void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + + //horizontal filter to gain half sample, that is (2, 0) location in quarter sample +void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + + //vertical filter to gain half sample, that is (0, 2) location in quarter sample +void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + + //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample +void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, 
uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + +void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); +void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); + +void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1 +void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1 + +void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1 +void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1 + +void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1 +void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1 +#endif + +#if defined(HAVE_NEON_AARCH64) +void McCopyWidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McCopyWidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McCopyWidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McChromaWidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight); +void McChromaWidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight); +void PixelAvgWidthEq16_AArch64_neon(uint8_t* pDst, 
int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); +void PixelAvgWidthEq8_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); +void PixelAvgWidthEq4_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); +void McHorVer01WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer01WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer01WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer03WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer03WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer03WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer10WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer10WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer10WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer30WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer30WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer30WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t 
iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + //horizontal filter to gain half sample, that is (2, 0) location in quarter sample +void McHorVer20WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer20WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer20WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + //vertical filter to gain half sample, that is (0, 2) location in quarter sample +void McHorVer02WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer02WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer02WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample +void McHorVer22WidthEq16_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer22WidthEq8_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void McHorVer22WidthEq4_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +void PixStrideAvgWidthEq16_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); +void PixStrideAvgWidthEq8_AArch64_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); +void McHorVer20Width17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 
int32_t iHeight);// width+1 +void McHorVer20Width9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1 +void McHorVer02Height17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1 +void McHorVer02Height9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1 +void McHorVer22Width17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1 +void McHorVer22Width9_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1 +#endif + +#if defined(X86_ASM) +//***************************************************************************// +// MMXEXT definition // +//***************************************************************************// +void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + const uint8_t* kpABCD, int32_t iHeight); +void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); +void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); + +//***************************************************************************// +// SSE2 definition // 
+//***************************************************************************// +void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + const uint8_t* kpABCD, int32_t iHeight); +void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight); +void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight); + +void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); + +void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight); + +void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight); + +void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth, + int32_t iHeight); + +//***************************************************************************// +// SSSE3 definition // +//***************************************************************************// + +void 
McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + const uint8_t* kpABCD, int32_t iHeight); + +#endif //X86_ASM + +#if defined(__cplusplus) +} +#endif//__cplusplus + +#endif//MC_COMMON_H diff --git a/codec/decoder/core/src/mc.cpp b/codec/decoder/core/src/mc.cpp index defb2ebd..e0c5d84b 100644 --- a/codec/decoder/core/src/mc.cpp +++ b/codec/decoder/core/src/mc.cpp @@ -85,13 +85,13 @@ static const uint8_t g_kuiABCD[8][8][4] = { //g_kA[dy][dx], g_kB[dy][dx], g_kC[d }; typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight); + int32_t iWidth, int32_t iHeight); //***************************************************************************// // C code implementation // //***************************************************************************// static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iHeight) { + int32_t iHeight) { int32_t i; for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma ST16A2 (pDst, LD16 (pSrc)); @@ -101,7 +101,7 @@ static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, ui } static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iHeight) { + int32_t iHeight) { int32_t i; for (i = 0; i < iHeight; i++) { ST32A4 (pDst, LD32 (pSrc)); @@ -111,7 +111,7 @@ static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, ui } static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iHeight) { + int32_t iHeight) { int32_t i; for (i = 0; i < iHeight; i++) { ST64A8 (pDst, LD64 (pSrc)); @@ -121,7 +121,7 @@ static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, ui } static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, 
uint8_t* pDst, int32_t iDstStride, - int32_t iHeight) { + int32_t iHeight) { int32_t i; for (i = 0; i < iHeight; i++) { ST64A8 (pDst , LD64 (pSrc)); @@ -138,7 +138,7 @@ static inline int32_t HorFilterInput16bit_c (int16_t* pSrc) { int32_t iPix14 = pSrc[-1] + pSrc[2]; int32_t iPix23 = pSrc[ 0] + pSrc[1]; - return (iPix05 - (iPix14 * 5)+ (iPix23 * 20)); + return (iPix05 - (iPix14 * 5) + (iPix23 * 20)); } // h: iOffset=1 / v: iOffset=iSrcStride static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) { @@ -153,7 +153,7 @@ static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const in } static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, - const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { int32_t i, j; for (i = 0; i < iHeight; i++) { for (j = 0; j < iWidth; j++) { @@ -165,7 +165,7 @@ static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* } } static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { + int32_t iHeight) { if (iWidth == 16) McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); else if (iWidth == 8) @@ -176,8 +176,9 @@ static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* p McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); } -static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { int32_t i, j; for (i = 0; i < iHeight; i++) { for (j = 0; j < iWidth; j++) { @@ -188,8 +189,9 @@ static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, 
uint8_ } } -static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { int32_t i, j; for (i = 0; i < iHeight; i++) { for (j = 0; j < iWidth; j++) { @@ -200,8 +202,9 @@ static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_ } } -static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { int16_t iTmp[16 + 5]; //16 int32_t i, j, k; @@ -218,88 +221,100 @@ static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_ } /////////////////////luma MC////////////////////////// -static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiTmp[256]; McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); } -static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiTmp[256]; McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight); } -static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* 
pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiTmp[256]; McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); } -static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiHorTmp[256]; uint8_t uiVerTmp[256]; McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); } -static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiVerTmp[256]; uint8_t uiCtrTmp[256]; McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight); } -static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiHorTmp[256]; uint8_t uiVerTmp[256]; McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, 
uiVerTmp, 16, iWidth, iHeight); } -static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiHorTmp[256]; uint8_t uiCtrTmp[256]; McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight); } -static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiHorTmp[256]; uint8_t uiCtrTmp[256]; McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight); } -static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiHorTmp[256]; McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight); } -static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiHorTmp[256]; uint8_t uiVerTmp[256]; McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, 
iWidth, iHeight); McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); } -static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiVerTmp[256]; uint8_t uiCtrTmp[256]; McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight); } -static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { uint8_t uiHorTmp[256]; uint8_t uiVerTmp[256]; McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); @@ -308,7 +323,7 @@ static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_ } void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) //pSrc has been added the offset of mv { static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] @@ -326,7 +341,7 @@ static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride int32_t i, j; int32_t iA, iB, iC, iD; const uint8_t* pSrcNext = pSrc + iSrcStride; - const uint8_t *pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07]; + const uint8_t* pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07]; iA = pABCD[0]; iB = pABCD[1]; iC = pABCD[2]; @@ -342,7 +357,7 @@ static inline void McChromaWithFragMv_c (const uint8_t* pSrc, 
int32_t iSrcStride } void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) //pSrc has been added the offset of mv { const int32_t kiD8x = iMvX & 0x07; @@ -376,8 +391,9 @@ static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcSt McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); } -static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, - int32_t iHeight) { +static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, + int32_t iHeight) { if (iWidth == 16) McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); else if (iWidth == 8) @@ -389,7 +405,7 @@ static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t } static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { if (iWidth == 16) McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); else if (iWidth == 8) @@ -399,7 +415,7 @@ static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { if (iWidth == 16) McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); else if (iWidth == 8) @@ -409,7 +425,7 @@ static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { if 
(iWidth == 16) McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); else if (iWidth == 8) @@ -419,7 +435,7 @@ static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); if (iWidth == 16) { McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); @@ -433,7 +449,7 @@ static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); if (iWidth == 16) { McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); @@ -447,7 +463,7 @@ static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); if (iWidth == 16) { McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); @@ -461,7 +477,7 @@ static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); if (iWidth == 16) { @@ -479,7 +495,7 @@ static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t 
iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); if (iWidth == 16) { @@ -497,7 +513,7 @@ static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); if (iWidth == 16) { @@ -515,7 +531,7 @@ static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); if (iWidth == 16) { @@ -533,7 +549,7 @@ static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); if (iWidth == 16) { @@ -551,7 +567,7 @@ static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); if (iWidth == 16) { McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); @@ -565,7 +581,7 @@ static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static 
inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); if (iWidth == 16) { @@ -583,7 +599,7 @@ static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); if (iWidth == 16) { @@ -601,7 +617,7 @@ static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } } static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { + int32_t iWidth, int32_t iHeight) { ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); if (iWidth == 16) { @@ -620,7 +636,7 @@ static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uin } void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) //pSrc has been added the offset of mv { static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] @@ -634,9 +650,9 @@ void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_ } void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = { - McChromaWidthEq4_mmx, + 
McChromaWidthEq4_mmx, McChromaWidthEq8_sse2 }; const int32_t kiD8x = iMvX & 0x07; @@ -656,331 +672,528 @@ void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int3 // NEON implementation // //***************************************************************************// #if defined(HAVE_NEON) -void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ +void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { if (16 == iWidth) - McCopyWidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if(8 == iWidth) - McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else - McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - } -void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ + McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (8 == iWidth) + McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else + McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { if (iWidth == 16) - McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 8) - McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 4) - McHorVer20WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); -} -void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - if (iWidth == 16) - McHorVer02WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 8) - McHorVer02WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 4) - 
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); -} -void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - if (iWidth == 16) - McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 8) - McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 4) - McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); -} - -void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - if (iWidth == 16) - McHorVer01WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); else if (iWidth == 8) - McHorVer01WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); else if (iWidth == 4) - McHorVer01WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); } -void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ +void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { if (iWidth == 16) - McHorVer03WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 8) - McHorVer03WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 4) - McHorVer03WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); -} -void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - if (iWidth == 16) - McHorVer10WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 8) - McHorVer10WidthEq8_neon(pSrc, iSrcStride, pDst, 
iDstStride, iHeight); - else if (iWidth == 4) - McHorVer10WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); -} -void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); - ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); - if (iWidth == 16) - { - McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } - else if (iWidth == 8) - { - McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } - else if (iWidth == 4) - { - McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } -} -void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); - ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); - if (iWidth == 16) - { - McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); - } - else if (iWidth == 8) - { - McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); - } - else if (iWidth == 4) - { - McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq4_neon(pDst, iDstStride, 
pVerTmp, pCtrTmp, iHeight); - } -} -void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); - ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); - if (iWidth == 16) - { - McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } - else if (iWidth == 8) - { - McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } - else if (iWidth == 4) - { - McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } -} -void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); - ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); - if (iWidth == 16) - { - McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); - } - else if (iWidth == 8) - { - McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); - } - else if (iWidth == 4) - { - McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); - } -} -void 
McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); - ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); - if (iWidth == 16) - { - McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); - } + McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); else if (iWidth == 8) - { - McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); - } - else if (iWidth == 4) - { - McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); - } + McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); } -void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ +void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { if (iWidth == 16) - McHorVer30WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 8) - McHorVer30WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if (iWidth == 4) - McHorVer30WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); -} -void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); - ENFORCE_STACK_ALIGN_1D( 
uint8_t, pVerTmp, 256, 16 ); - if (iWidth == 16) { - McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } - else if (iWidth == 8){ - McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } - else if (iWidth == 4) - { - McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } -} -void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); - ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); - if (iWidth == 16) - { - McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); - } - else if (iWidth == 8) - { - McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); - } - else if (iWidth == 4) - { - McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); - PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); - } -} -void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) -{ - ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); - ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); - if (iWidth == 16) - { - 
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } - else if (iWidth == 8) - { - McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } - else if (iWidth == 4) - { - McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); - McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); - PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); - } + McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); } -void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) -{ - static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = //[x][y] - { - {McCopy_neon, McHorVer01_neon, McHorVer02_neon, McHorVer03_neon}, - {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon}, - {McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon}, - {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon}, - }; - // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2); - pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); +void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + 
McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); } -void McChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, - int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) -{ - if (0 == iMvX && 0 == iMvY) - { - if(8 == iWidth) - McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else if(iWidth == 4) - McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - else //here iWidth == 2 - McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight); - } - else - { - const int32_t kiD8x = iMvX & 0x07; - const int32_t kiD8y = iMvY & 0x07; - if(8 == iWidth) - McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); - else if(4 == iWidth) - McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); - else //here iWidth == 2 - McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); - } +void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + 
ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } +} +void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } else if (iWidth == 8) { + McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } else if (iWidth == 4) { + McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } +} +void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, 
pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } +} +void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } +} +void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + 
McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } +} +void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_neon (pSrc + 
1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } +} +void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } else if (iWidth == 8) { + McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } else if (iWidth == 4) { + McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } +} +void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, 
iHeight); + PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } +} + +void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { + static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] + {McCopy_neon, McHorVer01_neon, McHorVer02_neon, McHorVer03_neon}, + {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon}, + {McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon}, + {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon}, + }; + // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2); + pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); +} +void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { + if (0 == iMvX && 0 == iMvY) { + if (8 == iWidth) + McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //here iWidth == 2 + McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); + } else { + const int32_t kiD8x = iMvX & 0x07; + const int32_t kiD8y = iMvY & 0x07; + if (8 == iWidth) + McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); + else if (4 == iWidth) + McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); + else //here iWidth == 2 + McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); + } } #endif +#if defined(HAVE_NEON_AARCH64) +void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (16 == iWidth) + McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (8 == iWidth) + 
McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else + McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} + +void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} 
+void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +void 
McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 4) { + McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } +} +void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + 
McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } +} +void McHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, 
pHorTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } +} +void McHorVer30_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer30WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer30WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer30WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +void McHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + 
ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 4) { + McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } +} +void McHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 4) { + McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 
16, pVerTmp, 16, iHeight); + } +} + +void McLuma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { + static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] + {McCopy_AArch64_neon, McHorVer01_AArch64_neon, McHorVer02_AArch64_neon, McHorVer03_AArch64_neon}, + {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon}, + {McHorVer20_AArch64_neon, McHorVer21_AArch64_neon, McHorVer22_AArch64_neon, McHorVer23_AArch64_neon}, + {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon}, + }; + // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2); + pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); +} +void McChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { + if (0 == iMvX && 0 == iMvY) { + if (8 == iWidth) + McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //here iWidth == 2 + McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); + } else { + const int32_t kiD8x = iMvX & 0x07; + const int32_t kiD8y = iMvY & 0x07; + if (8 == iWidth) + McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); + else if (4 == iWidth) + McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); + else //here iWidth == 2 + McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); + } +} +#endif + void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) { pMcFunc->pMcLumaFunc = McLuma_c; pMcFunc->pMcChromaFunc = McChroma_c; #ifdef HAVE_NEON - if ( iCpu & WELS_CPU_NEON ) { - 
pMcFunc->pMcLumaFunc = McLuma_neon; - pMcFunc->pMcChromaFunc = McChroma_neon; - } + if (iCpu & WELS_CPU_NEON) { + pMcFunc->pMcLumaFunc = McLuma_neon; + pMcFunc->pMcChromaFunc = McChroma_neon; + } +#endif +#ifdef HAVE_NEON_AARCH64 + if (iCpu & WELS_CPU_NEON) { + pMcFunc->pMcLumaFunc = McLuma_AArch64_neon; + pMcFunc->pMcChromaFunc = McChroma_AArch64_neon; + } #endif - #if defined (X86_ASM) if (iCpu & WELS_CPU_SSE2) { - pMcFunc->pMcLumaFunc = McLuma_sse2; - pMcFunc->pMcChromaFunc = McChroma_sse2; + pMcFunc->pMcLumaFunc = McLuma_sse2; + pMcFunc->pMcChromaFunc = McChroma_sse2; } #endif //(X86_ASM) } diff --git a/codec/decoder/core/src/mc.cpp.orig b/codec/decoder/core/src/mc.cpp.orig new file mode 100644 index 00000000..d7a13ce4 --- /dev/null +++ b/codec/decoder/core/src/mc.cpp.orig @@ -0,0 +1,1305 @@ +/*! + * \copy + * Copyright (c) 2009-2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * + * \file mc.c + * + * \brief Interfaces implementation for motion compensation + * + * \date 03/17/2009 Created + * + ************************************************************************************* + */ + +#include "mc.h" + +#include "cpu_core.h" + +namespace WelsDec { + +/*------------------weight for chroma fraction pixel interpolation------------------*/ +//iA = (8 - dx) * (8 - dy); +//iB = dx * (8 - dy); +//iC = (8 - dx) * dy; +//iD = dx * dy +static const uint8_t g_kuiABCD[8][8][4] = { //g_kA[dy][dx], g_kB[dy][dx], g_kC[dy][dx], g_kD[dy][dx] + { + {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0}, + {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0} + }, + { + {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3}, + {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7} + }, + { + {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6}, + {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14} + }, + { + {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9}, + {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21} + }, + { + {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12}, + {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28} + }, + { + {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15}, + {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35} + }, + { + {16, 0, 48, 0}, {14, 
2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18}, + {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42} + }, + { + {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21}, + {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49} + } +}; + +typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight); + +//***************************************************************************// +// C code implementation // +//***************************************************************************// +static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int32_t i; + for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma + ST16A2 (pDst, LD16 (pSrc)); + pDst += iDstStride; + pSrc += iSrcStride; + } +} + +static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int32_t i; + for (i = 0; i < iHeight; i++) { + ST32A4 (pDst, LD32 (pSrc)); + pDst += iDstStride; + pSrc += iSrcStride; + } +} + +static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int32_t i; + for (i = 0; i < iHeight; i++) { + ST64A8 (pDst, LD64 (pSrc)); + pDst += iDstStride; + pSrc += iSrcStride; + } +} + +static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int32_t i; + for (i = 0; i < iHeight; i++) { + ST64A8 (pDst , LD64 (pSrc)); + ST64A8 (pDst + 8, LD64 (pSrc + 8)); + pDst += iDstStride; + pSrc += iSrcStride; + } +} + +//--------------------Luma sample MC------------------// + +static inline int32_t HorFilterInput16bit_c (int16_t* pSrc) { + int32_t iPix05 = pSrc[-2] + pSrc[3]; + int32_t iPix14 = pSrc[-1] + pSrc[2]; + int32_t iPix23 = pSrc[ 0] + pSrc[1]; + + return (iPix05 - 
(iPix14 * 5)+ (iPix23 * 20)); +} +// h: iOffset=1 / v: iOffset=iSrcStride +static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) { + const int32_t kiOffset1 = kiOffset; + const int32_t kiOffset2 = (kiOffset << 1); + const int32_t kiOffset3 = kiOffset + kiOffset2; + const uint32_t kuiPix05 = * (pSrc - kiOffset2) + * (pSrc + kiOffset3); + const uint32_t kuiPix14 = * (pSrc - kiOffset1) + * (pSrc + kiOffset2); + const uint32_t kuiPix23 = * (pSrc) + * (pSrc + kiOffset1); + + return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2)); +} + +static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth; j++) { + pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1; + } + pDst += iDstStride; + pSrcA += iSrcAStride; + pSrcB += iSrcBStride; + } +} +static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + if (iWidth == 16) + McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //here iWidth == 2 + McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} + +static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth; j++) { + pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5); + } + pDst += iDstStride; + pSrc += iSrcStride; + } +} + +static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t 
iWidth, + int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth; j++) { + pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5); + } + pDst += iDstStride; + pSrc += iSrcStride; + } +} + +static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + int16_t iTmp[16 + 5]; //16 + int32_t i, j, k; + + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth + 5; j++) { + iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride); + } + for (k = 0; k < iWidth; k++) { + pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[2 + k]) + 512) >> 10); + } + pSrc += iSrcStride; + pDst += iDstStride; + } +} + +/////////////////////luma MC////////////////////////// +static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiTmp[256]; + McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); +} +static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiTmp[256]; + McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight); +} +static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiTmp[256]; + McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); +} +static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiHorTmp[256]; + uint8_t uiVerTmp[256]; + McHorVer20_c (pSrc, 
iSrcStride, uiHorTmp, 16, iWidth, iHeight); + McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); +} +static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiVerTmp[256]; + uint8_t uiCtrTmp[256]; + McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); + McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight); +} +static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiHorTmp[256]; + uint8_t uiVerTmp[256]; + McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); + McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); +} +static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiHorTmp[256]; + uint8_t uiCtrTmp[256]; + McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); + McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight); +} +static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiHorTmp[256]; + uint8_t uiCtrTmp[256]; + McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); + McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight); +} +static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t 
iHeight) { + uint8_t uiHorTmp[256]; + McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight); +} +static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiHorTmp[256]; + uint8_t uiVerTmp[256]; + McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); + McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); +} +static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiVerTmp[256]; + uint8_t uiCtrTmp[256]; + McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); + McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight); +} +static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + uint8_t uiHorTmp[256]; + uint8_t uiVerTmp[256]; + McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); + McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); + PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); +} + +void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) +//pSrc has been added the offset of mv +{ + static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] + {McCopy_c, McHorVer01_c, McHorVer02_c, McHorVer03_c}, + {McHorVer10_c, McHorVer11_c, McHorVer12_c, McHorVer13_c}, + {McHorVer20_c, McHorVer21_c, McHorVer22_c, McHorVer23_c}, + {McHorVer30_c, McHorVer31_c, McHorVer32_c, McHorVer33_c}, + }; + + pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] 
(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); +} + +static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { + int32_t i, j; + int32_t iA, iB, iC, iD; + const uint8_t* pSrcNext = pSrc + iSrcStride; + const uint8_t *pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07]; + iA = pABCD[0]; + iB = pABCD[1]; + iC = pABCD[2]; + iD = pABCD[3]; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth; j++) { + pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6; + } + pDst += iDstStride; + pSrc = pSrcNext; + pSrcNext += iSrcStride; + } +} + +void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) +//pSrc has been added the offset of mv +{ + const int32_t kiD8x = iMvX & 0x07; + const int32_t kiD8y = iMvY & 0x07; + if (0 == kiD8x && 0 == kiD8y) + McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); + else + McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); +} + +#if defined(X86_ASM) +//***************************************************************************// +// SSE2 implement // +//***************************************************************************// +static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16) + McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5); + McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight); +} + +static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + 
McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); +} + +static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); +} + +static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + if (iWidth == 16) + McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else + McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} + +static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else + McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} + +static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else + McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); +} + +static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + 
McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else + McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); +} + +static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); + if (iWidth == 16) { + McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); + } else { + McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); + } +} +static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); + if (iWidth == 16) { + McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + } else { + McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + } +} +static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); + } else if (iWidth == 8) { + 
McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); + } else { + McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); + } +} +static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else { + McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, 
iHeight); + } else { + McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); + McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } +} +static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else { + McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4 , iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } else { + 
McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } +} +static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } else { + McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } +} +static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); + } else { + McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); + } +} +static inline void McHorVer31_sse2 (const 
uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else { + McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); + if (iWidth == 16) { + McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } else { + McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); + McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } +} +static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, 
int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); + ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); + if (iWidth == 16) { + McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else if (iWidth == 8) { + McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } else { + McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); + PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} + +void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) +//pSrc has been added the offset of mv +{ + static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] + {McCopy_sse2, McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2}, + {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2}, + {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2}, + {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2}, + }; + + pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); +} + +void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { + static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = { + McChromaWidthEq4_mmx, + McChromaWidthEq8_sse2 + }; + const int32_t kiD8x = iMvX & 0x07; + const int32_t kiD8y = iMvY & 0x07; + if (kiD8x == 0 && kiD8y == 0) { + McCopy_sse2 (pSrc, 
iSrcStride, pDst, iDstStride, iWidth, iHeight); + return; + } + if (iWidth != 2) { + kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight); + } else + McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); +} + +#endif //X86_ASM +//***************************************************************************// +// NEON implementation // +//***************************************************************************// +#if defined(HAVE_NEON) +void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (16 == iWidth) + McCopyWidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if(8 == iWidth) + McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else + McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + } +void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer20WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer02WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer02WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer02WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + 
else if (iWidth == 8) + McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} + +void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer01WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer01WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer01WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer03WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer03WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer03WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer10WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer10WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer10WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } + else if (iWidth == 8) + { + 
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } +} +void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } + else if (iWidth == 8) + { + McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } + else if (iWidth == 4) + { + McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } +} +void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); 
+ McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } +} +void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } +} +void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 
16, iHeight); + PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); + } +} +void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer30WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer30WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer30WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + if (iWidth == 16) { + McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } + else if (iWidth == 8){ + McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } +} +void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); + if (iWidth == 16) + { + 
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } + else if (iWidth == 8) + { + McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } + else if (iWidth == 4) + { + McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); + } +} +void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight); + } +} + +void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) +{ + static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = //[x][y] + { + {McCopy_neon, McHorVer01_neon, McHorVer02_neon, McHorVer03_neon}, + {McHorVer10_neon, 
McHorVer11_neon, McHorVer12_neon, McHorVer13_neon}, + {McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon}, + {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon}, + }; + // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2); + pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); +} +void McChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) +{ + if (0 == iMvX && 0 == iMvY) + { + if(8 == iWidth) + McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if(iWidth == 4) + McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //here iWidth == 2 + McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight); + } + else + { + const int32_t kiD8x = iMvX & 0x07; + const int32_t kiD8y = iMvY & 0x07; + if(8 == iWidth) + McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + else if(4 == iWidth) + McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + else //here iWidth == 2 + McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); + } +} +#endif +#if defined(HAVE_NEON_AARCH64) +void McCopy_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (16 == iWidth) + McCopyWidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if(8 == iWidth) + McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else + McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer20_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth 
== 8) + McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer02_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer02WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer02WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer22_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} + +void McHorVer01_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer01WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer01WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer01WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer03_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer03WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer03WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer03WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, 
iDstStride, iHeight); +} +void McHorVer10_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + if (iWidth == 16) + McHorVer10WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer10WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer10WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer11_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +void McHorVer12_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + 
PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } + else if (iWidth == 8) + { + McHorVer02WidthEq8_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } + else if (iWidth == 4) + { + McHorVer02WidthEq4_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } +} +void McHorVer13_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_AArch64_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +void McHorVer21_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); + if (iWidth == 16) + { 
+ McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } +} +void McHorVer23_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); + } +} +void McHorVer30_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, 
int32_t iHeight) +{ + if (iWidth == 16) + McHorVer30WidthEq16_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8) + McHorVer30WidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4) + McHorVer30WidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer31_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + if (iWidth == 16) { + McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } + else if (iWidth == 8){ + McHorVer20WidthEq8_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_AArch64_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} +void McHorVer32_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } + else if (iWidth == 8) + { + McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, 
pVerTmp, 16, iHeight); + McHorVer22WidthEq8_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } + else if (iWidth == 4) + { + McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + McHorVer22WidthEq4_AArch64_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); + } +} +void McHorVer33_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) +{ + ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 ); + ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 ); + if (iWidth == 16) + { + McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } + else if (iWidth == 8) + { + McHorVer20WidthEq8_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq8_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq8_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } + else if (iWidth == 4) + { + McHorVer20WidthEq4_AArch64_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight); + McHorVer02WidthEq4_AArch64_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight); + PixelAvgWidthEq4_AArch64_neon(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); + } +} + +void McLuma_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) +{ + static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = //[x][y] + { + {McCopy_AArch64_neon, McHorVer01_AArch64_neon, McHorVer02_AArch64_neon, McHorVer03_AArch64_neon}, + {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon}, 
+ {McHorVer20_AArch64_neon, McHorVer21_AArch64_neon, McHorVer22_AArch64_neon, McHorVer23_AArch64_neon}, + {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon}, + }; + // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2); + pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); +} +void McChroma_AArch64_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, + int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) +{ + if (0 == iMvX && 0 == iMvY) + { + if(8 == iWidth) + McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if(iWidth == 4) + McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //here iWidth == 2 + McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight); + } + else + { + const int32_t kiD8x = iMvX & 0x07; + const int32_t kiD8y = iMvY & 0x07; + if(8 == iWidth) + McChromaWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + else if(4 == iWidth) + McChromaWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + else //here iWidth == 2 + McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); + } +} +#endif + +void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) { + pMcFunc->pMcLumaFunc = McLuma_c; + pMcFunc->pMcChromaFunc = McChroma_c; + +#ifdef HAVE_NEON + if ( iCpu & WELS_CPU_NEON ) { + pMcFunc->pMcLumaFunc = McLuma_neon; + pMcFunc->pMcChromaFunc = McChroma_neon; + } +#endif +#ifdef HAVE_NEON_AARCH64 + if ( iCpu & WELS_CPU_NEON ) { + pMcFunc->pMcLumaFunc = McLuma_AArch64_neon; + pMcFunc->pMcChromaFunc = McChroma_AArch64_neon; + } +#endif +#if defined (X86_ASM) + if (iCpu & WELS_CPU_SSE2) { + pMcFunc->pMcLumaFunc = McLuma_sse2; + pMcFunc->pMcChromaFunc = McChroma_sse2; + } +#endif //(X86_ASM) +} + +} // namespace WelsDec diff --git a/codec/encoder/core/src/mc.cpp 
b/codec/encoder/core/src/mc.cpp index ca6ba97e..4c95b491 100644 --- a/codec/encoder/core/src/mc.cpp +++ b/codec/encoder/core/src/mc.cpp @@ -89,8 +89,10 @@ VerFilterFunc fpVerFilter = NULL; HorFilterFunc fpHorFilter = NULL; HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL; -typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); -typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride, const uint8_t* pSrcB, +typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight); +typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); WelsMcFunc0 McCopyWidthEq16 = NULL; WelsMcFunc0 McCopyWidthEq8 = NULL; @@ -323,7 +325,8 @@ static inline void McHorVer33WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); } -static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, +static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { int32_t i, j; for (i = 0; i < iHeight; i++) { @@ -335,7 +338,8 @@ static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_ } } //vertical filter to gain half sample, that is (0, 2) location in quarter sample -static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, +static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { int32_t i, j; for (i = 0; i < iHeight; i++) { @@ -347,7 +351,8 @@ static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, 
uint8_ } } //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample -static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, +static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { int16_t pTmp[17 + 5] = {0}; //w+1 int32_t i, j, k; @@ -481,94 +486,190 @@ void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int #endif //X86_ASM - //***************************************************************************// - // NEON implementation // - //***************************************************************************// +//***************************************************************************// +// NEON implementation // +//***************************************************************************// #if defined(HAVE_NEON) -void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight) { +void McHorVer20Width9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { if (iWidth == 17) - McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); else //if (iWidth == 9) - McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); } -void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight){ +void McHorVer02Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { if (iWidth == 16) - McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer02Height17_neon (pSrc, 
iSrcStride, pDst, iDstStride, iHeight); else //if (iWidth == 8) - McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); } -void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, - int32_t iWidth, int32_t iHeight){ +void McHorVer22Width9Or17Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { if (iWidth == 17) - McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); else //if (iWidth == 9) - McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); } -void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { - ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) - McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); - McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +void EncMcHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); } -void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { - ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) - McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); - McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +void 
EncMcHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer02WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); } -void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { - ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) - McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); - McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +void EncMcHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); } -void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { - ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) - McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); - McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +void EncMcHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); } -void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, 
int32_t iDstStride, int32_t iHeight) { - ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) - McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); - McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +void EncMcHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); } -void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { - ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) - McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); - McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +void EncMcHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); } -void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { - ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) - McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight); - McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +void EncMcHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D 
(uint8_t, pTmp, 512, 16) + McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); } -void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { - ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) - McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); - McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight); - PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +void EncMcHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight); } -void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, - SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { - const int32_t kiD8x = sMv.iMvX&0x07; - const int32_t kiD8y = sMv.iMvY&0x07; +void EncMcChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { + const int32_t kiD8x = sMv.iMvX & 0x07; + const int32_t kiD8y = sMv.iMvY & 0x07; if (0 == kiD8x && 0 == kiD8y) { - if(8 == iWidth) - McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + if (8 == iWidth) + McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); else // iWidth == 4 - McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); - } - else { - if(8 == iWidth) - McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + } 
else { + if (8 == iWidth) + McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); else //if(4 == iWidth) - McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); + } +} +#endif + +#if defined(HAVE_NEON_AARCH64) +void McHorVer20Width9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 17) + McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 9) + McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer02Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 16) + McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 8) + McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer22Width9Or17Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, + int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 17) + McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 9) + McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void EncMcHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer12_AArch64_neon (const uint8_t* 
pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, 
&pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { + const int32_t kiD8x = sMv.iMvX & 0x07; + const int32_t kiD8y = sMv.iMvY & 0x07; + if (0 == kiD8x && 0 == kiD8y) { + if (8 == iWidth) + McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else // iWidth == 4 + McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); + } else { + if (8 == iWidth) + McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); + else //if(4 == iWidth) + McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); } } #endif @@ -599,7 +700,14 @@ void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { McHorVer03WidthEq16_neon, EncMcHorVer13_neon, EncMcHorVer23_neon, EncMcHorVer33_neon }; #endif - +#if defined(HAVE_NEON_AARCH64) + static 
PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_AArch64_neon[16] = { //[x][y] + McCopyWidthEq16_AArch64_neon, McHorVer10WidthEq16_AArch64_neon, McHorVer20WidthEq16_AArch64_neon, McHorVer30WidthEq16_AArch64_neon, + McHorVer01WidthEq16_AArch64_neon, EncMcHorVer11_AArch64_neon, EncMcHorVer21_AArch64_neon, EncMcHorVer31_AArch64_neon, + McHorVer02WidthEq16_AArch64_neon, EncMcHorVer12_AArch64_neon, McHorVer22WidthEq16_AArch64_neon, EncMcHorVer32_AArch64_neon, + McHorVer03WidthEq16_AArch64_neon, EncMcHorVer13_AArch64_neon, EncMcHorVer23_AArch64_neon, EncMcHorVer33_AArch64_neon + }; +#endif pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c; pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c; pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c; @@ -651,5 +759,16 @@ void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1 } #endif +#if defined(HAVE_NEON_AARCH64) + if (uiCpuFlag & WELS_CPU_NEON) { + pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_AArch64_neon; + pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_AArch64_neon; + pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_AArch64_neon; + pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_AArch64_neon; + pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16 + pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//height+1:8/16 + pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/height+1 + } +#endif } } diff --git a/codec/encoder/core/src/mc.cpp.orig b/codec/encoder/core/src/mc.cpp.orig new file mode 100644 index 00000000..5f8a1f45 --- /dev/null +++ b/codec/encoder/core/src/mc.cpp.orig @@ -0,0 +1,762 @@ +/*! + * \copy + * Copyright (c) 2009-2013, Cisco Systems + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * + * \file mc.c + * + * \brief Interfaces implementation for motion compensation + * + * \date 03/17/2009 Created + * + ************************************************************************************* + */ + +#include "mc.h" +#include "cpu_core.h" + +namespace WelsSVCEnc { +/*------------------weight for chroma fraction pixel interpolation------------------*/ +//kuiA = (8 - dx) * (8 - dy); +//kuiB = dx * (8 - dy); +//kuiC = (8 - dx) * dy; +//kuiD = dx * dy +static const uint8_t g_kuiABCD[8][8][4] = { ////g_kuiA[dy][dx], g_kuiB[dy][dx], g_kuiC[dy][dx], g_kuiD[dy][dx] + { + {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0}, + {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0} + }, + { + {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3}, + {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7} + }, + { + {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6}, + {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14} + }, + { + {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9}, + {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21} + }, + { + {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12}, + {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28} + }, + { + {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15}, + {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35} + }, + { + {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18}, + {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42} + }, + { + {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21}, + {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49} + } +}; +typedef int32_t (*VerFilterFunc) (const uint8_t* pSrc, const int32_t kiSrcStride); +typedef int32_t (*HorFilterFunc) (const uint8_t* pSrc); +typedef int32_t (*HorFilterFuncInput16Bits) (int16_t* pSrc); + +VerFilterFunc fpVerFilter = NULL; +HorFilterFunc fpHorFilter = NULL; 
+HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL; + +typedef void (*WelsMcFunc0) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); +typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, const uint8_t* psrcA, int32_t iSrcAStride, const uint8_t* pSrcB, + int32_t iSrcBStride, int32_t iHeight); +WelsMcFunc0 McCopyWidthEq16 = NULL; +WelsMcFunc0 McCopyWidthEq8 = NULL; +WelsMcFunc0 McCopyWidthEq4 = NULL; +WelsMcFunc0 pfMcHorVer02WidthEq16 = NULL; +WelsMcFunc1 pfPixelAvgWidthEq16 = NULL; +WelsMcFunc0 pfMcHorVer20WidthEq16 = NULL; +WelsMcFunc0 pfMcHorVer22WidthEq16 = NULL; + +//***************************************************************************// +// C code implementation // +//***************************************************************************// +static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int32_t i; + for (i = 0; i < iHeight; i++) { + memcpy (pDst, pSrc, 4); // confirmed_safe_unsafe_usage + pDst += iDstStride; + pSrc += iSrcStride; + } +} + +static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) + +{ + int32_t i; + for (i = 0; i < iHeight; i++) { + memcpy (pDst, pSrc, 8); // confirmed_safe_unsafe_usage + pDst += iDstStride; + pSrc += iSrcStride; + } +} +static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int32_t i; + for (i = 0; i < iHeight; i++) { + memcpy (pDst, pSrc, 16); // confirmed_safe_unsafe_usage + pDst += iDstStride; + pSrc += iSrcStride; + } +} + +//--------------------Luma sample MC------------------// +static inline int32_t HorFilter_c (const uint8_t* pSrc) { + int32_t iPix05 = pSrc[-2] + pSrc[3]; + int32_t iPix14 = pSrc[-1] + pSrc[2]; + int32_t iPix23 = pSrc[ 0] + pSrc[1]; + + return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 
4) + (iPix23 << 2)); +} + +static inline int32_t HorFilterInput16bit1_c (int16_t* pSrc) { + int32_t iPix05 = pSrc[-2] + pSrc[3]; + int32_t iPix14 = pSrc[-1] + pSrc[2]; + int32_t iPix23 = pSrc[ 0] + pSrc[1]; + + return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2)); +} +static inline int32_t VerFilter_c (const uint8_t* pSrc, const int32_t kiSrcStride) { + const int32_t kiLine1 = kiSrcStride; + const int32_t kiLine2 = (kiSrcStride << 1); + const int32_t kiLine3 = kiLine1 + kiLine2; + const uint32_t kuiPix05 = * (pSrc - kiLine2) + * (pSrc + kiLine3); + const uint32_t kuiPix14 = * (pSrc - kiLine1) + * (pSrc + kiLine2); + const uint32_t kuiPix23 = * (pSrc) + * (pSrc + kiLine1); + + return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2)); +} + +static inline void PixelAvgWidthEq8_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < 8; j++) { + pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1; + } + pDst += iDstStride; + pSrcA += iSrcAStride; + pSrcB += iSrcBStride; + } +} +static inline void PixelAvgWidthEq16_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, + const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < 16; j++) { + pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1; + } + pDst += iDstStride; + pSrcA += iSrcAStride; + pSrcB += iSrcBStride; + } +} + +//horizontal filter to gain half sample, that is (2, 0) location in quarter sample +static inline void McHorVer20WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < 16; j++) { + pDst[j] = WelsClip1 ((fpHorFilter (pSrc + j) + 16) >> 5); + } + pDst += iDstStride; + pSrc += iSrcStride; + } +} 
+//vertical filter to gain half sample, that is (0, 2) location in quarter sample +static inline void McHorVer02WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < 16; j++) { + pDst[j] = WelsClip1 ((fpVerFilter (pSrc + j, iSrcStride) + 16) >> 5); + } + pDst += iDstStride; + pSrc += iSrcStride; + } +} +//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample +static inline void McHorVer22WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + int16_t pTmp[16 + 5] = {0}; //16 + int32_t i, j, k; + + for (i = 0; i < iHeight; i++) { + for (j = 0; j < 16 + 5; j++) { + pTmp[j] = fpVerFilter (pSrc - 2 + j, iSrcStride); + } + for (k = 0; k < 16; k++) { + pDst[k] = WelsClip1 ((fpHorFilterInput16Bits (&pTmp[2 + k]) + 512) >> 10); + } + pSrc += iSrcStride; + pDst += iDstStride; + } +} + +/////////////////////luma MC////////////////////////// + +static inline void McHorVer01WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16) + + pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); +} +static inline void McHorVer03WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16) + + pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); +} +static inline void McHorVer10WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16) + + pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, 
iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); +} +static inline void McHorVer11WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + + pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); + pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +static inline void McHorVer12WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + + pfMcHorVer02WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); + pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +static inline void McHorVer13WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + + pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + pfMcHorVer02WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +static inline void McHorVer21WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + + pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); + pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +static inline void McHorVer23WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + + pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + 
pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +static inline void McHorVer30WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16) + + pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight); +} +static inline void McHorVer31WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + + pfMcHorVer20WidthEq16 (pSrc, iSrcStride, pTmp, 16, iHeight); + pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +static inline void McHorVer32WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + + pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, pTmp, 16, iHeight); + pfMcHorVer22WidthEq16 (pSrc, iSrcStride, &pTmp[256], 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +static inline void McHorVer33WidthEq16 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16) + + pfMcHorVer20WidthEq16 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); + pfMcHorVer02WidthEq16 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight); + pfPixelAvgWidthEq16 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} + +static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth; j++) { + pDst[j] = WelsClip1 ((fpHorFilter (pSrc + j) + 
16) >> 5); + } + pDst += iDstStride; + pSrc += iSrcStride; + } +} +//vertical filter to gain half sample, that is (0, 2) location in quarter sample +static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + int32_t i, j; + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth; j++) { + pDst[j] = WelsClip1 ((fpVerFilter (pSrc + j, iSrcStride) + 16) >> 5); + } + pDst += iDstStride; + pSrc += iSrcStride; + } +} +//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample +static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + int16_t pTmp[17 + 5] = {0}; //w+1 + int32_t i, j, k; + + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth + 5; j++) { + pTmp[j] = fpVerFilter (pSrc - 2 + j, iSrcStride); + } + for (k = 0; k < iWidth; k++) { + pDst[k] = WelsClip1 ((fpHorFilterInput16Bits (&pTmp[2 + k]) + 512) >> 10); + } + pSrc += iSrcStride; + pDst += iDstStride; + } +} +static inline void McCopy (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, + int32_t iHeight) { + int32_t i; + if (iWidth == 16 && McCopyWidthEq16 != NULL) + McCopyWidthEq16 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 8 && McCopyWidthEq8 != NULL) + McCopyWidthEq8 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else if (iWidth == 4 && McCopyWidthEq4 != NULL) + McCopyWidthEq4 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + else { + for (i = 0; i < iHeight; i++) { + memcpy (pDst, pSrc, iWidth); // confirmed_safe_unsafe_usage + pDst += iDstStride; + pSrc += iSrcStride; + } + } +} + +void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + SMVUnitXY mv, int32_t iWidth, int32_t iHeight) +//pSrc has been added the offset of mv +{ + const int32_t kiDx = mv.iMvX & 0x07; + const int32_t kiDy 
= mv.iMvY & 0x07; + + if (0 == kiDx && 0 == kiDy) { + McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); + } else { + const int32_t kiDA = g_kuiABCD[kiDy][kiDx][0]; + const int32_t kiDB = g_kuiABCD[kiDy][kiDx][1]; + const int32_t kiDC = g_kuiABCD[kiDy][kiDx][2]; + const int32_t kiDD = g_kuiABCD[kiDy][kiDx][3]; + + int32_t i, j; + + const uint8_t* pSrcNext = pSrc + iSrcStride; + + for (i = 0; i < iHeight; i++) { + for (j = 0; j < iWidth; j++) { + pDst[j] = (kiDA * pSrc[j] + kiDB * pSrc[j + 1] + kiDC * pSrcNext[j] + kiDD * pSrcNext[j + 1] + 32) >> 6; + } + pDst += iDstStride; + pSrc = pSrcNext; + pSrcNext += iSrcStride; + } + } +} +//***************************************************************************// +// MMXEXT and SSE2 implementation // +//***************************************************************************// +#if defined(X86_ASM) + +static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 21, 8, 16) + McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 16, iHeight + 5); + McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight); +} + +//2010.2.5 + +static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* PDst, int32_t iDstStride, + int32_t iHeight) { + McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, PDst, iDstStride, iHeight); + McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &PDst[8], iDstStride, iHeight); +} +static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iHeight) { + McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); + McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); +} +void McHorVer22Width9Or17Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, 
+ int32_t iHeight) { + ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16) + int32_t tmp1 = 2 * (iWidth - 8); + McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5); + McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight); + McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8, iDstStride, 8, iHeight); +} + +typedef void (*McChromaWidthEqx) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + const uint8_t* pABCD, int32_t iHeigh); +void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { + const int32_t kiD8x = sMv.iMvX & 0x07; + const int32_t kiD8y = sMv.iMvY & 0x07; + static const McChromaWidthEqx kpfFuncs[2] = { + McChromaWidthEq4_mmx, + McChromaWidthEq8_sse2 + }; + + if (0 == kiD8x && 0 == kiD8y) { + McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); + } else { + kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight); + } +} + +void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { + const int32_t kiD8x = sMv.iMvX & 0x07; + const int32_t kiD8y = sMv.iMvY & 0x07; + + static const McChromaWidthEqx kpfFuncs[2] = { + McChromaWidthEq4_mmx, + McChromaWidthEq8_ssse3 + }; + if (0 == kiD8x && 0 == kiD8y) { + McCopy (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); + } else { + kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight); + } + +} + +#endif //X86_ASM + + //***************************************************************************// + // NEON implementation // + //***************************************************************************// +#if defined(HAVE_NEON) +void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, 
int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 17) + McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 9) + McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight){ + if (iWidth == 16) + McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 8) + McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight){ + if (iWidth == 17) + McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 9) + McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +} +void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +} +void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, 
iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +} +void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +} +void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +} +void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +} +void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight); +} +void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], 
iHeight); +} +void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, + SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { + const int32_t kiD8x = sMv.iMvX&0x07; + const int32_t kiD8y = sMv.iMvY&0x07; + if (0 == kiD8x && 0 == kiD8y) { + if(8 == iWidth) + McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else // iWidth == 4 + McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + } + else { + if(8 == iWidth) + McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + else //if(4 == iWidth) + McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + } +} +#endif + +#if defined(HAVE_NEON_AARCH64) +void McHorVer20Width9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight) { + if (iWidth == 17) + McHorVer20Width17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 9) + McHorVer20Width9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer02Height9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight){ + if (iWidth == 16) + McHorVer02Height17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 8) + McHorVer02Height9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void McHorVer22Width9Or17Height9Or17_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, + int32_t iWidth, int32_t iHeight){ + if (iWidth == 17) + McHorVer22Width17_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else //if (iWidth == 9) + McHorVer22Width9_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); +} +void EncMcHorVer11_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + 
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16,iHeight); +} +void EncMcHorVer12_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer13_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer21_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer23_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void 
EncMcHorVer31_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon(pSrc, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer32_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight); + McHorVer22WidthEq16_AArch64_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcHorVer33_AArch64_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { + ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) + McHorVer20WidthEq16_AArch64_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight); + McHorVer02WidthEq16_AArch64_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight); + PixelAvgWidthEq16_AArch64_neon(pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight); +} +void EncMcChroma_AArch64_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, + SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) { + const int32_t kiD8x = sMv.iMvX&0x07; + const int32_t kiD8y = sMv.iMvY&0x07; + if (0 == kiD8x && 0 == kiD8y) { + if(8 == iWidth) + McCopyWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + else // iWidth == 4 + McCopyWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight); + } + else { + if(8 == iWidth) + McChromaWidthEq8_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + else //if(4 == iWidth) + McChromaWidthEq4_AArch64_neon(pSrc, iSrcStride, pDst, iDstStride, 
(int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight); + } +} +#endif + +typedef void (*PixelAvgFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t, int32_t); +void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) { + static PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c}; + + static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] = { //[y*4+x] + McCopyWidthEq16_c, McHorVer10WidthEq16, McHorVer20WidthEq16_c, McHorVer30WidthEq16, + McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16, + McHorVer02WidthEq16_c, McHorVer12WidthEq16, McHorVer22WidthEq16_c, McHorVer32WidthEq16, + McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16 + }; +#if defined (X86_ASM) + static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = { + McCopyWidthEq16_sse2, McHorVer10WidthEq16, McHorVer20WidthEq16_sse2, McHorVer30WidthEq16, + McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16, + McHorVer02WidthEq16_sse2, McHorVer12WidthEq16, McHorVer22WidthEq16_sse2, McHorVer32WidthEq16, + McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16 + }; +#endif +#if defined(HAVE_NEON) + static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_neon[16] = { //[x][y] + McCopyWidthEq16_neon, McHorVer10WidthEq16_neon, McHorVer20WidthEq16_neon, McHorVer30WidthEq16_neon, + McHorVer01WidthEq16_neon, EncMcHorVer11_neon, EncMcHorVer21_neon, EncMcHorVer31_neon, + McHorVer02WidthEq16_neon, EncMcHorVer12_neon, McHorVer22WidthEq16_neon, EncMcHorVer32_neon, + McHorVer03WidthEq16_neon, EncMcHorVer13_neon, EncMcHorVer23_neon, EncMcHorVer33_neon + }; +#endif +#if defined(HAVE_NEON_AARCH64) + static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_AArch64_neon[16] = { //[x][y] + McCopyWidthEq16_AArch64_neon, McHorVer10WidthEq16_AArch64_neon, McHorVer20WidthEq16_AArch64_neon, McHorVer30WidthEq16_AArch64_neon, + McHorVer01WidthEq16_AArch64_neon, 
EncMcHorVer11_AArch64_neon, EncMcHorVer21_AArch64_neon, EncMcHorVer31_AArch64_neon, + McHorVer02WidthEq16_AArch64_neon, EncMcHorVer12_AArch64_neon, McHorVer22WidthEq16_AArch64_neon, EncMcHorVer32_AArch64_neon, + McHorVer03WidthEq16_AArch64_neon, EncMcHorVer13_AArch64_neon, EncMcHorVer23_AArch64_neon, EncMcHorVer33_AArch64_neon + }; +#endif + pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c; + pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c; + pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c; + pFuncList->sMcFuncs.pfSampleAveraging = pfPixAvgFunc; + pFuncList->sMcFuncs.pfChromaMc = McChroma_c; + fpVerFilter = VerFilter_c; + fpHorFilter = HorFilter_c; + fpHorFilterInput16Bits = HorFilterInput16bit1_c; + McCopyWidthEq4 = McCopyWidthEq4_c; + McCopyWidthEq8 = McCopyWidthEq8_c; + McCopyWidthEq16 = McCopyWidthEq16_c; + pfPixelAvgWidthEq16 = PixelAvgWidthEq16_c; + pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_c; + pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_c; + pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_c; + pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16; +#if defined (X86_ASM) + if (uiCpuFlag & WELS_CPU_SSE2) { + pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_sse2; + pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_sse2; + pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_sse2; + pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx; + pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2; + pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2; + McCopyWidthEq4 = McCopyWidthEq4_mmx; + McCopyWidthEq8 = McCopyWidthEq8_mmx; + McCopyWidthEq16 = McCopyWidthEq16_sse2; + pfPixelAvgWidthEq16 = PixelAvgWidthEq16_sse2; + pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_sse2; + pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_sse2; + pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_sse2; + pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_sse2; + } + + if (uiCpuFlag & WELS_CPU_SSSE3) { + 
pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3; + } + +#endif //(X86_ASM) + +#if defined(HAVE_NEON) + if (uiCpuFlag & WELS_CPU_NEON) { + pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_neon; + pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_neon; + pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_neon; + pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_neon; + pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16 + pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//heigh+1:8/16 + pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1 + } +#endif +#if defined(HAVE_NEON_AARCH64) + if (uiCpuFlag & WELS_CPU_NEON) { + pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_AArch64_neon; + pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_AArch64_neon; + pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_AArch64_neon; + pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_AArch64_neon; + pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16 + pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//heigh+1:8/16 + pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/heigh+1 + } +#endif +} +}