From 71467f948ae79fb5ea2ffe48ca07cf72a36d45a3 Mon Sep 17 00:00:00 2001 From: Licai Guo Date: Fri, 7 Mar 2014 12:18:58 +0800 Subject: [PATCH] mv mc_neon.S to common,add MC arm code to encoder --- .../common/common.xcodeproj/project.pbxproj | 4 + .../welsdec/welsdec.xcodeproj/project.pbxproj | 4 - .../welsenc/welsenc.xcodeproj/project.pbxproj | 4 - codec/common/mc_common.h | 12 + codec/{decoder/core/arm => common}/mc_neon.S | 598 ++++++++++++++++++ codec/encoder/core/src/mc.cpp | 113 ++++ 6 files changed, 727 insertions(+), 8 deletions(-) rename codec/{decoder/core/arm => common}/mc_neon.S (70%) diff --git a/codec/build/iOS/common/common.xcodeproj/project.pbxproj b/codec/build/iOS/common/common.xcodeproj/project.pbxproj index d0583a98..4f52a24a 100644 --- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj +++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj @@ -8,6 +8,7 @@ /* Begin PBXBuildFile section */ 4C34067D18C5C94C00DFA14A /* expand_picture_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */; }; + 4C34069B18C96CB000DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34069A18C96CB000DFA14A /* mc_neon.S */; }; 4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; }; 4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; }; 4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; }; @@ -48,6 +49,7 @@ /* Begin PBXFileReference section */ 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture_neon.S; sourceTree = ""; }; + 4C34069A18C96CB000DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 
4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = ""; }; 4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; }; 4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -150,6 +152,7 @@ 4CE4472F18BC61650017DF25 /* common */ = { isa = PBXGroup; children = ( + 4C34069A18C96CB000DFA14A /* mc_neon.S */, 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */, 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */, 4CE447BC18C085320017DF25 /* deblocking_neon.S */, @@ -256,6 +259,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 4C34069B18C96CB000DFA14A /* mc_neon.S in Sources */, 4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */, 4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */, 4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */, diff --git a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj index 31e8a818..2f72d4fc 100644 --- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj +++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj @@ -38,7 +38,6 @@ 4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; }; 4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; }; 4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; 
fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; }; - 4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -132,7 +131,6 @@ 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = ""; }; 4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = ""; }; 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = ""; }; - 4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -325,7 +323,6 @@ children = ( 4CE447A718BC6BE90017DF25 /* block_add_neon.S */, 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */, - 4CE447AA18BC6BE90017DF25 /* mc_neon.S */, ); path = arm; sourceTree = ""; @@ -411,7 +408,6 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - 4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */, 4CE4469B18BC5EAB0017DF25 /* pic_queue.cpp in Sources */, 4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */, 4CE4469318BC5EAB0017DF25 /* fmo.cpp in Sources */, diff --git a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj index f2949d68..c2046f7a 100644 --- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj +++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj @@ -9,7 +9,6 @@ /* Begin PBXBuildFile section */ 4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = 
{isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; }; 4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; }; - 4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066818C57D0400DFA14A /* mc_neon.S */; }; 4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; }; 4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; }; 4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; }; @@ -79,7 +78,6 @@ /* Begin PBXFileReference section */ 4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = ""; }; 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = ""; }; - 4C34066818C57D0400DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = ""; }; 4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = ""; }; 4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = ""; }; 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = ""; }; @@ -206,7 +204,6 @@ children = ( 4C34066618C57D0400DFA14A /* intra_pred_neon.S */, 
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */, - 4C34066818C57D0400DFA14A /* mc_neon.S */, 4C34066918C57D0400DFA14A /* memory_neon.S */, 4C34066A18C57D0400DFA14A /* pixel_neon.S */, 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */, @@ -492,7 +489,6 @@ 4CE4472A18BC605C0017DF25 /* utils.cpp in Sources */, 4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */, 4CE4472018BC605C0017DF25 /* sample.cpp in Sources */, - 4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */, 4CE4472D18BC605C0017DF25 /* welsCodecTrace.cpp in Sources */, 4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */, 4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */, diff --git a/codec/common/mc_common.h b/codec/common/mc_common.h index e0c5f168..399073d0 100644 --- a/codec/common/mc_common.h +++ b/codec/common/mc_common.h @@ -82,6 +82,18 @@ void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* p void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight); + +void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); +void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight); + +void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1 +void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1 + +void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* 
pDst, int32_t iDstStride, int32_t iHeight);// height+1 +void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1 + +void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1 +void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1 #endif #if defined(X86_ASM) diff --git a/codec/decoder/core/arm/mc_neon.S b/codec/common/mc_neon.S similarity index 70% rename from codec/decoder/core/arm/mc_neon.S rename to codec/common/mc_neon.S index 28e4b979..a0bfcfb3 100644 --- a/codec/decoder/core/arm/mc_neon.S +++ b/codec/common/mc_neon.S @@ -53,6 +53,17 @@ // } .endm +.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}, + vrev64.8 $2, $0 // X[5][4][3][2][1][0]O + vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]* + vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32] + vpadd.s16 $0, $0, $0 + vpadd.s16 $0, $0, $0 + vqrshrun.s16 $0, $4, #5 +// } +.endm + .macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3] @@ -115,6 +126,24 @@ vadd.s16 $2, $0 //a=src[-2]+src[3] // } .endm + +.macro UNPACK_1_IN_8x16BITS_TO_8BITS +// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) + vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5], + vrev64.16 $1, $1 + vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5], + vshr.s64 $1, $2, #16 + vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0 + + vsub.s16 $0, $0, $1 //a-b + vshr.s16 $0, $0, #2 //(a-b)/4 + vsub.s16 $0, $0, $1 //(a-b)/4-b + vadd.s16 $0, $0, $2 //(a-b)/4-b+c + vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4 + vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 $0, $3, #6 //(+32)>>6 
+// } +.endm #else .macro AVERAGE_TWO_8BITS arg0, arg1, arg2 // { // input:dst_d, src_d A and B; working: q13 @@ -134,6 +163,17 @@ // } .endm +.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used +// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2} + vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O + vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]* + vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32] + vpadd.s16 \arg0, \arg0, \arg0 + vpadd.s16 \arg0, \arg0, \arg0 + vqrshrun.s16 \arg0, \arg4, #5 +// } +.endm + .macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13 vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3] @@ -196,6 +236,24 @@ vadd.s16 \arg2, \arg0 //a=src[-2]+src[3] // } .endm + +.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3 +// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd) + vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5] + vrev64.16 \arg1, \arg1 + vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5] + vshr.s64 \arg1, \arg2, #16 + vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0 + + vsub.s16 \arg0, \arg0, \arg1 //a-b + vshr.s16 \arg0, \arg0, #2 //(a-b)/4 + vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b + vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c + vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4 + vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6 +// } +.endm #endif WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon @@ -1599,4 +1657,544 @@ w4_mc_chroma_loop: // each two pxl row pop {r4, r5, r6} WELS_ASM_FUNC_END + +WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon + push {r4-r5} + mov r4, #20 + mov r5, #1 + sub r4, r4, r4, lsl #(16-2) + lsl r5, #16 + ror r4, #16 + vmov d3, r5, r4 // 0x0014FFFB00010000 + + sub r3, #16 + ldr r4, [sp, #8] + + sub r0, #2 
+ vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w17_h_mc_luma_loop: + vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2] + + vext.8 q2, q0, q1, #1 //q2=src[-1] + vext.8 q3, q0, q1, #2 //q3=src[0] + vext.8 q4, q0, q1, #3 //q4=src[1] + vext.8 q5, q0, q1, #4 //q5=src[2] + vext.8 q6, q0, q1, #5 //q6=src[3] + + FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d14, q14, q15 + + FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d15, q14, q15 + + vst1.u8 {d14, d15}, [r2]! //write [0:15] Byte + + vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS d2, d3, d14, q7, q1 + + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + sub r4, #1 + cmp r4, #0 + bne w17_h_mc_luma_loop + pop {r4-r5} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon + push {r4-r5} + mov r4, #20 + mov r5, #1 + sub r4, r4, r4, lsl #(16-2) + lsl r5, #16 + ror r4, #16 + vmov d7, r5, r4 // 0x0014FFFB00010000 + + sub r3, #8 + ldr r4, [sp, #8] + + sub r0, #2 + vmov.u16 q14, #0x0014 // 20 + vshr.u16 q15, q14, #2 // 5 + +w9_h_mc_luma_loop: + vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2] + pld [r0] + + vext.8 d2, d0, d1, #1 //d2=src[-1] + vext.8 d3, d0, d1, #2 //d3=src[0] + vext.8 d4, d0, d1, #3 //d4=src[1] + vext.8 d5, d0, d1, #4 //d5=src[2] + vext.8 d6, d0, d1, #5 //d6=src[3] + + FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d8, q14, q15 + + sub r4, #1 + vst1.u8 {d8}, [r2]! 
//write [0:7] Byte + + vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X + FILTER_SINGLE_TAG_8BITS d2, d7, d14, q7, q1 + vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte + + cmp r4, #0 + bne w9_h_mc_luma_loop + pop {r4-r5} +WELS_ASM_FUNC_END + + + WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //q0=src[-2] + vld1.u8 {q1}, [r0], r1 //q1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {q2}, [r0], r1 //q2=src[0] + vld1.u8 {q3}, [r0], r1 //q3=src[1] + vld1.u8 {q4}, [r0], r1 //q4=src[2] + +w17_v_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //q5=src[3] + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 2nd row + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vld1.u8 {q1}, [r0], r1 //read 3rd row + vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte + + FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15 + vld1.u8 {q2}, [r0], r1 //read 4th row + vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte + + FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15 + vld1.u8 {q3}, [r0], r1 //read 5th row + vst1.u8 {q6}, [r2], r3 //write 4th 16Byte + + FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15 + vld1.u8 {q4}, [r0], r1 //read 6th row + vst1.u8 {q6}, [r2], r3 //write 5th 16Byte + + FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15 + vld1.u8 {q5}, [r0], r1 //read 7th row + vst1.u8 
{q6}, [r2], r3 //write 6th 16Byte + + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vld1.u8 {q0}, [r0], r1 //read 8th row + vst1.u8 {q6}, [r2], r3 //write 7th 16Byte + + FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15 + pld [r0] + FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 8th 16Byte + + //q2, q3, q4, q5, q0 --> q0~q4 + vswp q0, q4 + vswp q0, q2 + vmov q1, q3 + vmov q3, q5 //q0~q4 + + sub r4, #8 + cmp r4, #1 + bne w17_v_mc_luma_loop + // the last 16Bytes + vld1.u8 {q5}, [r0], r1 //q5=src[3] + FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15 + FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15 + vst1.u8 {q6}, [r2], r3 //write 1st 16Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, r0, r1, lsl #1 //src[-2*src_stride] + pld [r0] + pld [r0, r1] + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0}, [r0], r1 //d0=src[-2] + vld1.u8 {d1}, [r0], r1 //d1=src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + vld1.u8 {d2}, [r0], r1 //d2=src[0] + vld1.u8 {d3}, [r0], r1 //d3=src[1] + + vld1.u8 {d4}, [r0], r1 //d4=src[2] + vld1.u8 {d5}, [r0], r1 //d5=src[3] + +w9_v_mc_luma_loop: + + pld [r0] + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 + vld1.u8 {d0}, [r0], r1 //read 2nd row + vst1.u8 {d12}, [r2], r3 //write 1st 8Byte + + pld [r0] + FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15 + vld1.u8 {d1}, [r0], r1 //read 3rd row + vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte + + pld [r0] + FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15 + vld1.u8 {d2}, [r0], r1 //read 4th row + vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte + + pld [r0] + FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15 + vld1.u8 {d3}, [r0], r1 //read 5th row + vst1.u8 {d12}, [r2], r3 //write 4th 8Byte + + //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5 + vswp 
q0, q2 + vswp q1, q2 + + sub r4, #4 + cmp r4, #1 + bne w9_v_mc_luma_loop + + FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15 + vst1.u8 {d12}, [r2], r3 //write last 8Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon + push {r4} // r4 becomes the row counter. NOTE(review): d8-d15 are written below but only r4 is saved; AAPCS lists d8-d15 as callee-saved -- confirm callers tolerate this + ldr r4, [sp, #4] // r4 = iHeight, the 5th argument (first stack slot, +4 for the r4 just pushed); r0=pSrc, r1=iSrcStride, r2=pDst, r3=iDstStride + + sub r0, #2 //src[-2] + sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {d0-d2}, [r0], r1 //loads 24 bytes, uses 22(17+5), =src[-2] + vld1.u8 {d3-d5}, [r0], r1 //loads 24 bytes, uses 22(17+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {d6-d8}, [r0], r1 //loads 24 bytes, uses 22(17+5), =src[0] + vld1.u8 {d9-d11}, [r0], r1 //loads 24 bytes, uses 22(17+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {d12-d14}, [r0], r1 //loads 24 bytes, uses 22(17+5), =src[2] + sub r3, #16 // r3 = iDstStride-16: the 16-byte store below advances r2 by 16 via writeback, the single-byte store that follows then adds r3 to reach the next row + +w17_hv_mc_luma_loop: + + vld1.u8 {d15-d17}, [r0], r1 //loads 24 bytes, uses 22(17+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {d0, d1}, [r2]! 
//write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d0-d2}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4 + vst1.u8 {d3, d4}, [r2]! //write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0] + vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d3-d5}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7 + vst1.u8 {d6, d7}, [r2]! 
//write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0] + vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte + + vld1.u8 {d6-d8}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9 + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10 + vst1.u8 {d9, d10}, [r2]! //write 16Byte; writeback advances r2 by 16 as in the other rows (r3 holds dst_stride-16 and must only be added once, by the single-byte store below) + UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0] + vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte + + //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14 + vswp q0, q6 + vswp q6, q3 + vmov q5, q2 + vmov q2, q8 + + vmov d20,d8 + vmov q4, q1 + vmov q1, q7 + vmov d14,d20 + + sub r4, #4 + cmp r4, #1 + bne w17_hv_mc_luma_loop + //the last row + vld1.u8 {d15-d17}, [r0], r1 //loads 24 bytes, uses 22(17+5), =src[3] + // vertical filtered into q9/q10 + FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0] + // vertical filtered into q10/q11 + FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1] + vst1.u8 {q0}, [r2]! 
//write 16Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0] + vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte + + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon + push {r4} + ldr r4, [sp, #4] + + sub r0, #2 //src[-2] + sub r0, r0, r1, lsl #1 //src[-2*src_stride-2] + pld [r0] + pld [r0, r1] + + vmov.u16 q14, #0x0014 // 20 + vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2] + vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1] + + pld [r0] + pld [r0, r1] + vshr.u16 q15, q14, #2 // 5 + + vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0] + vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1] + pld [r0] + pld [r0, r1] + vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2] + sub r3, #8 + +w9_hv_mc_luma_loop: + + vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3] + //the 1st row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q0}, [r0], r1 //read 2nd row + //the 2nd row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! 
//write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q1}, [r0], r1 //read 3rd row + //the 3rd row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + vld1.u8 {q2}, [r0], r1 //read 4th row + //the 4th row + pld [r0] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! //write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + + //q4~q5, q0~q2, --> q0~q4 + vswp q0, q4 + vswp q2, q4 + vmov q3, q1 + vmov q1, q5 + + sub r4, #4 + cmp r4, #1 + bne w9_hv_mc_luma_loop + //the last row + vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3] + // vertical filtered into q6/q7 + FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail + FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail + // horizon filtered + UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13 + FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0] + vst1.u8 d12, [r2]! 
//write 8Byte + UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0] + vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte + pop {r4} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + ldr r6, [sp, #20] + +enc_w16_pix_avg_loop: + vld1.u8 {q0}, [r2], r3 + vld1.u8 {q1}, [r4], r5 + vld1.u8 {q2}, [r2], r3 + vld1.u8 {q3}, [r4], r5 + + vld1.u8 {q4}, [r2], r3 + vld1.u8 {q5}, [r4], r5 + vld1.u8 {q6}, [r2], r3 + vld1.u8 {q7}, [r4], r5 + + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {q0}, [r0], r1 + + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {q2}, [r0], r1 + + AVERAGE_TWO_8BITS d8, d8, d10 + AVERAGE_TWO_8BITS d9, d9, d11 + vst1.u8 {q4}, [r0], r1 + + AVERAGE_TWO_8BITS d12, d12, d14 + AVERAGE_TWO_8BITS d13, d13, d15 + vst1.u8 {q6}, [r0], r1 + + sub r6, #4 + cmp r6, #0 + bne enc_w16_pix_avg_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + + +WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon + push {r4, r5, r6} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + ldr r6, [sp, #20] +enc_w8_pix_avg_loop: + + vld1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r4], r5 + vld1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r4], r5 + + AVERAGE_TWO_8BITS d0, d0, d2 + AVERAGE_TWO_8BITS d1, d1, d3 + vst1.u8 {d0}, [r0], r1 + vst1.u8 {d1}, [r0], r1 + + vld1.u8 {d4}, [r2], r3 + vld1.u8 {d6}, [r4], r5 + vld1.u8 {d5}, [r2], r3 + vld1.u8 {d7}, [r4], r5 + + AVERAGE_TWO_8BITS d4, d4, d6 + AVERAGE_TWO_8BITS d5, d5, d7 + vst1.u8 {d4}, [r0], r1 + vst1.u8 {d5}, [r0], r1 + + sub r6, #4 + cmp r6, #0 + bne enc_w8_pix_avg_loop + + pop {r4, r5, r6} +WELS_ASM_FUNC_END + #endif diff --git a/codec/encoder/core/src/mc.cpp b/codec/encoder/core/src/mc.cpp index 29fb0ae4..980ace30 100644 --- a/codec/encoder/core/src/mc.cpp +++ b/codec/encoder/core/src/mc.cpp @@ -480,6 +480,99 @@ void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int } #endif //X86_ASM + + 
//***************************************************************************//
+ //  NEON implementation: wrappers adapting the NEON kernels to the encoder MC signatures  //
+ //***************************************************************************//
+#if defined(HAVE_NEON)
+// Horizontal half-pel dispatcher: iWidth 17 selects the 17-wide kernel, every other value the 9-wide one.
+void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  if (iWidth != 17) //handled as iWidth == 9
+    McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Vertical half-pel dispatcher: iWidth 16 selects the height-17 kernel, every other value the height-9 one.
+void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  if (iWidth != 16) //handled as iWidth == 8
+    McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Centre half-pel dispatcher: iWidth 17 selects the 17-wide kernel, every other value the 9-wide one.
+void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  if (iWidth != 17) //handled as iWidth == 9
+    McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) // two 256-byte planes at a 16-byte pitch (assumes iHeight <= 16 - TODO confirm)
+  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); // plane 0: horizontal half-pel
+  McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp + 256, 16, iHeight); // plane 1: vertical half-pel
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, pTmp + 256, iHeight); // PixelAvg of both planes into pDst
+}
+void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) // two 256-byte scratch planes
+  McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); // plane 0: vertical half-pel
+  McHorVer22WidthEq16_neon(pSrc, iSrcStride, pTmp + 256, 16, iHeight); // plane 1: centre half-pel
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, pTmp + 256, iHeight); // PixelAvg of both planes into pDst
+}
+void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) // two 256-byte scratch planes
+  McHorVer20WidthEq16_neon(pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // plane 0: horizontal half-pel, one row down
+  McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp + 256, 16, iHeight); // plane 1: vertical half-pel
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, pTmp + 256, iHeight); // PixelAvg of both planes into pDst
+}
+void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) // two 256-byte scratch planes
+  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); // plane 0: horizontal half-pel
+  McHorVer22WidthEq16_neon(pSrc, iSrcStride, pTmp + 256, 16, iHeight); // plane 1: centre half-pel
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, pTmp + 256, iHeight); // PixelAvg of both planes into pDst
+}
+void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) // two 256-byte scratch planes
+  McHorVer20WidthEq16_neon(pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // plane 0: horizontal half-pel, one row down
+  McHorVer22WidthEq16_neon(pSrc, iSrcStride, pTmp + 256, 16, iHeight); // plane 1: centre half-pel
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, pTmp + 256, iHeight); // PixelAvg of both planes into pDst
+}
+void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) // two 256-byte scratch planes
+  McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight); // plane 0: horizontal half-pel
+  McHorVer02WidthEq16_neon(pSrc + 1, iSrcStride, pTmp + 256, 16, iHeight); // plane 1: vertical half-pel, one column right
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, pTmp + 256, iHeight); // PixelAvg of both planes into pDst
+}
+void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) // two 256-byte scratch planes
+  McHorVer02WidthEq16_neon(pSrc + 1, iSrcStride, pTmp, 16, iHeight); // plane 0: vertical half-pel, one column right
+  McHorVer22WidthEq16_neon(pSrc, iSrcStride, pTmp + 256, 16, iHeight); // plane 1: centre half-pel
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, pTmp + 256, iHeight); // PixelAvg of both planes into pDst
+}
+void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16) // two 256-byte scratch planes
+  McHorVer20WidthEq16_neon(pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); // plane 0: horizontal half-pel, one row down
+  McHorVer02WidthEq16_neon(pSrc + 1, iSrcStride, pTmp + 256, 16, iHeight); // plane 1: vertical half-pel, one column right
+  PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, pTmp + 256, iHeight); // PixelAvg of both planes into pDst
+}
+// Chroma MC: plain copy when both low 3 bits of the MV are zero, otherwise the weighted kernel fed from g_kuiABCD.
+void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
+  const int32_t kiFracX = sMv.iMvX & 0x07;
+  const int32_t kiFracY = sMv.iMvY & 0x07;
+  if ((kiFracX | kiFracY) == 0) { // no fractional offset in either direction
+    if (8 != iWidth) // iWidth == 4
+      McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    else
+      McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  int32_t* const pWeights = (int32_t*)(g_kuiABCD[kiFracY][kiFracX]); // table entry indexed [y fraction][x fraction]
+  if (8 != iWidth) //if(4 == iWidth)
+    McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, pWeights, iHeight);
+  else
+    McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, pWeights, iHeight);
+}
+#endif
+
 typedef void (*PixelAvgFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t, int32_t);
 void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
   static PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c};
@@ -498,6 +591,14 @@ void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
     McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
   };
 #endif
+#if defined(HAVE_NEON)
+  static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_neon[16] = { //[x][y]
+    McCopyWidthEq16_neon, McHorVer10WidthEq16_neon, McHorVer20WidthEq16_neon, McHorVer30WidthEq16_neon,
+    McHorVer01WidthEq16_neon, EncMcHorVer11_neon, EncMcHorVer21_neon, EncMcHorVer31_neon,
+    McHorVer02WidthEq16_neon, EncMcHorVer12_neon, McHorVer22WidthEq16_neon, EncMcHorVer32_neon,
+    McHorVer03WidthEq16_neon, EncMcHorVer13_neon, EncMcHorVer23_neon, EncMcHorVer33_neon
+  };
+#endif
   pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
   pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
@@ -538,5 +639,17 @@ void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
   }
 #endif //(X86_ASM)
+
+#if defined(HAVE_NEON)
+  if (uiCpuFlag & WELS_CPU_NEON) {
+    pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_neon;
+    pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_neon;
+    pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_neon;
+    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_neon;
+    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_neon;// wrapper picks the 17- or 9-wide kernel from iWidth
+    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_neon;// wrapper picks the height-17 or height-9 kernel from iWidth
+    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;// wrapper picks the 17- or 9-wide kernel from iWidth
+  }
+#endif
 }
 }