diff --git a/codec/common/arm64/arm_arch64_common_macro.S b/codec/common/arm64/arm_arch64_common_macro.S index cc712250..2ea6d5de 100644 --- a/codec/common/arm64/arm_arch64_common_macro.S +++ b/codec/common/arm64/arm_arch64_common_macro.S @@ -32,13 +32,13 @@ #ifdef __APPLE__ -.macro WELS_ASM_ARCH64_FUNC_BEGIN +.macro WELS_ASM_AARCH64_FUNC_BEGIN .align 2 .globl _$0 _$0: .endm -.macro WELS_ASM_ARCH64_FUNC_END +.macro WELS_ASM_AARCH64_FUNC_END ret .endm #else @@ -46,7 +46,7 @@ ret .section .note.GNU-stack,"",%progbits // Mark stack as non-executable .text -.macro WELS_ASM_ARCH64_FUNC_BEGIN funcName +.macro WELS_ASM_AARCH64_FUNC_BEGIN funcName .align 2 .global \funcName .type \funcName, %function @@ -54,7 +54,7 @@ ret \funcName: .endm -.macro WELS_ASM_ARCH64_FUNC_END +.macro WELS_ASM_AARCH64_FUNC_END ret .endfunc .endm diff --git a/codec/common/arm64/deblocking_aarch64_neon.S b/codec/common/arm64/deblocking_aarch64_neon.S index b32aff69..9ff1c509 100644 --- a/codec/common/arm64/deblocking_aarch64_neon.S +++ b/codec/common/arm64/deblocking_aarch64_neon.S @@ -552,7 +552,7 @@ bs_mv_check_jump1: .endm #endif -WELS_ASM_ARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon ld1 {v0.8b, v1.8b, v2.8b}, [x0] ins v0.d[1], v1.d[0] uzp1 v0.2d, v0.2d, v1.2d @@ -564,10 +564,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon abs v2.8b, v2.8b ins v1.d[0], v0.d[1] st1 {v0.8b, v1.8b, v2.8b}, [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc +WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc dup v16.16b, w2 //alpha dup v17.16b, w3 //beta add x2, x1, x1, lsl #1 @@ -622,10 +622,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t st1 {v3.16b}, [x2], x1 st1 {v21.16b}, [x2] DeblockLumaLt4V_AArch64_neon_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon dup v16.16b, w2 //alpha dup v17.16b, w3 //beta sub x3, x0, x1, lsl #2 @@ -688,10 +688,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon DIFF_LUMA_EQ4_MASK v19, v6, v22, v17 st1 {v17.16b}, [x3], x1 DeblockLumaEq4V_AArch64_neon_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc +WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc dup v16.16b, w2 //alpha dup v17.16b, w3 //beta sub x2, x0, #3 @@ -773,10 +773,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t STORE_LUMA_DATA_4 v25, v26, v27, v28, 12, 13 STORE_LUMA_DATA_4 v25, v26, v27, v28, 14, 15 DeblockLumaLt4H_AArch64_neon_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon dup v16.16b, w2 //alpha dup v17.16b, w3 //beta sub x3, x0, #4 @@ -869,10 +869,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 14 STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 15 DeblockLumaEq4H_AArch64_neon_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc +WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc dup v16.16b, w3 //alpha dup v17.16b, w4 //beta lsl x3, x2, #1 @@ -919,9 +919,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uin st1 {v2.d} [0], [x6] st1 {v2.d} [1], [x7] DeblockChromaLt4V_AArch64_neon_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc +WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc dup v16.16b, w3 //alpha dup v17.16b, w4 //beta sub x6, x0, #2 //pPixCb-2 @@ -992,9 +992,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uin STORE_CHROMA_DATA_2 v1, v2, x1, 14 STORE_CHROMA_DATA_2 v1, v2, x1, 15 DeblockChromaLt4H_AArch64_neon_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta +WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta dup v16.16b, w3 //alpha dup v17.16b, w4 //beta lsl x3, x2, #1 @@ -1027,9 +1027,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uin st1 {v7.d} [0], [x6] st1 {v7.d} [1], [x7] DeblockChromaEq4V_AArch64_neon_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta +WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta dup v16.16b, w3 //alpha dup v17.16b, w4 //beta @@ -1085,10 +1085,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uin STORE_CHROMA_DATA_2 v6, v7, x1, 14 STORE_CHROMA_DATA_2 v6, v7, x1, 15 DeblockChromaEq4H_AArch64_neon_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon // Checking the nzc status BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status // For checking bS[I] = 2 @@ -1110,7 +1110,7 @@ WELS_ASM_ARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon umax v1.16b, v18.16b, v16.16b umax v0.16b, v19.16b, v17.16b st1 {v0.16b, v1.16b}, [x4] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END #endif diff --git a/codec/common/arm64/expand_picture_aarch64_neon.S b/codec/common/arm64/expand_picture_aarch64_neon.S index 572bd0f9..9ae2b29e 100644 --- a/codec/common/arm64/expand_picture_aarch64_neon.S +++ b/codec/common/arm64/expand_picture_aarch64_neon.S @@ -34,7 +34,7 @@ .text #include "arm_arch64_common_macro.S" -WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureLuma_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN ExpandPictureLuma_AArch64_neon mov x7, x0 mov x8, x3 add x4, x7, x2 @@ -72,9 +72,9 @@ _expand_picture_luma_loop1: sub x2, x2, #16 cbnz x2, _expand_picture_luma_loop0 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon //Save the dst mov x7, x0 mov x8, x3 @@ -138,6 +138,6 @@ _expand_picture_chroma_loop3: cbnz x8, _expand_picture_chroma_loop3 _expand_picture_chroma_end: -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END #endif diff --git a/codec/common/arm64/mc_aarch64_neon.S b/codec/common/arm64/mc_aarch64_neon.S index c6566f3f..f58d7c2b 100644 --- a/codec/common/arm64/mc_aarch64_neon.S +++ b/codec/common/arm64/mc_aarch64_neon.S @@ -386,7 +386,7 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0 #endif //(const uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4}) -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -406,9 +406,9 @@ w16_h_mc_luma_loop: sub x4, x4, #1 st1 {v20.16b}, [x2], x3 //write 16Byte cbnz x4, w16_h_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -427,9 +427,9 @@ w8_h_mc_luma_loop: sub x4, x4, #1 st1 {v20.8b}, [x2], x3 //write 8Byte cbnz x4, w8_h_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -462,9 +462,9 @@ w4_h_mc_luma_loop: st1 {v20.s}[1], [x2], x3 //write 4Byte sub x4, x4, #1 cbnz x4, w4_h_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -484,10 +484,10 @@ w16_xy_10_mc_luma_loop: sub x4, x4, #1 st1 {v20.16b}, [x2], x3 //write 16Byte cbnz x4, w16_xy_10_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq8_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -506,9 +506,9 @@ w8_xy_10_mc_luma_loop: sub x4, x4, #1 st1 {v20.8b}, [x2], x3 //write 8Byte cbnz x4, w8_xy_10_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -541,10 +541,10 @@ w4_xy_10_mc_luma_loop: st1 {v20.s}[1], [x2], x3 //write 4Byte sub x4, x4, #1 cbnz x4, w4_xy_10_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq16_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -564,10 +564,10 @@ w16_xy_30_mc_luma_loop: sub x4, x4, #1 st1 {v20.16b}, [x2], x3 //write 16Byte cbnz x4, w16_xy_30_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq8_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -586,9 +586,9 @@ w8_xy_30_mc_luma_loop: sub x4, x4, #1 st1 {v20.8b}, [x2], x3 //write 8Byte cbnz x4, w8_xy_30_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon sub x0, x0, #2 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -621,10 +621,10 @@ w4_xy_30_mc_luma_loop: st1 {v20.s}[1], [x2], x3 //write 4Byte sub x4, x4, #1 cbnz x4, w4_xy_30_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq16_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -704,10 +704,10 @@ w16_xy_01_mc_luma_loop: mov.16b v6, v7 sub x4, x4, #8 cbnz x4, w16_xy_01_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq8_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -754,10 +754,10 @@ w8_xy_01_mc_luma_loop: mov.16b v4, v7 sub x4, x4, #4 cbnz x4, w8_xy_01_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer01WidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq4_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -810,10 +810,10 @@ w4_xy_01_mc_luma_loop: sub x4, x4, #4 cbnz x4, w4_xy_01_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq16_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -893,10 +893,10 @@ w16_xy_03_mc_luma_loop: mov.16b v6, v7 sub x4, x4, #8 cbnz x4, w16_xy_03_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq8_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -943,10 +943,10 @@ w8_xy_03_mc_luma_loop: mov.16b v4, v7 sub x4, x4, #4 cbnz x4, w8_xy_03_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer03WidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq4_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -999,10 +999,10 @@ w4_xy_03_mc_luma_loop: sub x4, x4, #4 cbnz x4, w4_xy_03_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq16_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -1082,10 +1082,10 @@ w16_xy_02_mc_luma_loop: mov.16b v6, v7 sub x4, x4, #8 cbnz x4, w16_xy_02_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq8_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -1132,10 +1132,10 @@ w8_xy_02_mc_luma_loop: mov.16b v4, v7 sub x4, x4, #4 cbnz x4, w8_xy_02_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02WidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq4_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -1188,10 +1188,10 @@ w4_xy_02_mc_luma_loop: sub x4, x4, #4 cbnz x4, w4_xy_02_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq16_AArch64_neon stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! @@ -1354,9 +1354,9 @@ w16_hv_mc_luma_loop: ldp d12, d13, [sp], #16 ldp d10, d11, [sp], #16 ldp d8, d9, [sp], #16 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq8_AArch64_neon sub x0, x0, #2 sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 @@ -1425,10 +1425,10 @@ w8_hv_mc_luma_loop: sub x4, x4, #4 cbnz x4, w8_hv_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22WidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq4_AArch64_neon sub x0, x0, #2 sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 @@ -1496,9 +1496,9 @@ w4_hv_mc_luma_loop: sub x4, x4, #4 cbnz x4, w4_hv_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon //prfm pldl1strm, [x0] w16_copy_loop: //prfm pldl1strm, [x0, x1] @@ -1510,9 +1510,9 @@ w16_copy_loop: sub x4, x4, #2 cbnz x4, w16_copy_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon //prfm pldl1strm, [x0] w8_copy_loop: //prfm pldl1strm, [x0, x1] @@ -1524,9 +1524,9 @@ w8_copy_loop: sub x4, x4, #2 cbnz x4, w8_copy_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon //prfm pldl1strm, [x0] w4_copy_loop: //prfm pldl1strm, [x0, x1] @@ -1538,9 +1538,9 @@ w4_copy_loop: sub x4, x4, #2 cbnz x4, w4_copy_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon enc_w16_pix_avg_loop: ld1 {v0.16b}, [x2], x3 //read 16Byte : src0: 0 line @@ -1571,9 +1571,9 @@ enc_w16_pix_avg_loop: sub x6, x6, #4 cbnz x6, enc_w16_pix_avg_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon //prfm pldl1strm, [x2] //prfm pldl1strm, [x4] enc_w8_pix_avg_loop: @@ -1608,9 +1608,9 @@ enc_w8_pix_avg_loop: sub x6, x6, #4 cbnz x6, enc_w8_pix_avg_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon //prfm pldl1strm, [x2] //prfm pldl1strm, [x4] w16_pix_avg_loop: @@ -1650,9 +1650,9 @@ w16_pix_avg_loop: sub x6, x6, #4 cbnz x6, w16_pix_avg_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon //prfm pldl1strm, [x2] //prfm pldl1strm, [x4] w8_pix_avg_loop: @@ -1687,10 +1687,10 @@ w8_pix_avg_loop: sub x6, x6, #4 cbnz x6, w8_pix_avg_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq4_AArch64_neon //prfm pldl1strm, [x2] //prfm pldl1strm, [x4] w4_pix_avg_loop: @@ -1708,9 +1708,9 @@ w4_pix_avg_loop: sub x6, x6, #2 cbnz x6, w4_pix_avg_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D ld1 {v0.16b}, [x0], x1 // src[x] ext.16b v1, v0, v0, #1 // src[x+1] @@ -1739,9 +1739,9 @@ w8_mc_chroma_loop: mov.16b v1, v19 sub x5, x5, #2 cbnz x5, w8_mc_chroma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D ld1 {v0.8b}, [x0], x1 // src[x] ext.8b v1, v0, v0, #1 // src[x+1] @@ -1768,10 +1768,10 @@ w4_mc_chroma_loop: mov.8b v1, v19 sub x5, x5, #2 cbnz x5, w4_mc_chroma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon sub x0, x0, #2 sub x3, x3, #16 mov x5, #16 @@ -1798,9 +1798,9 @@ w17_h_mc_luma_loop: sub x4, x4, #1 cbnz x4, w17_h_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon sub x0, x0, #2 sub x3, x3, #8 mov x5, #8 @@ -1826,10 +1826,10 @@ w9_h_mc_luma_loop: sub x4, x4, #1 cbnz x4, w9_h_mc_luma_loop -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! @@ -2029,10 +2029,10 @@ w17_hv_mc_luma_loop: ldp d12, d13, [sp], #16 ldp d10, d11, [sp], #16 ldp d8, d9, [sp], #16 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon sub x0, x0, #2 sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 @@ -2125,9 +2125,9 @@ w9_hv_mc_luma_loop: st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26 st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02Height17_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height17_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -2214,9 +2214,9 @@ w17_v_mc_luma_loop: FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 st1 {v20.16b}, [x2], x3 //write 16Byte : last line -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN McHorVer02Height9_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height9_AArch64_neon sub x0, x0, x1, lsl #1 movi v0.8h, #20, lsl #0 movi v1.8h, #5, lsl #0 @@ -2268,7 +2268,7 @@ w9_v_mc_luma_loop: ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END #endif diff --git a/codec/decoder/core/arm64/intra_pred_aarch64_neon.S b/codec/decoder/core/arm64/intra_pred_aarch64_neon.S index bccb2b4a..7d498ed4 100644 --- a/codec/decoder/core/arm64/intra_pred_aarch64_neon.S +++ b/codec/decoder/core/arm64/intra_pred_aarch64_neon.S @@ -35,16 +35,16 @@ #include "arm_arch64_common_macro.S" // for Luma 4x4 -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredH_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredH_AArch64_neon sxtw x1, w1 sub x2, x0, #1 .rept 4 ld1r {v0.8b}, [x2], x1 st1 {v0.S}[0], [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDc_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDc_AArch64_neon sxtw x1, w1 sub x2, x0, x1 sub x3, x0, #1 @@ -59,9 +59,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDc_AArch64_neon .rept 4 st1 {v0.S}[0], [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDcTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDcTop_AArch64_neon sxtw x1, w1 sub x2, x0, x1 sub v0.8b, v0.8b, v0.8b @@ -72,9 +72,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDcTop_AArch64_neon .rept 4 st1 {v0.S}[0], [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.8b}, [x2] @@ -92,9 +92,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_AArch64_neon st1 {v0.S}[0], [x0], x1 ext v0.8b, v1.8b, v2.8b, #3 st1 {v0.S}[0], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDLTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDLTop_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.8b}, [x2] @@ -113,9 +113,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDLTop_AArch64_neon st1 {v0.S}[0], [x0], x1 ext v0.8b, v1.8b, v2.8b, #3 st1 {v0.S}[0], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.8b}, [x2] @@ -131,9 +131,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_AArch64_neon ext v2.8b, v1.8b, v1.8b, #1 st1 {v3.s}[0], [x0], x1 // write the third row st1 {v2.s}[0], [x0] // write the fourth row -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVLTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVLTop_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.8b}, [x2] @@ -151,9 +151,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVLTop_AArch64_neon ext v2.8b, v1.8b, v1.8b, #1 st1 {v3.s}[0], [x0], x1 // write the third row st1 {v2.s}[0], [x0] // write the fourth row -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.s}[1], [x2] @@ -182,10 +182,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_AArch64_neon ins v3.b[4], v3.b[3] st1 {v3.s}[1], [x0], x1 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_AArch64_neon sxtw x1, w1 sub x2, x0, #1 mov x3, #3 @@ -210,9 +210,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_AArch64_neon st1 {v2.s}[0], [x0], x1 st1 {v3.s}[1], [x0], x1 st1 {v0.s}[0], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_AArch64_neon sxtw x1, w1 sub x2, x0, #1 sub x2, x2, x1 // x2 points to top left @@ -235,29 +235,29 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_AArch64_neon ext v3.8b, v2.8b, v1.8b, #2 st1 {v3.s}[0], [x0], x1 st1 {v2.s}[0], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END // for Chroma 8x8 -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredV_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredV_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.8b}, [x2] .rept 8 st1 {v0.8b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredH_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredH_AArch64_neon sxtw x1, w1 sub x2, x0, #1 .rept 8 ld1r {v0.8b}, [x2], x1 st1 {v0.8b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredDc_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredDc_AArch64_neon sxtw x1, w1 sub x2, x0, x1 sub x3, x0, #1 @@ -291,9 +291,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredDc_AArch64_neon st1 {v2.8b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTop_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.8b}, [x2] @@ -306,13 +306,13 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTop_AArch64_neon .rept 8 st1 {v1.8b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END .align 16 intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4 intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4 -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon sxtw x1, w1 sub x2, x0, x1 sub x2, x2, #1 @@ -362,28 +362,28 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon sqrshrun v1.8b, v4.8h, #5 st1 {v1.8b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END //for Luma 16x16 -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredV_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredV_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.16b}, [x2] .rept 16 st1 {v0.16b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredH_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredH_AArch64_neon sxtw x1, w1 sub x2, x0, #1 .rept 16 ld1r {v0.16b}, [x2], x1 st1 {v0.16b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_AArch64_neon sxtw x1, w1 sub x2, x0, x1 sub x3, x0, #1 @@ -413,9 +413,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_AArch64_neon .rept 16 st1 {v0.16b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcTop_AArch64_neon sxtw x1, w1 sub x2, x0, x1 ld1 {v0.16b}, [x2] @@ -426,9 +426,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcTop_AArch64_neon .rept 16 st1 {v0.16b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcLeft_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcLeft_AArch64_neon sxtw x1, w1 sub x3, x0, #1 ld1 {v1.b}[0], [x3], x1 @@ -454,14 +454,14 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcLeft_AArch64_neon .rept 16 st1 {v0.16b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END .align 16 intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40 intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8 -WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon sxtw x1, w1 sub x2, x0, x1 sub x2, x2, #1 @@ -521,5 +521,5 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon sqrshrun2 v4.16b, v3.8h, #5 st1 {v4.16b}, [x0], x1 .endr -WELS_ASM_ARCH64_FUNC_END -#endif \ No newline at end of file +WELS_ASM_AARCH64_FUNC_END +#endif diff --git a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S index fbc3b974..43d58bdd 100644 --- a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S +++ b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S @@ -35,15 +35,15 @@ #include "arm_arch64_common_macro.S" // for Luma 4x4 -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredH_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredH_AArch64_neon sub x3, x1, #1 .rept 4 ld1r {v0.8b}, [x3], x2 st1 {v0.S}[0], [x0], 4 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDc_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredDc_AArch64_neon sub x3, x1, x2 sub x4, x1, #1 ldr s0, [x3] @@ -57,9 +57,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDc_AArch64_neon .rept 4 st1 {v0.S}[0], [x0], 4 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDcTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredDcTop_AArch64_neon sub x3, x1, x2 sub v0.8b, v0.8b, v0.8b ldr s0, [x3] @@ -69,9 +69,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDcTop_AArch64_neon .rept 4 st1 {v0.S}[0], [x0], 4 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDDL_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredDDL_AArch64_neon sub x3, x1, x2 ld1 {v0.8b}, [x3] dup v1.8b, v0.b[7] @@ -88,9 +88,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDDL_AArch64_neon st1 {v0.S}[0], [x0], 4 ext v0.8b, v1.8b, v2.8b, #3 st1 {v0.S}[0], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDDLTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredDDLTop_AArch64_neon sub x3, x1, x2 ld1 {v0.8b}, [x3] dup v1.8b, v0.b[3] @@ -108,9 +108,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDDLTop_AArch64_neon st1 {v0.S}[0], [x0], 4 ext v0.8b, v1.8b, v2.8b, #3 st1 {v0.S}[0], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVL_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredVL_AArch64_neon sub x3, x1, x2 ld1 {v0.8b}, [x3] ext v1.8b, v0.8b, v0.8b, #1 @@ -125,9 +125,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVL_AArch64_neon ext v2.8b, v1.8b, v1.8b, #1 st1 {v3.s}[0], [x0], 4 // write the third row st1 {v2.s}[0], [x0] // write the fourth row -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVLTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredVLTop_AArch64_neon sub x3, x1, x2 ld1 {v0.8b}, [x3] dup v1.8b, v0.b[3] @@ -144,9 +144,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVLTop_AArch64_neon ext v2.8b, v1.8b, v1.8b, #1 st1 {v3.s}[0], [x0], 4 // write the third row st1 {v2.s}[0], [x0] // write the fourth row -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVR_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredVR_AArch64_neon sub x3, x1, x2 ld1 {v0.s}[1], [x3] sub x3, x3, #1 @@ -174,10 +174,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVR_AArch64_neon ins v3.b[4], v3.b[3] st1 {v3.s}[1], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredHU_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredHU_AArch64_neon sub x3, x1, #1 mov x4, #3 mul x4, x4, x2 @@ -201,9 +201,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredHU_AArch64_neon st1 {v2.s}[0], [x0], 4 st1 {v3.s}[1], [x0], 4 st1 {v0.s}[0], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredHD_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI4x4LumaPredHD_AArch64_neon sub x3, x1, #1 sub x3, x3, x2 // x2 points to top left ld1 {v0.s}[1], [x3], x2 @@ -225,27 +225,27 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredHD_AArch64_neon ext v3.8b, v2.8b, v1.8b, #2 st1 {v3.s}[0], [x0], 4 st1 {v2.s}[0], [x0] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END // for Chroma 8x8 -WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredV_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredV_AArch64_neon sub x3, x1, x2 ld1 {v0.8b}, [x3] .rept 8 st1 {v0.8b}, [x0], 8 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredH_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredH_AArch64_neon sub x3, x1, #1 .rept 8 ld1r {v0.8b}, [x3], x2 st1 {v0.8b}, [x0], 8 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredDc_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredDc_AArch64_neon sub x3, x1, x2 sub x4, x1, #1 ld1 {v0.8b}, [x3] @@ -278,9 +278,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredDc_AArch64_neon st1 {v2.8b}, [x0], 8 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch64_neon sub x3, x1, x2 ld1 {v0.8b}, [x3] uaddlp v0.4h, v0.8b @@ -292,13 +292,13 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch64_neon .rept 8 st1 {v1.8b}, [x0], 8 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END .align 16 intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4 intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4 -WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon sub x3, x1, x2 sub x3, x3, #1 mov x4, x3 @@ -347,26 +347,26 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon sqrshrun v1.8b, v4.8h, #5 st1 {v1.8b}, [x0], 8 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END //for Luma 16x16 -WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon sub x3, x1, x2 ld1 {v0.16b}, [x3] .rept 16 st1 {v0.16b}, [x0], 16 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon sub x3, x1, #1 .rept 16 ld1r {v0.16b}, [x3], x2 st1 {v0.16b}, [x0], 16 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDc_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDc_AArch64_neon sub x3, x1, x2 sub x4, x1, #1 ld1 {v0.16b}, [x3] @@ -395,9 +395,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDc_AArch64_neon .rept 16 st1 {v0.16b}, [x0], 16 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDcTop_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDcTop_AArch64_neon sub x3, x1, x2 ld1 {v0.16b}, [x3] // reduce instruction @@ -407,9 +407,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDcTop_AArch64_neon .rept 16 st1 {v0.16b}, [x0], 16 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_AArch64_neon sub x3, x1, #1 ld1 {v1.b}[0], [x3], x2 ld1 {v1.b}[1], [x3], x2 @@ -434,14 +434,14 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_AArch64_neon .rept 16 st1 {v0.16b}, [x0], 16 .endr -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END .align 16 intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40 intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8 -WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon sub x3, x1, x2 sub x3, x3, #1 mov x4, x3 @@ -500,5 +500,5 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon sqrshrun2 v4.16b, v3.8h, #5 st1 {v4.16b}, [x0], 16 .endr -WELS_ASM_ARCH64_FUNC_END -#endif \ No newline at end of file +WELS_ASM_AARCH64_FUNC_END +#endif diff --git a/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S b/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S index 501ccf3d..f4ef72ba 100644 --- a/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S +++ b/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S @@ -268,7 +268,7 @@ .endm #endif -WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon ldr x11, [sp, #0] LOAD_CHROMA_DATA x0, v0.8b, v0.b @@ -366,9 +366,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon SELECT_BEST_COST_PREFER_HIGHER w0 str w7, [x4] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon LOAD_LUMA_DATA @@ -418,9 +418,9 @@ sad_intra_16x16_x3_opt_loop0: SELECT_BEST_COST_PREFER_LOWER w0 str w7, [x4] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon sub x9, x0, x1 ld1 {v16.s}[0], [x9] //top sub x9, x0, #1 @@ -508,9 +508,9 @@ satd_intra_4x4_x3_opt_jump1: satd_intra_4x4_x3_opt_end: mov w0, w10 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon ldr x11, [sp, #0] LOAD_CHROMA_DATA x0, v0.8b, v0.b @@ -598,10 +598,10 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon SELECT_BEST_COST_PREFER_HIGHER w0 str w7, [x4] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon LOAD_LUMA_DATA uaddlv h2, v0.16b @@ -676,6 +676,6 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon str w7, [x4] -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END #endif diff --git a/codec/encoder/core/arm64/pixel_aarch64_neon.S b/codec/encoder/core/arm64/pixel_aarch64_neon.S index 47b27c12..8b2fbcd9 100644 --- a/codec/encoder/core/arm64/pixel_aarch64_neon.S +++ b/codec/encoder/core/arm64/pixel_aarch64_neon.S @@ -235,7 +235,7 @@ .endm #endif -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon sxtw x1, w1 sxtw x3, w3 ld1 {v0.s}[0], [x0], x1 @@ -248,9 +248,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon .endr saddlv s2, v2.4h fmov w0, s2 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon sxtw x1, w1 sxtw x3, w3 ld1 {v0.8b}, [x0], x1 @@ -262,9 +262,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon uabal v2.8h, v0.8b, v1.8b .endr CALC_AND_STORE_SAD -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon sxtw x1, w1 sxtw x3, w3 ld1 {v0.8b}, [x0], x1 @@ -276,9 +276,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon uabal v2.8h, v0.8b, v1.8b .endr CALC_AND_STORE_SAD -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon sxtw x1, w1 sxtw x3, w3 ld1 {v0.16b}, [x0], x1 @@ -292,9 +292,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon uabal2 v2.8h, v0.16b, v1.16b .endr CALC_AND_STORE_SAD -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon sxtw x1, w1 sxtw x3, w3 ld1 {v0.16b}, [x0], x1 @@ -308,9 +308,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon uabal2 v2.8h, v0.16b, v1.16b .endr CALC_AND_STORE_SAD -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon sxtw x1, w1 sxtw x3, w3 ld1 {v0.s}[0], [x0], x1 @@ -348,9 +348,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon uabal v31.8h, v1.8b, v3.8b CALC_AND_STORE_SAD_FOUR -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon sxtw x1, w1 sxtw x3, w3 LOAD_8X8_1 @@ -371,9 +371,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon CALC_ABS_8X8_1 v31.8h, d CALC_AND_STORE_SAD_FOUR -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon sxtw x1, w1 sxtw x3, w3 LOAD_8X8_1 @@ -410,9 +410,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon CALC_ABS_8X8_1 v31.8h, a CALC_AND_STORE_SAD_FOUR -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon sxtw x1, w1 sxtw x3, w3 LOAD_16X8_1 @@ -433,9 +433,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon CALC_ABS_16X8_1 v31.8h, d CALC_AND_STORE_SAD_FOUR -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon sxtw x1, w1 sxtw x3, w3 @@ -473,9 +473,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon CALC_ABS_16X8_1 v31.8h, a CALC_AND_STORE_SAD_FOUR -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd4x4_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd4x4_AArch64_neon sxtw x1, w1 sxtw x3, w3 ld1 {v0.s}[0], [x0], x1 @@ -514,7 +514,7 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd4x4_AArch64_neon add w0, w0, #1 lsr w0, w0, #1 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END .macro SATD_8x4 ld1 {v0.8b}, [x0], x1 @@ -649,7 +649,7 @@ WELS_ASM_ARCH64_FUNC_END add v2.8h, v2.8h, v3.8h .endm -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd16x16_AArch64_neon sxtw x1, w1 sxtw x3, w3 SATD_16x4 @@ -661,9 +661,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x16_AArch64_neon .endr uaddlv s4, v31.8h fmov w0, s4 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd16x8_AArch64_neon sxtw x1, w1 sxtw x3, w3 SATD_16x4 @@ -675,9 +675,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x8_AArch64_neon uaddlv s4, v31.8h fmov w0, s4 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x16_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd8x16_AArch64_neon sxtw x1, w1 sxtw x3, w3 SATD_8x4 @@ -689,9 +689,9 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x16_AArch64_neon .endr uaddlv s4, v31.8h fmov w0, s4 -WELS_ASM_ARCH64_FUNC_END +WELS_ASM_AARCH64_FUNC_END -WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x8_AArch64_neon +WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd8x8_AArch64_neon sxtw x1, w1 sxtw x3, w3 SATD_8x4 @@ -702,5 +702,5 @@ WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x8_AArch64_neon add v31.8h, v31.8h, v1.8h uaddlv s4, v31.8h fmov w0, s4 -WELS_ASM_ARCH64_FUNC_END -#endif \ No newline at end of file +WELS_ASM_AARCH64_FUNC_END +#endif