Use the correct syntax for the aarch64 ext instructions

Since Xcode 5.1, the Apple tools actually support using the
official, correct syntax for the ext instructions. This syntax
is already used in a number of places - use it consistently,
and get rid of the compatibility hacks.
This commit is contained in:
Martin Storsjö 2014-04-23 11:25:40 +03:00
parent 019fb9e20e
commit d5c71dbe2f
3 changed files with 14 additions and 21 deletions

View File

@ -67,11 +67,4 @@ ret
mov \arg0\().8b, \arg1\().8b
.endm
// ext.16b: wrapper that accepts bare vector register names.
//   arg0 = destination register, arg1 / arg2 = the two source registers,
//   arg3 = the immediate operand, forwarded to `ext` unchanged.
// Expands to a single `ext` with the .16b arrangement suffix appended to
// each of the three register operands, so `ext.16b v1, v1, v0, #12`
// assembles as `ext v1.16b, v1.16b, v0.16b, #12`.
.macro ext.16b arg0, arg1, arg2, arg3
ext \arg0\().16b, \arg1\().16b, \arg2\().16b, \arg3
.endm
// ext.8b: wrapper that accepts bare vector register names.
//   arg0 = destination register, arg1 / arg2 = the two source registers,
//   arg3 = the immediate operand, forwarded to `ext` unchanged.
// Expands to a single `ext` with the .8b arrangement suffix appended to
// each of the three register operands, so `ext.8b v1, v0, v0, #1`
// assembles as `ext v1.8b, v0.8b, v0.8b, #1`.
.macro ext.8b arg0, arg1, arg2, arg3
ext \arg0\().8b, \arg1\().8b, \arg2\().8b, \arg3
.endm
#endif

View File

@ -212,7 +212,7 @@
ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0:
ext.16b v1, v1, v0, #12
ext v1.16b, v1.16b, v0.16b, #12
add $3.16b, v0.16b, v1.16b
// Arrange the input data --- LEFT
@ -233,7 +233,7 @@ bs_nzc_check_jump1:
zip1 v0.16b, v0.16b, v2.16b
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
ext.16b v1, v1, v0, #12
ext v1.16b, v1.16b, v0.16b, #12
add $4.16b, v0.16b, v1.16b
.endm
@ -470,7 +470,7 @@ bs_mv_check_jump1:
ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0:
ext.16b v1, v1, v0, #12
ext v1.16b, v1.16b, v0.16b, #12
add \arg3\().16b, v0.16b, v1.16b
// Arrange the input data --- LEFT
@ -491,7 +491,7 @@ bs_nzc_check_jump1:
zip1 v0.16b, v0.16b, v2.16b
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
ext.16b v1, v1, v0, #12
ext v1.16b, v1.16b, v0.16b, #12
add \arg4\().16b, v0.16b, v1.16b
.endm

View File

@ -197,8 +197,8 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext.16b $3, $1, $1, #14 // X[0][1][2][3][4][5]O
ext.16b $4, $3, $3, #8 // [3][4][5]OX[0][1][2]
ext $3.16b, $1.16b, $1.16b, #14 // X[0][1][2][3][4][5]O
ext $4.16b, $3.16b, $3.16b, #8 // [3][4][5]OX[0][1][2]
rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
@ -1713,12 +1713,12 @@ WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
ld1 {v0.16b}, [x0], x1 // src[x]
ext.16b v1, v0, v0, #1 // src[x+1]
ext v1.16b, v0.16b, v0.16b, #1 // src[x+1]
w8_mc_chroma_loop:
ld1 {v2.16b}, [x0], x1 // src[x+stride]
ext.16b v3, v2, v2, #1 // src[x+stride+1]
ext v3.16b, v2.16b, v2.16b, #1 // src[x+stride+1]
ld1 {v18.16b}, [x0], x1 // src[x+2*stride]
ext.16b v19, v18, v18, #1 // src[x+2*stride+1]
ext v19.16b, v18.16b, v18.16b, #1 // src[x+2*stride+1]
umull v16.8h, v0.8b, v4.8b
umlal v16.8h, v1.8b, v5.8b
@ -1744,12 +1744,12 @@ WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
ld1 {v0.8b}, [x0], x1 // src[x]
ext.8b v1, v0, v0, #1 // src[x+1]
ext v1.8b, v0.8b, v0.8b, #1 // src[x+1]
w4_mc_chroma_loop:
ld1 {v2.8b}, [x0], x1 // src[x+stride]
ext.8b v3, v2, v2, #1 // src[x+stride+1]
ext v3.8b, v2.8b, v2.8b, #1 // src[x+stride+1]
ld1 {v18.8b}, [x0], x1 // src[x+2*stride]
ext.8b v19, v18, v18, #1 // src[x+2*stride+1]
ext v19.8b, v18.8b, v18.8b, #1 // src[x+2*stride+1]
zip1 v0.4s, v0.4s, v2.4s
zip1 v1.4s, v1.4s, v3.4s
@ -1792,7 +1792,7 @@ w17_h_mc_luma_loop:
FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
st1 {v20.16b}, [x2], x5 //write 16Byte
ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
st1 {v21.b}[0], [x2], x3 //write 16th Byte
@ -1820,7 +1820,7 @@ w9_h_mc_luma_loop:
FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
st1 {v20.8b}, [x2], x5 //write 8Byte
ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
st1 {v21.b}[0], [x2], x3 //write 9th Byte