Merge pull request #1134 from mstorsjo/aarch64-ext

Use the correct syntax for the aarch64 ext instructions
This commit is contained in:
zhilwang 2014-07-10 14:17:17 +08:00
commit d4baad0dcb
3 changed files with 14 additions and 21 deletions

View File

@ -67,11 +67,4 @@ ret
mov \arg0\().8b, \arg1\().8b mov \arg0\().8b, \arg1\().8b
.endm .endm
// ext.16b: convenience wrapper that emits a single AArch64 `ext` instruction.
//   arg0, arg1, arg2 - bare vector register names (e.g. v1); the macro appends
//                      the .16b arrangement to each of them.
//   arg3             - passed through unchanged as the final `ext` operand
//                      (call sites pass an immediate such as #12).
// Example: `ext.16b v1, v1, v0, #12` expands to
//          `ext v1.16b, v1.16b, v0.16b, #12`.
.macro ext.16b arg0, arg1, arg2, arg3
ext \arg0\().16b, \arg1\().16b, \arg2\().16b, \arg3
.endm
// ext.8b: convenience wrapper that emits a single AArch64 `ext` instruction
// on the 8-byte (.8b) arrangement.
//   arg0, arg1, arg2 - bare vector register names (e.g. v21); the macro appends
//                      the .8b arrangement to each of them.
//   arg3             - passed through unchanged as the final `ext` operand
//                      (call sites pass an immediate such as #1 or #7).
// Example: `ext.8b v1, v0, v0, #1` expands to
//          `ext v1.8b, v0.8b, v0.8b, #1`.
.macro ext.8b arg0, arg1, arg2, arg3
ext \arg0\().8b, \arg1\().8b, \arg2\().8b, \arg3
.endm
#endif #endif

View File

@ -212,7 +212,7 @@
ld1 {v1.s} [3], [x6] ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0: bs_nzc_check_jump0:
ext.16b v1, v1, v0, #12 ext v1.16b, v1.16b, v0.16b, #12
add $3.16b, v0.16b, v1.16b add $3.16b, v0.16b, v1.16b
// Arrange the input data --- LEFT // Arrange the input data --- LEFT
@ -233,7 +233,7 @@ bs_nzc_check_jump1:
zip1 v0.16b, v0.16b, v2.16b zip1 v0.16b, v0.16b, v2.16b
ins v2.d[0], v0.d[1] ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b zip1 v0.16b, v0.16b, v2.16b
ext.16b v1, v1, v0, #12 ext v1.16b, v1.16b, v0.16b, #12
add $4.16b, v0.16b, v1.16b add $4.16b, v0.16b, v1.16b
.endm .endm
@ -470,7 +470,7 @@ bs_mv_check_jump1:
ld1 {v1.s} [3], [x6] ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0: bs_nzc_check_jump0:
ext.16b v1, v1, v0, #12 ext v1.16b, v1.16b, v0.16b, #12
add \arg3\().16b, v0.16b, v1.16b add \arg3\().16b, v0.16b, v1.16b
// Arrange the input data --- LEFT // Arrange the input data --- LEFT
@ -491,7 +491,7 @@ bs_nzc_check_jump1:
zip1 v0.16b, v0.16b, v2.16b zip1 v0.16b, v0.16b, v2.16b
ins v2.d[0], v0.d[1] ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b zip1 v0.16b, v0.16b, v2.16b
ext.16b v1, v1, v0, #12 ext v1.16b, v1.16b, v0.16b, #12
add \arg4\().16b, v0.16b, v1.16b add \arg4\().16b, v0.16b, v1.16b
.endm .endm

View File

@ -197,8 +197,8 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23 .macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst) // { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext.16b $3, $1, $1, #14 // X[0][1][2][3][4][5]O ext $3.16b, $1.16b, $1.16b, #14 // X[0][1][2][3][4][5]O
ext.16b $4, $3, $3, #8 // [3][4][5]OX[0][1][2] ext $4.16b, $3.16b, $3.16b, #8 // [3][4][5]OX[0][1][2]
rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]* add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32] smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
@ -1713,12 +1713,12 @@ WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
ld1 {v0.16b}, [x0], x1 // src[x] ld1 {v0.16b}, [x0], x1 // src[x]
ext.16b v1, v0, v0, #1 // src[x+1] ext v1.16b, v0.16b, v0.16b, #1 // src[x+1]
w8_mc_chroma_loop: w8_mc_chroma_loop:
ld1 {v2.16b}, [x0], x1 // src[x+stride] ld1 {v2.16b}, [x0], x1 // src[x+stride]
ext.16b v3, v2, v2, #1 // src[x+stride+1] ext v3.16b, v2.16b, v2.16b, #1 // src[x+stride+1]
ld1 {v18.16b}, [x0], x1 // src[x+2*stride] ld1 {v18.16b}, [x0], x1 // src[x+2*stride]
ext.16b v19, v18, v18, #1 // src[x+2*stride+1] ext v19.16b, v18.16b, v18.16b, #1 // src[x+2*stride+1]
umull v16.8h, v0.8b, v4.8b umull v16.8h, v0.8b, v4.8b
umlal v16.8h, v1.8b, v5.8b umlal v16.8h, v1.8b, v5.8b
@ -1744,12 +1744,12 @@ WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
ld1 {v0.8b}, [x0], x1 // src[x] ld1 {v0.8b}, [x0], x1 // src[x]
ext.8b v1, v0, v0, #1 // src[x+1] ext v1.8b, v0.8b, v0.8b, #1 // src[x+1]
w4_mc_chroma_loop: w4_mc_chroma_loop:
ld1 {v2.8b}, [x0], x1 // src[x+stride] ld1 {v2.8b}, [x0], x1 // src[x+stride]
ext.8b v3, v2, v2, #1 // src[x+stride+1] ext v3.8b, v2.8b, v2.8b, #1 // src[x+stride+1]
ld1 {v18.8b}, [x0], x1 // src[x+2*stride] ld1 {v18.8b}, [x0], x1 // src[x+2*stride]
ext.8b v19, v18, v18, #1 // src[x+2*stride+1] ext v19.8b, v18.8b, v18.8b, #1 // src[x+2*stride+1]
zip1 v0.4s, v0.4s, v2.4s zip1 v0.4s, v0.4s, v2.4s
zip1 v1.4s, v1.4s, v3.4s zip1 v1.4s, v1.4s, v3.4s
@ -1792,7 +1792,7 @@ w17_h_mc_luma_loop:
FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1 FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
st1 {v20.16b}, [x2], x5 //write 16Byte st1 {v20.16b}, [x2], x5 //write 16Byte
ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21 FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
st1 {v21.b}[0], [x2], x3 //write 16th Byte st1 {v21.b}[0], [x2], x3 //write 16th Byte
@ -1820,7 +1820,7 @@ w9_h_mc_luma_loop:
FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1 FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
st1 {v20.8b}, [x2], x5 //write 8Byte st1 {v20.8b}, [x2], x5 //write 8Byte
ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21 FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
st1 {v21.b}[0], [x2], x3 //write 9th Byte st1 {v21.b}[0], [x2], x3 //write 9th Byte