Use the correct syntax for the aarch64 ext instructions

Since Xcode 5.1, the Apple tools actually support using the official, correct syntax for the ext instructions. This syntax is already used in a number of places - use it consistently, and get rid of the compatibility hacks.
2014-04-23 11:25:40 +03:00 · 2014-04-23 11:25:40 +03:00 · d5c71dbe2f
commit d5c71dbe2f
parent 019fb9e20e
3 changed files with 14 additions and 21 deletions
--- a/codec/common/arm64/arm_arch64_common_macro.S
+++ b/codec/common/arm64/arm_arch64_common_macro.S
@@ -67,11 +67,4 @@ ret
    mov \arg0\().8b, \arg1\().8b
 .endm

-.macro ext.16b arg0, arg1, arg2, arg3
-    ext \arg0\().16b, \arg1\().16b, \arg2\().16b, \arg3
-.endm
-
-.macro ext.8b arg0, arg1, arg2, arg3
-    ext \arg0\().8b, \arg1\().8b, \arg2\().8b, \arg3
-.endm
 #endif
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -212,7 +212,7 @@
    ld1      {v1.s} [3], [x6]

    bs_nzc_check_jump0:
-    ext.16b  v1, v1, v0, #12
+    ext      v1.16b, v1.16b, v0.16b, #12
    add      $3.16b, v0.16b, v1.16b

    // Arrange the input data --- LEFT
@@ -233,7 +233,7 @@ bs_nzc_check_jump1:
    zip1     v0.16b, v0.16b, v2.16b
    ins      v2.d[0], v0.d[1]
    zip1     v0.16b, v0.16b, v2.16b
-    ext.16b  v1, v1, v0, #12
+    ext      v1.16b, v1.16b, v0.16b, #12
    add      $4.16b, v0.16b, v1.16b
 .endm

@@ -470,7 +470,7 @@ bs_mv_check_jump1:
    ld1      {v1.s} [3], [x6]

 bs_nzc_check_jump0:
-    ext.16b  v1, v1, v0, #12
+    ext      v1.16b, v1.16b, v0.16b, #12
    add      \arg3\().16b, v0.16b, v1.16b

    // Arrange the input data --- LEFT
@@ -491,7 +491,7 @@ bs_nzc_check_jump1:
    zip1     v0.16b, v0.16b, v2.16b
    ins      v2.d[0], v0.d[1]
    zip1     v0.16b, v0.16b, v2.16b
-    ext.16b  v1, v1, v0, #12
+    ext      v1.16b, v1.16b, v0.16b, #12
    add      \arg4\().16b, v0.16b, v1.16b
 .endm

--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -197,8 +197,8 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0

 .macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
 //  {   // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
-    ext.16b $3, $1, $1, #14       // X[0][1][2][3][4][5]O
-    ext.16b $4, $3, $3, #8      // [3][4][5]OX[0][1][2]
+    ext $3.16b, $1.16b, $1.16b, #14       // X[0][1][2][3][4][5]O
+    ext $4.16b, $3.16b, $3.16b, #8      // [3][4][5]OX[0][1][2]
    rev64  $4.8h, $4.8h         // X[5][4][3][2][1][0]O
    add   $3.8h, $3.8h, $4.8h    // each 16bits, *[50][41][32][23][14][05]*
    smull $3.4s, $3.4h, $2.4h           // 0+1*[50]-5*[41]+20[32]
@@ -1713,12 +1713,12 @@ WELS_ASM_AARCH64_FUNC_END
 WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
    ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
    ld1 {v0.16b}, [x0], x1  // src[x]
-    ext.16b v1, v0, v0, #1  // src[x+1]
+    ext v1.16b, v0.16b, v0.16b, #1  // src[x+1]
 w8_mc_chroma_loop:
    ld1 {v2.16b}, [x0], x1  // src[x+stride]
-    ext.16b v3, v2, v2, #1  // src[x+stride+1]
+    ext v3.16b, v2.16b, v2.16b, #1  // src[x+stride+1]
    ld1 {v18.16b}, [x0], x1  // src[x+2*stride]
-    ext.16b v19, v18, v18, #1  // src[x+2*stride+1]
+    ext v19.16b, v18.16b, v18.16b, #1  // src[x+2*stride+1]

    umull v16.8h, v0.8b, v4.8b
    umlal v16.8h, v1.8b, v5.8b
@@ -1744,12 +1744,12 @@ WELS_ASM_AARCH64_FUNC_END
 WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
    ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
    ld1 {v0.8b}, [x0], x1  // src[x]
-    ext.8b v1, v0, v0, #1  // src[x+1]
+    ext v1.8b, v0.8b, v0.8b, #1  // src[x+1]
 w4_mc_chroma_loop:
    ld1 {v2.8b}, [x0], x1  // src[x+stride]
-    ext.8b v3, v2, v2, #1  // src[x+stride+1]
+    ext v3.8b, v2.8b, v2.8b, #1  // src[x+stride+1]
    ld1 {v18.8b}, [x0], x1  // src[x+2*stride]
-    ext.8b v19, v18, v18, #1  // src[x+2*stride+1]
+    ext v19.8b, v18.8b, v18.8b, #1  // src[x+2*stride+1]

    zip1 v0.4s, v0.4s, v2.4s
    zip1 v1.4s, v1.4s, v3.4s
@@ -1792,7 +1792,7 @@ w17_h_mc_luma_loop:
    FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
    st1 {v20.16b}, [x2], x5 //write 16Byte

-    ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
+    ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
    st1 {v21.b}[0], [x2], x3 //write 16th Byte

@@ -1820,7 +1820,7 @@ w9_h_mc_luma_loop:
    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
    st1 {v20.8b}, [x2], x5 //write 8Byte

-    ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
+    ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
    st1 {v21.b}[0], [x2], x3 //write 9th Byte