Merge pull request #738 from mstorsjo/gnu-aarch64

Fix building the AArch64 assembly with GNU binutils
This commit is contained in:
volvet 2014-04-25 09:07:43 +08:00
commit c65e286036
2 changed files with 114 additions and 99 deletions

View File

@ -39,11 +39,10 @@ _$0:
.endm
// WELS_ASM_ARCH64_FUNC_END: function epilogue macro; expands to the return instruction.
// NOTE(review): both `ret lr` and plain `ret` are listed below. This looks like the
// old and new form of the same line shown together - confirm only one is intended.
.macro WELS_ASM_ARCH64_FUNC_END
ret lr
ret
.endm
#else
// Alternate preprocessor branch: section setup before the macro definitions.
// NOTE(review): the matching #if/#ifdef is outside this excerpt - confirm which toolchain this branch targets.
.syntax unified
.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
.text
@ -56,7 +55,23 @@ ret lr
.endm
// WELS_ASM_ARCH64_FUNC_END: function epilogue macro for this branch; returns and
// then closes the function with .endfunc.
// NOTE(review): both `ret lr` and plain `ret` are listed below. This looks like the
// old and new form of the same line shown together - confirm only one is intended.
.macro WELS_ASM_ARCH64_FUNC_END
ret lr
ret
.endfunc
.endm
// mov.16b dst, src: expands to `mov dst.16b, src.16b`.
// The \() after each argument ends the argument name so the ".16b" suffix can follow it.
.macro mov.16b arg0, arg1
mov \arg0\().16b, \arg1\().16b
.endm
// mov.8b dst, src: expands to `mov dst.8b, src.8b`.
// The \() after each argument ends the argument name so the ".8b" suffix can follow it.
.macro mov.8b arg0, arg1
mov \arg0\().8b, \arg1\().8b
.endm
// ext.16b dst, src1, src2, imm: expands to `ext dst.16b, src1.16b, src2.16b, imm`.
// arg3 is passed through unchanged as the last operand (callers supply the '#').
.macro ext.16b arg0, arg1, arg2, arg3
ext \arg0\().16b, \arg1\().16b, \arg2\().16b, \arg3
.endm
// ext.8b dst, src1, src2, imm: expands to `ext dst.8b, src1.8b, src2.8b, imm`.
// arg3 is passed through unchanged as the last operand (callers supply the '#').
.macro ext.8b arg0, arg1, arg2, arg3
ext \arg0\().8b, \arg1\().8b, \arg2\().8b, \arg3
.endm
#endif

View File

@ -213,174 +213,174 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
#else
// FILTER_6TAG_8BITS1: 6-tap filter over the .8b (low-half) lanes of the sources.
//   arg0..arg5 : source vectors, in the order src[-2], src[-1], src[0], src[1], src[2], src[3]
//   arg6       : destination, written as .8b
//   arg7, arg8 : multiplier vectors used by mla / mls
// Computes v18 = (arg0+arg5) + (arg2+arg3)*arg7 - (arg1+arg4)*arg8 in 16-bit lanes,
// then narrows into arg6 with sqrshrun #5. Clobbers v18 and v19.
// NOTE(review): each instruction is listed twice (\argN.8b form, then \argN\().8b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3]
uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun \arg6.8b, v18.8h, #5
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun \arg6\().8b, v18.8h, #5
// }
.endm
// FILTER_6TAG_8BITS2: same computation as the "1" variant but using the "2" instruction
// forms (uaddl2 / sqrshrun2) on .16b operands.
//   arg0..arg5 : source vectors, in the order src[-2], src[-1], src[0], src[1], src[2], src[3]
//   arg6       : destination, written via sqrshrun2 as .16b
//   arg7, arg8 : multiplier vectors used by mla / mls
// Computes v18 = (arg0+arg5) + (arg2+arg3)*arg7 - (arg1+arg4)*arg8 in 16-bit lanes,
// then narrows into arg6 with sqrshrun2 #5. Clobbers v18 and v19.
// NOTE(review): each instruction is listed twice (\argN.16b form, then \argN\().16b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 \arg6.16b, v18.8h, #5
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 \arg6\().16b, v18.8h, #5
// }
.endm
// FILTER_6TAG_8BITS1_AVERAGE_WITH_0: 6-tap filter on .8b lanes, then averages the
// filtered result with arg2 (the src[0] operand).
//   arg0..arg5 : source vectors, in the order src[-2], src[-1], src[0], src[1], src[2], src[3]
//   arg6       : destination, written as .8b
//   arg7, arg8 : multiplier vectors used by mla / mls
// After sqrshrun #5 into arg6, the last two instructions widen-add arg2 and arg6 into v19
// and narrow back with rshrn #1. Clobbers v18 and v19.
// NOTE(review): each instruction is listed twice (\argN.8b form, then \argN\().8b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3]
uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun \arg6.8b, v18.8h, #5
uaddl v19.8h, \arg2.8b, \arg6.8b
rshrn \arg6.8b, v19.8h, #1
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun \arg6\().8b, v18.8h, #5
uaddl v19.8h, \arg2\().8b, \arg6\().8b
rshrn \arg6\().8b, v19.8h, #1
// }
.endm
// FILTER_6TAG_8BITS2_AVERAGE_WITH_0: "2"-form (uaddl2 / sqrshrun2 / rshrn2, .16b operands)
// 6-tap filter, then averages the filtered result with arg2 (the src[0] operand).
//   arg0..arg5 : source vectors, in the order src[-2], src[-1], src[0], src[1], src[2], src[3]
//   arg6       : destination, written as .16b via the "2" narrowing forms
//   arg7, arg8 : multiplier vectors used by mla / mls
// After sqrshrun2 #5 into arg6, the last two instructions widen-add arg2 and arg6 into v19
// and narrow back with rshrn2 #1. Clobbers v18 and v19.
// NOTE(review): each instruction is listed twice (\argN.16b form, then \argN\().16b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 \arg6.16b, v18.8h, #5
uaddl2 v19.8h, \arg2.16b, \arg6.16b
rshrn2 \arg6.16b, v19.8h, #1
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 \arg6\().16b, v18.8h, #5
uaddl2 v19.8h, \arg2\().16b, \arg6\().16b
rshrn2 \arg6\().16b, v19.8h, #1
// }
.endm
// FILTER_6TAG_8BITS1_AVERAGE_WITH_1: 6-tap filter on .8b lanes, then averages the
// filtered result with arg3 (the src[1] operand).
//   arg0..arg5 : source vectors, in the order src[-2], src[-1], src[0], src[1], src[2], src[3]
//   arg6       : destination, written as .8b
//   arg7, arg8 : multiplier vectors used by mla / mls
// After sqrshrun #5 into arg6, the last two instructions widen-add arg3 and arg6 into v19
// and narrow back with rshrn #1. Clobbers v18 and v19.
// NOTE(review): each instruction is listed twice (\argN.8b form, then \argN\().8b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3]
uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun \arg6.8b, v18.8h, #5
uaddl v19.8h, \arg3.8b, \arg6.8b
rshrn \arg6.8b, v19.8h, #1
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun \arg6\().8b, v18.8h, #5
uaddl v19.8h, \arg3\().8b, \arg6\().8b
rshrn \arg6\().8b, v19.8h, #1
// }
.endm
// FILTER_6TAG_8BITS2_AVERAGE_WITH_1: "2"-form (uaddl2 / sqrshrun2 / rshrn2, .16b operands)
// 6-tap filter, then averages the filtered result with arg3 (the src[1] operand).
//   arg0..arg5 : source vectors, in the order src[-2], src[-1], src[0], src[1], src[2], src[3]
//   arg6       : destination, written as .16b via the "2" narrowing forms
//   arg7, arg8 : multiplier vectors used by mla / mls
// After sqrshrun2 #5 into arg6, the last two instructions widen-add arg3 and arg6 into v19
// and narrow back with rshrn2 #1. Clobbers v18 and v19.
// NOTE(review): each instruction is listed twice (\argN.16b form, then \argN\().16b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3]
uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 \arg6.16b, v18.8h, #5
uaddl2 v19.8h, \arg3.16b, \arg6.16b
rshrn2 \arg6.16b, v19.8h, #1
uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
sqrshrun2 \arg6\().16b, v18.8h, #5
uaddl2 v19.8h, \arg3\().16b, \arg6\().16b
rshrn2 \arg6\().16b, v19.8h, #1
// }
.endm
// FILTER_6TAG_8BITS_TO_16BITS1: 6-tap filter on .8b lanes that leaves the result
// un-narrowed in the 16-bit lanes of arg6 (no final shift in this macro).
//   arg0..arg5 : source vectors, in the order src[-2], src[-1], src[0], src[1], src[2], src[3]
//   arg6       : destination, written as .8h
//   arg7, arg8 : multiplier vectors used by mla / mls
// Computes arg6 = (arg0+arg5) + (arg2+arg3)*arg7 - (arg1+arg4)*arg8. Clobbers v31.
// NOTE(review): each instruction is listed twice (\argN.8b form, then \argN\().8b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
uaddl \arg6.8h, \arg0.8b, \arg5.8b //dst_q=src[-2]+src[3]
uaddl v31.8h, \arg2.8b, \arg3.8b //src[0]+src[1]
mla \arg6.8h, v31.8h, \arg7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
uaddl v31.8h, \arg1.8b, \arg4.8b //src[-1]+src[2]
mls \arg6.8h, v31.8h, \arg8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3]
uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm
// FILTER_6TAG_8BITS_TO_16BITS2: "2"-form (uaddl2, .16b operands) 6-tap filter that leaves
// the result un-narrowed in the 16-bit lanes of arg6 (no final shift in this macro).
//   arg0..arg5 : source vectors, in the order src[-2], src[-1], src[0], src[1], src[2], src[3]
//   arg6       : destination, written as .8h
//   arg7, arg8 : multiplier vectors used by mla / mls
// Computes arg6 = (arg0+arg5) + (arg2+arg3)*arg7 - (arg1+arg4)*arg8. Clobbers v31.
// NOTE(review): each instruction is listed twice (\argN.16b form, then \argN\().16b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
uaddl2 \arg6.8h, \arg0.16b, \arg5.16b //dst_q=src[-2]+src[3]
uaddl2 v31.8h, \arg2.16b, \arg3.16b //src[0]+src[1]
mla \arg6.8h, v31.8h, \arg7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
uaddl2 v31.8h, \arg1.16b, \arg4.16b //src[-1]+src[2]
mls \arg6.8h, v31.8h, \arg8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3]
uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm
// FILTER_3_IN_16BITS_TO_8BITS1: combines three 16-bit-lane inputs and narrows to 8 bits.
//   arg0 : a (also used as the accumulator - overwritten)
//   arg1 : b
//   arg2 : c
//   arg3 : destination, written as .8b
// The sub/sshr/add chain builds ((a-b)/4 - b + c)/4 + c in arg0 using arithmetic shifts,
// then sqrshrun #6 narrows it into arg3.
// NOTE(review): each instruction is listed twice (\argN.8h form, then \argN\().8h form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
// { // input:a, b, c, dst_d;
sub \arg0.8h, \arg0.8h, \arg1.8h //a-b
sshr \arg0.8h, \arg0.8h, #2 //(a-b)/4
sub \arg0.8h, \arg0.8h, \arg1.8h //(a-b)/4-b
add \arg0.8h, \arg0.8h, \arg2.8h //(a-b)/4-b+c
sshr \arg0.8h, \arg0.8h, #2 //((a-b)/4-b+c)/4
add \arg0.8h, \arg0.8h, \arg2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
sqrshrun \arg3.8b, \arg0.8h, #6 //(+32)>>6
sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6
// }
.endm
// FILTER_3_IN_16BITS_TO_8BITS2: same arithmetic as the "1" variant, but the final narrowing
// uses sqrshrun2 and writes arg3 as .16b.
//   arg0 : a (also used as the accumulator - overwritten)
//   arg1 : b
//   arg2 : c
//   arg3 : destination, written via sqrshrun2
// The sub/sshr/add chain builds ((a-b)/4 - b + c)/4 + c in arg0 using arithmetic shifts,
// then sqrshrun2 #6 narrows it into arg3.
// NOTE(review): each instruction is listed twice (\argN.8h form, then \argN\().8h form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
// { // input:a, b, c, dst_d;
sub \arg0.8h, \arg0.8h, \arg1.8h //a-b
sshr \arg0.8h, \arg0.8h, #2 //(a-b)/4
sub \arg0.8h, \arg0.8h, \arg1.8h //(a-b)/4-b
add \arg0.8h, \arg0.8h, \arg2.8h //(a-b)/4-b+c
sshr \arg0.8h, \arg0.8h, #2 //((a-b)/4-b+c)/4
add \arg0.8h, \arg0.8h, \arg2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
sqrshrun2 \arg3.16b, \arg0.8h, #6 //(+32)>>6
sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6
// }
.endm
// UNPACK_2_16BITS_TO_ABC: builds the three pairwise sums a, b, c from two adjacent
// vectors of 16-bit samples.
//   arg0, arg1 : inputs (arg0 holds src[-2] onward; arg1 supplies the following samples)
//   arg2       : a = src[-2] + src[3]   (also used as scratch for src[2])
//   arg3       : b = src[-1] + src[2]   (also used as scratch for src[1])
//   arg4       : c = src[0]  + src[1]
// ext with byte offsets #2..#10 extracts the shifted windows from arg0:arg1.
// NOTE(review): each group of instructions is listed twice (\argN.16b form, then
// \argN\().16b form); this looks like the old and new sides of the change - confirm.
.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
ext \arg4.16b, \arg0.16b, \arg1.16b, #4 //src[0]
ext \arg3.16b, \arg0.16b, \arg1.16b, #6 //src[1]
add \arg4.8h, \arg4.8h, \arg3.8h //c=src[0]+src[1]
ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]
ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]
add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1]
ext \arg3.16b, \arg0.16b, \arg1.16b, #2 //src[-1]
ext \arg2.16b, \arg0.16b, \arg1.16b, #8 //src[2]
add \arg3.8h, \arg3.8h, \arg2.8h //b=src[-1]+src[2]
ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1]
ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2]
add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2]
ext \arg2.16b, \arg0.16b, \arg1.16b, #10 //src[3]
add \arg2.8h, \arg2.8h, \arg0.8h //a=src[-2]+src[3]
ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3]
add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3]
// }
.endm
// AVERAGE_TWO_8BITS1: rounded average of two 8-bit vectors on .8b lanes.
//   arg0 : destination (.8b)
//   arg1, arg2 : sources (.8b)
// Widen-adds the sources into v30, then narrows with rshrn #1. Clobbers v30.
// NOTE(review): each instruction is listed twice (\argN.8b form, then \argN\().8b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
// { // input:dst_d, src_d A and B; working: v30
uaddl v30.8h, \arg2.8b, \arg1.8b
rshrn \arg0.8b, v30.8h, #1
uaddl v30.8h, \arg2\().8b, \arg1\().8b
rshrn \arg0\().8b, v30.8h, #1
// }
.endm
// AVERAGE_TWO_8BITS2: rounded average of two 8-bit vectors using the "2" forms
// (uaddl2 / rshrn2) on .16b operands.
//   arg0 : destination (written via rshrn2)
//   arg1, arg2 : sources
// Widen-adds the sources into v30, then narrows with rshrn2 #1. Clobbers v30.
// NOTE(review): each instruction is listed twice (\argN.16b form, then \argN\().16b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
// { // input:dst_d, src_d A and B; working: v30
uaddl2 v30.8h, \arg2.16b, \arg1.16b
rshrn2 \arg0.16b, v30.8h, #1
uaddl2 v30.8h, \arg2\().16b, \arg1\().16b
rshrn2 \arg0\().16b, v30.8h, #1
// }
.endm
// FILTER_SINGLE_TAG_8BITS: filters a single output sample from one 8-byte source vector.
//   arg0 : source vector; also the destination of the final sqrshrun
//   arg1 : multiplier vector (.4h) applied with mul
//   arg2 : working register (overwritten)
//   arg3 : destination operand of addv (passed through verbatim, no suffix appended)
// Steps: reverse the bytes of arg0 into arg2, widen-add arg0 with the reversed copy,
// multiply the low four 16-bit lanes by arg1, sum them with addv, then sqrshrun #5.
// NOTE(review): the final sqrshrun reads \arg0, not \arg2 - presumably arg3 names a
// scalar view of arg0 so the addv result feeds it; verify against the callers.
// NOTE(review): each instruction is listed twice (\argN.8b form, then \argN\().8b form);
// this looks like the old and new sides of the change shown together - confirm.
.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
// when width=17/9, used
// { // input: src_d{Y[0][1][2][3][4][5]X},
rev64 \arg2.8b, \arg0.8b // X[5][4][3][2][1][0]O
uaddl \arg2.8h, \arg0.8b, \arg2.8b // each 16bits, *[50][41][32][23][14][05]*
mul \arg2.4h, \arg2.4h, \arg1.4h // 0+1*[50]-5*[41]+20[32]
addv \arg3, \arg2.4h
sqrshrun \arg0.8b, \arg0.8h, #5
rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]*
mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32]
addv \arg3, \arg2\().4h
sqrshrun \arg0\().8b, \arg0\().8h, #5
// }
.endm
// UNPACK_FILTER_SINGLE_TAG_16BITS: filters a single output sample from a vector of
// 16-bit intermediates and narrows it down to 8 bits.
//   arg0 : destination vector (narrowed in place: .2d -> .2s -> .4h -> .8b)
//   arg1 : source vector of 16-bit samples
//   arg2 : multiplier vector (.4h) used by smull
//   arg3, arg4 : working registers (overwritten)
//   arg5 : destination operand of saddlv (passed through verbatim, no suffix appended)
// Steps: rotate arg1 by 14 bytes into arg3, build a lane-reversed copy in arg4, add the
// two, widening-multiply the low four lanes by arg2, sum with saddlv, then
// sqrshrun #10 followed by two uqxtn narrowing steps.
// NOTE(review): the sqrshrun reads \arg0, not \arg3 - presumably arg5 names a scalar
// view of arg0 so the saddlv result feeds it; verify against the callers.
// NOTE(review): the .macro header and each instruction are listed twice (a parameterless
// header with ext.16b / \argN.8h forms, then the parameterised header with \argN\() forms);
// this looks like the old and new sides of the change shown together - confirm.
.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
ext.16b \arg3, \arg1, \arg1, #14 // X[0][1][2][3][4][5]O
ext.16b \arg4, \arg3, \arg3, #8 // [3][4][5]OX[0][1][2]
rev64 \arg4.8h, \arg4.8h // X[5][4][3][2][1][0]O
add \arg3.8h, \arg3.8h, \arg4.8h // each 16bits, *[50][41][32][23][14][05]*
smull \arg3.4s, \arg3.4h, \arg2.4h // 0+1*[50]-5*[41]+20[32]
saddlv \arg5, \arg3.4s
//sshr \arg0.2d, \arg0.2d, #4
sqrshrun \arg0.2s, \arg0.2d, #10
uqxtn \arg0.4h, \arg0.4s
uqxtn \arg0.8b, \arg0.8h
ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14 // X[0][1][2][3][4][5]O
ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8 // [3][4][5]OX[0][1][2]
rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O
add \arg3\().8h, \arg3\().8h, \arg4\().8h // each 16bits, *[50][41][32][23][14][05]*
smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32]
saddlv \arg5, \arg3\().4s
//sshr \arg0\().2d, \arg0\().2d, #4
sqrshrun \arg0\().2s, \arg0\().2d, #10
uqxtn \arg0\().4h, \arg0\().4s
uqxtn \arg0\().8b, \arg0\().8h
// }
.endm
#endif