From 90fad9fd98df3afee9f4643fe17d0f3c75e94e7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 23 Apr 2014 11:10:25 +0300 Subject: [PATCH] Add \() to macro arguments to separate the argument from the following .8h or similar --- codec/common/arm64/mc_aarch64_neon.S | 186 +++++++++++++-------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/codec/common/arm64/mc_aarch64_neon.S b/codec/common/arm64/mc_aarch64_neon.S index 77642e40..2e005761 100644 --- a/codec/common/arm64/mc_aarch64_neon.S +++ b/codec/common/arm64/mc_aarch64_neon.S @@ -213,159 +213,159 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0 #else .macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 - uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3] - uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1] - mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles - uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2] - mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles - sqrshrun \arg6.8b, v18.8h, #5 + uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3] + uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] + mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles + uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] + mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles + sqrshrun \arg6\().8b, v18.8h, #5 // } .endm .macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 - uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3] - uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1] - mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles - uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2] - mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles - sqrshrun2 \arg6.16b, v18.8h, #5 + uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3] + uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] + mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles + uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] + mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles + sqrshrun2 \arg6\().16b, v18.8h, #5 // } .endm .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 - uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3] - uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1] - mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles - uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2] - mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles - sqrshrun \arg6.8b, v18.8h, #5 - uaddl v19.8h, \arg2.8b, \arg6.8b - rshrn \arg6.8b, v19.8h, #1 + uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3] + uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] + mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles + uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] + mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles + sqrshrun \arg6\().8b, v18.8h, #5 + uaddl v19.8h, \arg2\().8b, \arg6\().8b + rshrn \arg6\().8b, v19.8h, #1 // } .endm .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 - uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3] - uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1] - mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles - uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2] - mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles - sqrshrun2 \arg6.16b, v18.8h, #5 - uaddl2 v19.8h, \arg2.16b, \arg6.16b - rshrn2 \arg6.16b, v19.8h, #1 + uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3] + uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] + mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles + uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] + mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles + sqrshrun2 \arg6\().16b, v18.8h, #5 + uaddl2 v19.8h, \arg2\().16b, \arg6\().16b + rshrn2 \arg6\().16b, v19.8h, #1 // } .endm .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 - uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3] - uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1] - mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles - uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2] - mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles - sqrshrun \arg6.8b, v18.8h, #5 - uaddl v19.8h, \arg3.8b, \arg6.8b - rshrn \arg6.8b, v19.8h, #1 + uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3] + uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] + mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles + uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] + mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles + sqrshrun \arg6\().8b, v18.8h, #5 + uaddl v19.8h, \arg3\().8b, \arg6\().8b + rshrn \arg6\().8b, v19.8h, #1 // } .endm .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19 - uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3] - uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1] - mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles - uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2] - mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles - sqrshrun2 \arg6.16b, v18.8h, #5 - uaddl2 v19.8h, \arg3.16b, \arg6.16b - rshrn2 \arg6.16b, v19.8h, #1 + uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3] + uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] + mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles + uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] + mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles + sqrshrun2 \arg6\().16b, v18.8h, #5 + uaddl2 v19.8h, \arg3\().16b, \arg6\().16b + rshrn2 \arg6\().16b, v19.8h, #1 // } .endm .macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 - uaddl \arg6.8h, \arg0.8b, \arg5.8b //dst_q=src[-2]+src[3] - uaddl v31.8h, \arg2.8b, \arg3.8b //src[0]+src[1] - mla \arg6.8h, v31.8h, \arg7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles - uaddl v31.8h, \arg1.8b, \arg4.8b //src[-1]+src[2] - mls \arg6.8h, v31.8h, \arg8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles + uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3] + uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1] + mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles + uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2] + mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles // } .endm .macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8 // { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31 - uaddl2 \arg6.8h, \arg0.16b, \arg5.16b //dst_q=src[-2]+src[3] - uaddl2 v31.8h, \arg2.16b, \arg3.16b //src[0]+src[1] - mla \arg6.8h, v31.8h, \arg7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles - uaddl2 v31.8h, \arg1.16b, \arg4.16b //src[-1]+src[2] - mls \arg6.8h, v31.8h, \arg8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles + uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3] + uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1] + mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles + uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2] + mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles // } .endm .macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3 // { // input:a, b, c, dst_d; - sub \arg0.8h, \arg0.8h, \arg1.8h //a-b - sshr \arg0.8h, \arg0.8h, #2 //(a-b)/4 - sub \arg0.8h, \arg0.8h, \arg1.8h //(a-b)/4-b - add \arg0.8h, \arg0.8h, \arg2.8h //(a-b)/4-b+c - sshr \arg0.8h, \arg0.8h, #2 //((a-b)/4-b+c)/4 - add \arg0.8h, \arg0.8h, \arg2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - sqrshrun \arg3.8b, \arg0.8h, #6 //(+32)>>6 + sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b + sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4 + sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b + add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c + sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4 + add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6 // } .endm .macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3 // { // input:a, b, c, dst_d; - sub \arg0.8h, \arg0.8h, \arg1.8h //a-b - sshr \arg0.8h, \arg0.8h, #2 //(a-b)/4 - sub \arg0.8h, \arg0.8h, \arg1.8h //(a-b)/4-b - add \arg0.8h, \arg0.8h, \arg2.8h //(a-b)/4-b+c - sshr \arg0.8h, \arg0.8h, #2 //((a-b)/4-b+c)/4 - add \arg0.8h, \arg0.8h, \arg2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 - sqrshrun2 \arg3.16b, \arg0.8h, #6 //(+32)>>6 + sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b + sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4 + sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b + add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c + sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4 + add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6 // } .endm .macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4 // { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c; - ext \arg4.16b, \arg0.16b, \arg1.16b, #4 //src[0] - ext \arg3.16b, \arg0.16b, \arg1.16b, #6 //src[1] - add \arg4.8h, \arg4.8h, \arg3.8h //c=src[0]+src[1] + ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0] + ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1] + add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1] - ext \arg3.16b, \arg0.16b, \arg1.16b, #2 //src[-1] - ext \arg2.16b, \arg0.16b, \arg1.16b, #8 //src[2] - add \arg3.8h, \arg3.8h, \arg2.8h //b=src[-1]+src[2] + ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1] + ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2] + add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2] - ext \arg2.16b, \arg0.16b, \arg1.16b, #10 //src[3] - add \arg2.8h, \arg2.8h, \arg0.8h //a=src[-2]+src[3] + ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3] + add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3] // } .endm .macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2 // { // input:dst_d, src_d A and B; working: v5 - uaddl v30.8h, \arg2.8b, \arg1.8b - rshrn \arg0.8b, v30.8h, #1 + uaddl v30.8h, \arg2\().8b, \arg1\().8b + rshrn \arg0\().8b, v30.8h, #1 // } .endm .macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2 // { // input:dst_d, src_d A and B; working: v5 - uaddl2 v30.8h, \arg2.16b, \arg1.16b - rshrn2 \arg0.16b, v30.8h, #1 + uaddl2 v30.8h, \arg2\().16b, \arg1\().16b + rshrn2 \arg0\().16b, v30.8h, #1 // } .endm .macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3 // when width=17/9, used // { // input: src_d{Y[0][1][2][3][4][5]X}, - rev64 \arg2.8b, \arg0.8b // X[5][4][3][2][1][0]O - uaddl \arg2.8h, \arg0.8b, \arg2.8b // each 16bits, *[50][41][32][23][14][05]* - mul \arg2.4h, \arg2.4h, \arg1.4h // 0+1*[50]-5*[41]+20[32] - addv \arg3, \arg2.4h - sqrshrun \arg0.8b, \arg0.8h, #5 + rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O + uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]* + mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32] + addv \arg3, \arg2\().4h + sqrshrun \arg0\().8b, \arg0\().8h, #5 // } .endm @@ -373,14 +373,14 @@ filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0 // { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst) ext.16b \arg3, \arg1, \arg1, #14 // X[0][1][2][3][4][5]O ext.16b \arg4, \arg3, \arg3, #8 // [3][4][5]OX[0][1][2] - rev64 \arg4.8h, \arg4.8h // X[5][4][3][2][1][0]O - add \arg3.8h, \arg3.8h, \arg4.8h // each 16bits, *[50][41][32][23][14][05]* - smull \arg3.4s, \arg3.4h, \arg2.4h // 0+1*[50]-5*[41]+20[32] - saddlv \arg5, \arg3.4s - //sshr \arg0.2d, \arg0.2d, #4 - sqrshrun \arg0.2s, \arg0.2d, #10 - uqxtn \arg0.4h, \arg0.4s - uqxtn \arg0.8b, \arg0.8h + rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O + add \arg3\().8h, \arg3\().8h, \arg4\().8h // each 16bits, *[50][41][32][23][14][05]* + smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32] + saddlv \arg5, \arg3\().4s + //sshr \arg0\().2d, \arg0\().2d, #4 + sqrshrun \arg0\().2s, \arg0\().2d, #10 + uqxtn \arg0\().4h, \arg0\().4s + uqxtn \arg0\().8b, \arg0\().8h // } .endm #endif