/*!
 * \copy
 *     Copyright (c) 2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
.text

#include "arm_arch64_common_macro.S"

#ifdef __APPLE__

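// MASK_MATRIX (p1, p0, q0, q1, alpha, beta, out):
// per-pixel filter-enable mask for the H.264 deblocking threshold test,
// out = (|p0-q0| < alpha) && (|p1-p0| < beta) && (|q1-q0| < beta), 0xFF/0x00 per byte.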
.macro MASK_MATRIX
    uabd $6.16b, $1.16b, $2.16b
    cmhi $6.16b, $4.16b, $6.16b

    uabd $4.16b, $0.16b, $1.16b
    cmhi $4.16b, $5.16b, $4.16b
    and $6.16b, $6.16b, $4.16b

    uabd $4.16b, $3.16b, $2.16b
    cmhi $4.16b, $5.16b, $4.16b
    and $6.16b, $6.16b, $4.16b
.endm

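// DIFF_LUMA_LT4_P1_Q1: bS < 4 update of the inner sample p1 (or q1 when invoked with
// the arguments mirrored). It computes clip(-tc0, tc0, ((p2 + rnd_avg(p0,q0)) >> 1) - p1),
// keeps it only where |p2-p0| < beta and the filter flag is set, and adds it to p1.
// The |p2-p0| mask is also returned (via abs as 0/1) so the caller can extend tc0 to tc.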
.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
//v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
    urhadd $8.16b, $2.16b, $3.16b
    uhadd $8.16b, $0.16b, $8.16b
    usubl $9.8h, $8.8b, $1.8b
    sqxtn $9.8b, $9.8h
    usubl2 $8.8h, $8.16b, $1.16b
    sqxtn2 $9.16b, $8.8h
    smax $8.16b, $9.16b, $5.16b
    //
    smin $8.16b, $8.16b, $6.16b
    uabd $9.16b, $0.16b, $2.16b
    cmhi $9.16b, $4.16b, $9.16b
    and $8.16b, $8.16b, $9.16b
    and $8.16b, $8.16b, $7.16b
    add $8.16b, $1.16b, $8.16b
    abs $9.16b, $9.16b
.endm

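// DIFF_LUMA_LT4_P0_Q0_1/_2: low/high half of the bS < 4 edge delta,
// delta = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, saturated to int8 (sqrshrn).
// The caller clips it to +/-tc and applies it as p0 += delta, q0 -= delta.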
.macro DIFF_LUMA_LT4_P0_Q0_1
    usubl $5.8h, $0.8b, $3.8b
    usubl $6.8h, $2.8b, $1.8b
    shl $6.8h, $6.8h, #2
    add $5.8h, $5.8h, $6.8h
    sqrshrn $4.8b, $5.8h, #3
.endm

.macro DIFF_LUMA_LT4_P0_Q0_2
    usubl2 $5.8h, $0.16b, $3.16b
    usubl2 $6.8h, $2.16b, $1.16b
    shl $6.8h, $6.8h, #2
    add $5.8h, $5.8h, $6.8h
    sqrshrn2 $4.16b, $5.8h, #3
.endm

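// EXTRACT_DELTA_INTO_TWO_PART: splits the signed delta into its positive part ($1)
// and the magnitude of its negative part ($0), so it can be applied with unsigned
// saturating uqadd/uqsub instead of a signed add followed by a clip to [0,255].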
.macro EXTRACT_DELTA_INTO_TWO_PART
    cmge $1.16b, $0.16b, #0
    and $1.16b, $0.16b, $1.16b
    sub $0.16b, $1.16b, $0.16b
.endm

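// DIFF_LUMA_EQ4_P2P1P0_1/_2: low/high halves of the bS == 4 (strong) luma filter for
// one side of the edge, given (p3, p2, p1, p0, q0, q1, mask, out, tmp, tmp).
// They produce p1' = (p2+p1+p0+q0+2)>>2, p2' = (2*p3+3*p2+p1+p0+q0+4)>>3 and, selected
// by the mask through bsl, either the strong p0' = (p2+2*p1+2*p0+2*q0+q1+4)>>3 or the
// weak fallback p0' = (2*p1+p0+q1+2)>>2.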
.macro DIFF_LUMA_EQ4_P2P1P0_1
    uaddl $8.8h, $1.8b, $2.8b
    uaddl $9.8h, $3.8b, $4.8b
    add $9.8h, $9.8h, $8.8h

    uaddl $8.8h, $0.8b, $1.8b
    shl $8.8h, $8.8h, #1
    add $8.8h, $9.8h, $8.8h

    rshrn $0.8b, $9.8h, #2
    rshrn $7.8b, $8.8h, #3
    shl $9.8h, $9.8h, #1
    usubl $8.8h, $5.8b, $1.8b
    add $9.8h, $8.8h, $9.8h

    uaddl $8.8h, $2.8b, $5.8b
    uaddw $8.8h, $8.8h, $2.8b
    uaddw $8.8h, $8.8h, $3.8b

    rshrn $9.8b, $9.8h, #3
    rshrn $8.8b, $8.8h, #2
    bsl $6.8b, $9.8b, $8.8b
.endm

.macro DIFF_LUMA_EQ4_P2P1P0_2
    uaddl2 $8.8h, $1.16b, $2.16b
    uaddl2 $9.8h, $3.16b, $4.16b
    add $9.8h, $9.8h, $8.8h

    uaddl2 $8.8h, $0.16b, $1.16b
    shl $8.8h, $8.8h, #1
    add $8.8h, $9.8h, $8.8h

    rshrn2 $0.16b, $9.8h, #2
    rshrn2 $7.16b, $8.8h, #3
    shl $9.8h, $9.8h, #1
    usubl2 $8.8h, $5.16b, $1.16b
    add $9.8h, $8.8h, $9.8h

    uaddl2 $8.8h, $2.16b, $5.16b
    uaddw2 $8.8h, $8.8h, $2.16b
    uaddw2 $8.8h, $8.8h, $3.16b

    rshrn2 $9.16b, $9.8h, #3
    rshrn2 $8.16b, $8.8h, #2
    bsl $6.16b, $9.16b, $8.16b
.endm

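// DIFF_CHROMA_EQ4_P0Q0_1/_2: low/high halves of the bS == 4 chroma filter,
// given (p1, p0, q0, q1, tmp, tmp, out_p0, out_q0):
// p0' = (2*p1 + p0 + q1 + 2) >> 2 and q0' = (2*q1 + q0 + p1 + 2) >> 2.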
.macro DIFF_CHROMA_EQ4_P0Q0_1
    uaddl $4.8h, $0.8b, $3.8b
    shl $4.8h, $4.8h, #1
    usubl $5.8h, $1.8b, $3.8b
    add $5.8h, $5.8h, $4.8h
    rshrn $6.8b, $5.8h, #2
    usubl $5.8h, $2.8b, $0.8b
    add $5.8h, $5.8h, $4.8h
    rshrn $7.8b, $5.8h, #2
.endm

.macro DIFF_CHROMA_EQ4_P0Q0_2
    uaddl2 $4.8h, $0.16b, $3.16b
    shl $4.8h, $4.8h, #1
    usubl2 $5.8h, $1.16b, $3.16b
    add $5.8h, $5.8h, $4.8h
    rshrn2 $6.16b, $5.8h, #2
    usubl2 $5.8h, $2.16b, $0.16b
    add $5.8h, $5.8h, $4.8h
    rshrn2 $7.16b, $5.8h, #2
.endm

.macro DIFF_LUMA_EQ4_MASK
    mov.16b $3, $2
    bsl $3.16b, $0.16b, $1.16b
.endm

.macro LOAD_LUMA_DATA_3
    ld3 {$0.b, $1.b, $2.b} [$6], [x2], x1
    ld3 {$3.b, $4.b, $5.b} [$6], [x0], x1
.endm

.macro LOAD_LUMA_DATA_4
    ld4 {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
    ld4 {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
.endm

.macro STORE_LUMA_DATA_4
    st4 {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
    st4 {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
.endm

.macro STORE_LUMA_DATA_3
    st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
    st3 {$3.b, $4.b, $5.b} [$6], [x0], x1
.endm

.macro LOAD_CHROMA_DATA_4
    ld4 {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
.endm

.macro STORE_CHROMA_DATA_2
    st2 {$0.b, $1.b} [$3], [$2], x2
.endm

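// ZERO_JUMP_END (vreg, xtmp1, xtmp2, label): early-out helper; branches to the
// label when the 128-bit mask register is all zero, i.e. no pixel on this edge
// needs filtering.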
.macro ZERO_JUMP_END
    mov $1, $0.d[0]
    mov $2, $0.d[1]
    orr $1, $1, $2
    cbz $1, $3
.endm

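// BS_NZC_CHECK: gathers the non-zero-coefficient counts of the current macroblock and,
// when the corresponding bits of the availability flag ($1) are set, of its top (bit 1)
// and left (bit 0) neighbours, then adds each pair of adjacent 4x4 blocks; a nonzero
// sum marks an edge that should get bS = 2.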
.macro BS_NZC_CHECK
    ld1 {v0.16b}, [$0]
    //Arrange the input data --- TOP
    ands x6, $1, #2
    cbz x6, bs_nzc_check_jump0
    sub x6, $0, $2, lsl #4
    sub x6, x6, $2, lsl #3
    add x6, x6, #12
    ld1 {v1.s} [3], [x6]

bs_nzc_check_jump0:
    ext.16b v1, v1, v0, #12
    add $3.16b, v0.16b, v1.16b

    // Arrange the input data --- LEFT
    ands x6, $1, #1
    cbz x6, bs_nzc_check_jump1

    sub x6, $0, #21
    add x7, x6, #4
    ld1 {v1.b} [12], [x6]
    add x6, x7, #4
    ld1 {v1.b} [13], [x7]
    add x7, x6, #4
    ld1 {v1.b} [14], [x6]
    ld1 {v1.b} [15], [x7]

bs_nzc_check_jump1:
    ins v2.d[0], v0.d[1]
    zip1 v0.16b, v0.16b, v2.16b
    ins v2.d[0], v0.d[1]
    zip1 v0.16b, v0.16b, v2.16b
    ext.16b v1, v1, v0, #12
    add $4.16b, v0.16b, v1.16b
.endm

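// BS_COMPARE_MV (mv_prev, mv0..mv3, out): compares the motion vectors of adjacent
// 4x4 blocks; an output lane is nonzero when any MV component differs by 4 or more
// (quarter-pel units), i.e. the >= 1 integer-pel test that yields bS = 1.
// BS_MV_CHECK (below) loads the current macroblock's MVs plus the top/left neighbours
// and applies this test in both directions.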
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
    mov w6, #4
    sabd v20.8h, $0.8h, $1.8h
    sabd v21.8h, $1.8h, $2.8h
    dup $0.8h, w6
    sabd v22.8h, $2.8h, $3.8h
    sabd v23.8h, $3.8h, $4.8h

    cmge v20.8h, v20.8h, $0.8h
    cmge v21.8h, v21.8h, $0.8h
    cmge v22.8h, v22.8h, $0.8h
    cmge v23.8h, v23.8h, $0.8h

    addp v20.8h, v20.8h, v21.8h
    addp v21.8h, v22.8h, v23.8h

    addhn $5.8b, v20.8h, v20.8h
    addhn2 $5.16b, v21.8h, v21.8h
.endm

.macro BS_MV_CHECK
    ldp q0, q1, [$0], #32
    ldp q2, q3, [$0]
    sub $0, $0, #32
    // Arrange the input data --- TOP
    ands x6, $1, #2
    cbz x6, bs_mv_check_jump0
    sub x6, $0, $2, lsl #6
    add x6, x6, #48
    ld1 {v4.16b}, [x6]
bs_mv_check_jump0:
    BS_COMPARE_MV v4, v0, v1, v2, v3, $3
    // Arrange the input data --- LEFT
    ands x6, $1, #1
    cbz x6, bs_mv_check_jump1
    sub x6, $0, #52
    add x7, x6, #16
    ld1 {v4.s} [0], [x6]
    add x6, x7, #16
    ld1 {v4.s} [1], [x7]
    add x7, x6, #16
    ld1 {v4.s} [2], [x6]
    ld1 {v4.s} [3], [x7]
bs_mv_check_jump1:
    zip1 $5.4s, v0.4s, v2.4s
    zip2 $6.4s, v0.4s, v2.4s
    zip1 v0.4s, v1.4s, v3.4s
    zip2 v2.4s, v1.4s, v3.4s
    zip2 v1.4s, $5.4s, v0.4s
    zip1 v0.4s, $5.4s, v0.4s
    zip2 v3.4s, $6.4s, v2.4s
    zip1 v2.4s, $6.4s, v2.4s
    BS_COMPARE_MV v4, v0, v1, v2, v3, $4
.endm

#else

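// Non-Apple toolchains: the same macros as above, rewritten with named parameters
// (\argN) because GNU as does not accept the $N positional macro-argument syntax
// used by Apple's assembler.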
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
    uabd \arg6\().16b, \arg1\().16b, \arg2\().16b
    cmhi \arg6\().16b, \arg4\().16b, \arg6\().16b

    uabd \arg4\().16b, \arg0\().16b, \arg1\().16b
    cmhi \arg4\().16b, \arg5\().16b, \arg4\().16b
    and \arg6\().16b, \arg6\().16b, \arg4\().16b

    uabd \arg4\().16b, \arg3\().16b, \arg2\().16b
    cmhi \arg4\().16b, \arg5\().16b, \arg4\().16b
    and \arg6\().16b, \arg6\().16b, \arg4\().16b
.endm

.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
//v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
    urhadd \arg8\().16b, \arg2\().16b, \arg3\().16b
    uhadd \arg8\().16b, \arg0\().16b, \arg8\().16b
    usubl \arg9\().8h, \arg8\().8b, \arg1\().8b
    sqxtn \arg9\().8b, \arg9\().8h
    usubl2 \arg8\().8h, \arg8\().16b, \arg1\().16b
    sqxtn2 \arg9\().16b, \arg8\().8h
    smax \arg8\().16b, \arg9\().16b, \arg5\().16b
    //
    smin \arg8\().16b, \arg8\().16b, \arg6\().16b
    uabd \arg9\().16b, \arg0\().16b, \arg2\().16b
    cmhi \arg9\().16b, \arg4\().16b, \arg9\().16b
    and \arg8\().16b, \arg8\().16b, \arg9\().16b
    and \arg8\().16b, \arg8\().16b, \arg7\().16b
    add \arg8\().16b, \arg1\().16b, \arg8\().16b
    abs \arg9\().16b, \arg9\().16b
.endm

.macro DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
    usubl \arg5\().8h, \arg0\().8b, \arg3\().8b
    usubl \arg6\().8h, \arg2\().8b, \arg1\().8b
    shl \arg6\().8h, \arg6\().8h, #2
    add \arg5\().8h, \arg5\().8h, \arg6\().8h
    sqrshrn \arg4\().8b, \arg5\().8h, #3
.endm

.macro DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
    usubl2 \arg5\().8h, \arg0\().16b, \arg3\().16b
    usubl2 \arg6\().8h, \arg2\().16b, \arg1\().16b
    shl \arg6\().8h, \arg6\().8h, #2
    add \arg5\().8h, \arg5\().8h, \arg6\().8h
    sqrshrn2 \arg4\().16b, \arg5\().8h, #3
.endm

.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
    cmge \arg1\().16b, \arg0\().16b, #0
    and \arg1\().16b, \arg0\().16b, \arg1\().16b
    sub \arg0\().16b, \arg1\().16b, \arg0\().16b
.endm

.macro DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    uaddl \arg8\().8h, \arg1\().8b, \arg2\().8b
    uaddl \arg9\().8h, \arg3\().8b, \arg4\().8b
    add \arg9\().8h, \arg9\().8h, \arg8\().8h

    uaddl \arg8\().8h, \arg0\().8b, \arg1\().8b
    shl \arg8\().8h, \arg8\().8h, #1
    add \arg8\().8h, \arg9\().8h, \arg8\().8h

    rshrn \arg0\().8b, \arg9\().8h, #2
    rshrn \arg7\().8b, \arg8\().8h, #3
    shl \arg9\().8h, \arg9\().8h, #1
    usubl \arg8\().8h, \arg5\().8b, \arg1\().8b
    add \arg9\().8h, \arg8\().8h, \arg9\().8h

    uaddl \arg8\().8h, \arg2\().8b, \arg5\().8b
    uaddw \arg8\().8h, \arg8\().8h, \arg2\().8b
    uaddw \arg8\().8h, \arg8\().8h, \arg3\().8b

    rshrn \arg9\().8b, \arg9\().8h, #3
    rshrn \arg8\().8b, \arg8\().8h, #2
    bsl \arg6\().8b, \arg9\().8b, \arg8\().8b
.endm

.macro DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    uaddl2 \arg8\().8h, \arg1\().16b, \arg2\().16b
    uaddl2 \arg9\().8h, \arg3\().16b, \arg4\().16b
    add \arg9\().8h, \arg9\().8h, \arg8\().8h

    uaddl2 \arg8\().8h, \arg0\().16b, \arg1\().16b
    shl \arg8\().8h, \arg8\().8h, #1
    add \arg8\().8h, \arg9\().8h, \arg8\().8h

    rshrn2 \arg0\().16b, \arg9\().8h, #2
    rshrn2 \arg7\().16b, \arg8\().8h, #3
    shl \arg9\().8h, \arg9\().8h, #1
    usubl2 \arg8\().8h, \arg5\().16b, \arg1\().16b
    add \arg9\().8h, \arg8\().8h, \arg9\().8h

    uaddl2 \arg8\().8h, \arg2\().16b, \arg5\().16b
    uaddw2 \arg8\().8h, \arg8\().8h, \arg2\().16b
    uaddw2 \arg8\().8h, \arg8\().8h, \arg3\().16b

    rshrn2 \arg9\().16b, \arg9\().8h, #3
    rshrn2 \arg8\().16b, \arg8\().8h, #2
    bsl \arg6\().16b, \arg9\().16b, \arg8\().16b
.endm

.macro DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    uaddl \arg4\().8h, \arg0\().8b, \arg3\().8b
    shl \arg4\().8h, \arg4\().8h, #1
    usubl \arg5\().8h, \arg1\().8b, \arg3\().8b
    add \arg5\().8h, \arg5\().8h, \arg4\().8h
    rshrn \arg6\().8b, \arg5\().8h, #2
    usubl \arg5\().8h, \arg2\().8b, \arg0\().8b
    add \arg5\().8h, \arg5\().8h, \arg4\().8h
    rshrn \arg7\().8b, \arg5\().8h, #2
.endm

.macro DIFF_CHROMA_EQ4_P0Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    uaddl2 \arg4\().8h, \arg0\().16b, \arg3\().16b
    shl \arg4\().8h, \arg4\().8h, #1
    usubl2 \arg5\().8h, \arg1\().16b, \arg3\().16b
    add \arg5\().8h, \arg5\().8h, \arg4\().8h
    rshrn2 \arg6\().16b, \arg5\().8h, #2
    usubl2 \arg5\().8h, \arg2\().16b, \arg0\().16b
    add \arg5\().8h, \arg5\().8h, \arg4\().8h
    rshrn2 \arg7\().16b, \arg5\().8h, #2
.endm

.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
    mov.16b \arg3, \arg2
    bsl \arg3\().16b, \arg0\().16b, \arg1\().16b
.endm

.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
    ld3 {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x2], x1
    ld3 {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
.endm

.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    ld4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg8], [x3], x1
    ld4 {\arg4\().b, \arg5\().b, \arg6\().b, \arg7\().b} [\arg8], [x0], x1
.endm

.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
    st4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg4], [x0], x1
    st4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [x2], x1
.endm

.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
    st3 {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x3], x1
    st3 {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
.endm

.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
    ld4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [\arg4], x2
.endm

.macro STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
    st2 {\arg0\().b, \arg1\().b} [\arg3], [\arg2], x2
.endm

.macro ZERO_JUMP_END arg0, arg1, arg2, arg3
    mov \arg1, \arg0\().d[0]
    mov \arg2, \arg0\().d[1]
    orr \arg1, \arg1, \arg2
    cbz \arg1, \arg3
.endm

.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
    ld1 {v0.16b}, [\arg0]
    //Arrange the input data --- TOP
    ands x6, \arg1, #2
    cbz x6, bs_nzc_check_jump0
    sub x6, \arg0, \arg2, lsl #4
    sub x6, x6, \arg2, lsl #3
    add x6, x6, #12
    ld1 {v1.s} [3], [x6]

bs_nzc_check_jump0:
    ext.16b v1, v1, v0, #12
    add \arg3\().16b, v0.16b, v1.16b

    // Arrange the input data --- LEFT
    ands x6, \arg1, #1
    cbz x6, bs_nzc_check_jump1

    sub x6, \arg0, #21
    add x7, x6, #4
    ld1 {v1.b} [12], [x6]
    add x6, x7, #4
    ld1 {v1.b} [13], [x7]
    add x7, x6, #4
    ld1 {v1.b} [14], [x6]
    ld1 {v1.b} [15], [x7]

bs_nzc_check_jump1:
    ins v2.d[0], v0.d[1]
    zip1 v0.16b, v0.16b, v2.16b
    ins v2.d[0], v0.d[1]
    zip1 v0.16b, v0.16b, v2.16b
    ext.16b v1, v1, v0, #12
    add \arg4\().16b, v0.16b, v1.16b
.endm

.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5
//in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5
    mov w6, #4
    sabd v20.8h, \arg0\().8h, \arg1\().8h
    sabd v21.8h, \arg1\().8h, \arg2\().8h
    dup \arg0\().8h, w6
    sabd v22.8h, \arg2\().8h, \arg3\().8h
    sabd v23.8h, \arg3\().8h, \arg4\().8h

    cmge v20.8h, v20.8h, \arg0\().8h
    cmge v21.8h, v21.8h, \arg0\().8h
    cmge v22.8h, v22.8h, \arg0\().8h
    cmge v23.8h, v23.8h, \arg0\().8h

    addp v20.8h, v20.8h, v21.8h
    addp v21.8h, v22.8h, v23.8h

    addhn \arg5\().8b, v20.8h, v20.8h
    addhn2 \arg5\().16b, v21.8h, v21.8h
.endm

.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
    ldp q0, q1, [\arg0], #32
    ldp q2, q3, [\arg0]
    sub \arg0, \arg0, #32
    // Arrange the input data --- TOP
    ands x6, \arg1, #2
    cbz x6, bs_mv_check_jump0
    sub x6, \arg0, \arg2, lsl #6
    add x6, x6, #48
    ld1 {v4.16b}, [x6]
bs_mv_check_jump0:
    BS_COMPARE_MV v4, v0, v1, v2, v3, \arg3
    // Arrange the input data --- LEFT
    ands x6, \arg1, #1
    cbz x6, bs_mv_check_jump1
    sub x6, \arg0, #52
    add x7, x6, #16
    ld1 {v4.s} [0], [x6]
    add x6, x7, #16
    ld1 {v4.s} [1], [x7]
    add x7, x6, #16
    ld1 {v4.s} [2], [x6]
    ld1 {v4.s} [3], [x7]
bs_mv_check_jump1:
    zip1 \arg5\().4s, v0.4s, v2.4s
    zip2 \arg6\().4s, v0.4s, v2.4s
    zip1 v0.4s, v1.4s, v3.4s
    zip2 v2.4s, v1.4s, v3.4s
    zip2 v1.4s, \arg5\().4s, v0.4s
    zip1 v0.4s, \arg5\().4s, v0.4s
    zip2 v3.4s, \arg6\().4s, v2.4s
    zip1 v2.4s, \arg6\().4s, v2.4s
    BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
.endm
#endif

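// WelsNonZeroCount_AArch64_neon: clamps the 24 per-block non-zero-coefficient
// counts at [x0] to 0/1 in place (any nonzero count becomes 1).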
WELS_ASM_ARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
    ld1 {v0.8b, v1.8b, v2.8b}, [x0]
    ins v0.d[1], v1.d[0]
    uzp1 v0.2d, v0.2d, v1.2d
    cmeq v0.16b, v0.16b, #0
    cmeq v2.8b, v2.8b, #0
    mvn v0.16b, v0.16b
    mvn v2.8b, v2.8b
    abs v0.16b, v0.16b
    abs v2.8b, v2.8b
    ins v1.d[0], v0.d[1]
    st1 {v0.8b, v1.8b, v2.8b}, [x0]
WELS_ASM_ARCH64_FUNC_END

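// DeblockLumaLt4V_AArch64_neon: bS < 4 luma filter, p/q samples taken at
// pPix +/- n*iStride, 16 edge positions per call.
// Scalar sketch of what each lane computes (the standard H.264 weak filter,
// given here only as a reference for the vector code below):
//   if (|p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta && iTc0 >= 0) {
//     tc = iTc0 + (|p2-p0| < beta) + (|q2-q0| < beta);
//     d  = clip(-tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3);
//     p0 = clip255(p0 + d);  q0 = clip255(q0 - d);
//     if (|p2-p0| < beta) p1 += clip(-iTc0, iTc0, (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1);
//     if (|q2-q0| < beta) q1 += clip(-iTc0, iTc0, (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1);
//   }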
WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
    dup v16.16b, w2 //alpha
    dup v17.16b, w3 //beta
    add x2, x1, x1, lsl #1
    sub x2, x0, x2
    movi v23.16b, #128
    ld1 {v0.16b}, [x2], x1
    ld1 {v1.16b}, [x2], x1
    ld1 {v2.16b}, [x2]
    ld1 {v3.16b}, [x0], x1
    ld1 {v4.16b}, [x0], x1
    ld1 {v5.16b}, [x0]
    sub x2, x2, x1
    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
    trn1 v18.2s, v18.2s, v19.2s
    trn1 v20.2s, v20.2s, v21.2s
    trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
    cmge v7.16b, v6.16b, #0 // iTc0 Flag

    MASK_MATRIX v1, v2, v3, v4, v16, v17, v18
    and v7.16b, v7.16b, v18.16b // need filter flag

    ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end

    eor v18.16b, v18.16b, v18.16b
    sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333

    DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20
    st1 {v19.16b}, [x2], x1

    DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22

    abs v20.16b, v20.16b
    abs v22.16b, v22.16b
    add v6.16b, v6.16b, v20.16b
    add v6.16b, v6.16b, v22.16b
    eor v18.16b, v18.16b, v18.16b
    sub v18.16b, v18.16b, v6.16b

    DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
    DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22

    smax v19.16b, v19.16b, v18.16b
    smin v19.16b, v19.16b, v6.16b
    and v19.16b, v19.16b, v7.16b

    EXTRACT_DELTA_INTO_TWO_PART v19, v20
    uqadd v2.16b, v2.16b, v20.16b
    uqsub v2.16b, v2.16b, v19.16b
    st1 {v2.16b}, [x2], x1
    uqsub v3.16b, v3.16b, v20.16b
    uqadd v3.16b, v3.16b, v19.16b
    st1 {v3.16b}, [x2], x1
    st1 {v21.16b}, [x2]
DeblockLumaLt4V_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END

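// DeblockLumaEq4V_AArch64_neon: bS == 4 (strong) luma filter on the same sample
// layout as DeblockLumaLt4V. The strong averages are used only where
// |p0-q0| < (alpha>>2)+2 and the corresponding |p2-p0| / |q2-q0| < beta tests pass;
// elsewhere the 3-tap p0'/q0' fallback is selected through DIFF_LUMA_EQ4_MASK.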
WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon
    dup v16.16b, w2 //alpha
    dup v17.16b, w3 //beta
    sub x3, x0, x1, lsl #2

    ld1 {v0.16b}, [x3], x1
    ld1 {v4.16b}, [x0], x1
    ld1 {v1.16b}, [x3], x1
    ld1 {v5.16b}, [x0], x1
    ld1 {v2.16b}, [x3], x1
    ld1 {v6.16b}, [x0], x1
    ld1 {v3.16b}, [x3]
    ld1 {v7.16b}, [x0]

    sub x3, x3, x1, lsl #1
    MASK_MATRIX v2, v3, v4, v5, v16, v17, v18
    lsr w2, w2, #2
    add w2, w2, #2
    dup v16.16b, w2 //((alpha >> 2) + 2)
    uabd v19.16b, v3.16b, v4.16b
    cmhi v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)

    uabd v21.16b, v1.16b, v3.16b
    cmhi v21.16b, v17.16b, v21.16b //bDetaP2P0
    and v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2)) && bDetaP2P0

    uabd v22.16b, v6.16b, v4.16b
    cmhi v22.16b, v17.16b, v22.16b //bDetaQ2Q0
    and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2)) && bDetaQ2Q0
    and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0 && (iDetaP0Q0 < ((iAlpha >> 2) + 2))

    mov.16b v23, v21
    mov.16b v24, v21

    mov.16b v25, v0
    DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
    DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
    ins v0.d[1], v25.d[1]
    ins v23.d[1], v24.d[1]
    and v21.16b, v20.16b, v21.16b
    DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
    st1 {v17.16b}, [x3], x1
    DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
    st1 {v17.16b}, [x3], x1
    DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
    st1 {v17.16b}, [x3], x1

    mov.16b v23, v22
    mov.16b v24, v22
    mov.16b v25, v7
    DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
    DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
    ins v7.d[1], v25.d[1]
    ins v23.d[1], v24.d[1]
    and v22.16b, v20.16b, v22.16b
    DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
    st1 {v17.16b}, [x3], x1
    DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
    st1 {v17.16b}, [x3], x1
    DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
    st1 {v17.16b}, [x3], x1
DeblockLumaEq4V_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END

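// DeblockLumaLt4H_AArch64_neon: same bS < 4 filter as DeblockLumaLt4V, but the
// p/q samples are horizontally adjacent (pPix-3 .. pPix+2 in every row), so the
// 16 rows are gathered into registers with per-lane ld3 loads and written back
// with per-lane st4 stores.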
WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
    dup v16.16b, w2 //alpha
    dup v17.16b, w3 //beta
    sub x2, x0, #3
    movi v23.16b, #128

    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 0
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 1
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 2
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 3
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 4
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 5
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 6
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 7

    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 8
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 9
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 10
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 11
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 12
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 13
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 14
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 15

    sub x0, x0, x1, lsl #4

    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
    trn1 v18.2s, v18.2s, v19.2s
    trn1 v20.2s, v20.2s, v21.2s
    trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
    cmge v7.16b, v6.16b, #0 // iTc0 Flag

    MASK_MATRIX v1, v2, v3, v4, v16, v17, v18
    and v7.16b, v7.16b, v18.16b // need filter flag

    ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end

    eor v18.16b, v18.16b, v18.16b
    sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333

    DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
    mov.16b v25, v19

    DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24

    abs v20.16b, v20.16b
    abs v22.16b, v22.16b
    add v6.16b, v6.16b, v20.16b
    add v6.16b, v6.16b, v22.16b
    eor v18.16b, v18.16b, v18.16b
    sub v18.16b, v18.16b, v6.16b

    DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
    DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22

    smax v19.16b, v19.16b, v18.16b
    smin v19.16b, v19.16b, v6.16b
    and v19.16b, v19.16b, v7.16b

    EXTRACT_DELTA_INTO_TWO_PART v19, v20
    uqadd v2.16b, v2.16b, v20.16b
    uqsub v2.16b, v2.16b, v19.16b
    mov.16b v26, v2
    uqsub v3.16b, v3.16b, v20.16b
    uqadd v3.16b, v3.16b, v19.16b
    mov.16b v27, v3
    mov.16b v28, v21

    sub x0, x0, #2
    add x2, x0, x1
    lsl x1, x1, #1

    STORE_LUMA_DATA_4 v25, v26, v27, v28, 0, 1
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 2, 3
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 4, 5
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 6, 7

    STORE_LUMA_DATA_4 v25, v26, v27, v28, 8, 9
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 10, 11
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 12, 13
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 14, 15
DeblockLumaLt4H_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END

WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon
    dup v16.16b, w2 //alpha
    dup v17.16b, w3 //beta
    sub x3, x0, #4

    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 0
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 1
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 2
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 3
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 4
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 5
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 6
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 7

    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 8
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 9
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 10
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 11
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 12
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 13
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 14
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 15

    sub x0, x0, x1, lsl #4
    sub x3, x0, #3
    MASK_MATRIX v2, v3, v4, v5, v16, v17, v18

    ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end

    lsr w2, w2, #2
    add w2, w2, #2
    dup v16.16b, w2 //((alpha >> 2) + 2)
    uabd v19.16b, v3.16b, v4.16b
    cmhi v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)

    uabd v21.16b, v1.16b, v3.16b
    cmhi v21.16b, v17.16b, v21.16b //bDetaP2P0
    and v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2)) && bDetaP2P0

    uabd v22.16b, v6.16b, v4.16b
    cmhi v22.16b, v17.16b, v22.16b //bDetaQ2Q0
    and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2)) && bDetaQ2Q0
    and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0 && (iDetaP0Q0 < ((iAlpha >> 2) + 2))

    mov.16b v23, v21
    mov.16b v24, v21

    mov.16b v25, v0
    DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
    DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
    ins v0.d[1], v25.d[1]
    ins v23.d[1], v24.d[1]
    and v21.16b, v20.16b, v21.16b
    DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
    mov.16b v26, v17
    DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
    mov.16b v27, v17
    DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
    mov.16b v28, v17

    mov.16b v23, v22
    mov.16b v24, v22
    mov.16b v25, v7
    DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
    DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
    ins v7.d[1], v25.d[1]
    ins v23.d[1], v24.d[1]
    and v22.16b, v20.16b, v22.16b
    DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
    mov.16b v29, v17
    DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
    mov.16b v30, v17
    DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
    mov.16b v31, v17

    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 0
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 1
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 2
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 3
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 4
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 5
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 6
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 7
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 8
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 9
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 10
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 11
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 12
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 13
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 14
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 15
DeblockLumaEq4H_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END

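// DeblockChromaLt4V_AArch64_neon: bS < 4 chroma filter. Cb rows occupy the low
// 8 lanes and Cr rows the high 8 lanes of each vector; only p0/q0 are modified,
// and tc is taken directly from pTc (no per-side extension as in the luma path).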
WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
    dup v16.16b, w3 //alpha
    dup v17.16b, w4 //beta
    lsl x3, x2, #1
    sub x6, x0, x3 //pPixCb-2*Stride
    sub x7, x1, x3 //pPixCr-2*Stride

    ld1 {v0.d} [0], [x6], x2
    ld1 {v1.d} [0], [x6]
    ld1 {v2.d} [0], [x0], x2
    ld1 {v3.d} [0], [x0]
    ld1 {v0.d} [1], [x7], x2
    ld1 {v1.d} [1], [x7]
    ld1 {v2.d} [1], [x1], x2
    ld1 {v3.d} [1], [x1]

    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
    trn1 v18.4h, v18.4h, v19.4h //0011,0011,
    trn1 v20.4h, v20.4h, v21.4h //2233,2233
    zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
    cmgt v7.16b, v6.16b, #0 // iTc0 Flag

    MASK_MATRIX v0, v1, v2, v3, v16, v17, v18
    and v7.16b, v7.16b, v18.16b // need filter flag

    ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end

    eor v18.16b, v18.16b, v18.16b
    sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233

    DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
    DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22

    smax v19.16b, v19.16b, v18.16b
    smin v19.16b, v19.16b, v6.16b
    and v19.16b, v19.16b, v7.16b

    EXTRACT_DELTA_INTO_TWO_PART v19, v20
    uqadd v1.16b, v1.16b, v20.16b
    uqsub v1.16b, v1.16b, v19.16b
    st1 {v1.d} [0], [x6], x2
    st1 {v1.d} [1], [x7], x2
    uqsub v2.16b, v2.16b, v20.16b
    uqadd v2.16b, v2.16b, v19.16b
    st1 {v2.d} [0], [x6]
    st1 {v2.d} [1], [x7]
DeblockChromaLt4V_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END

WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
    dup v16.16b, w3 //alpha
    dup v17.16b, w4 //beta
    sub x6, x0, #2 //pPixCb-2
    sub x7, x1, #2 //pPixCr-2

    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7

    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15

    sub x0, x0, #1
    sub x1, x1, #1

    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
    trn1 v18.4h, v18.4h, v19.4h //0011,0011,
    trn1 v20.4h, v20.4h, v21.4h //2233,2233
    zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
    cmgt v7.16b, v6.16b, #0 // iTc0 Flag

    MASK_MATRIX v0, v1, v2, v3, v16, v17, v18
    and v7.16b, v7.16b, v18.16b // need filter flag

    ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end
    eor v18.16b, v18.16b, v18.16b
    sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233

    DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
    DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22

    smax v19.16b, v19.16b, v18.16b
    smin v19.16b, v19.16b, v6.16b
    and v19.16b, v19.16b, v7.16b

    EXTRACT_DELTA_INTO_TWO_PART v19, v20
    uqadd v1.16b, v1.16b, v20.16b
    uqsub v1.16b, v1.16b, v19.16b
    uqsub v2.16b, v2.16b, v20.16b
    uqadd v2.16b, v2.16b, v19.16b

    STORE_CHROMA_DATA_2 v1, v2, x0, 0
    STORE_CHROMA_DATA_2 v1, v2, x0, 1
    STORE_CHROMA_DATA_2 v1, v2, x0, 2
    STORE_CHROMA_DATA_2 v1, v2, x0, 3
    STORE_CHROMA_DATA_2 v1, v2, x0, 4
    STORE_CHROMA_DATA_2 v1, v2, x0, 5
    STORE_CHROMA_DATA_2 v1, v2, x0, 6
    STORE_CHROMA_DATA_2 v1, v2, x0, 7

    STORE_CHROMA_DATA_2 v1, v2, x1, 8
    STORE_CHROMA_DATA_2 v1, v2, x1, 9
    STORE_CHROMA_DATA_2 v1, v2, x1, 10
    STORE_CHROMA_DATA_2 v1, v2, x1, 11
    STORE_CHROMA_DATA_2 v1, v2, x1, 12
    STORE_CHROMA_DATA_2 v1, v2, x1, 13
    STORE_CHROMA_DATA_2 v1, v2, x1, 14
    STORE_CHROMA_DATA_2 v1, v2, x1, 15
DeblockChromaLt4H_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END

WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
    dup v16.16b, w3 //alpha
    dup v17.16b, w4 //beta
    lsl x3, x2, #1
    sub x6, x0, x3 //pPixCb-2*Stride
    sub x7, x1, x3 //pPixCr-2*Stride

    ld1 {v0.d} [0], [x6], x2
    ld1 {v1.d} [0], [x6]
    ld1 {v2.d} [0], [x0], x2
    ld1 {v3.d} [0], [x0]
    ld1 {v0.d} [1], [x7], x2
    ld1 {v1.d} [1], [x7]
    ld1 {v2.d} [1], [x1], x2
    ld1 {v3.d} [1], [x1]

    MASK_MATRIX v0, v1, v2, v3, v16, v17, v7

    ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end

    DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
    DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21

    mov.16b v6, v7
    bsl v6.16b, v20.16b, v1.16b
    bsl v7.16b, v21.16b, v2.16b

    st1 {v6.d} [0], [x6], x2
    st1 {v6.d} [1], [x7], x2

    st1 {v7.d} [0], [x6]
    st1 {v7.d} [1], [x7]
DeblockChromaEq4V_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END

WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
    dup v16.16b, w3 //alpha
    dup v17.16b, w4 //beta

    sub x6, x0, #2 //pPixCb-2
    sub x7, x1, #2 //pPixCr-2

    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7

    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15
    sub x0, x0, #1
    sub x1, x1, #1

    MASK_MATRIX v0, v1, v2, v3, v16, v17, v7

    ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end

    DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
    DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21

    mov.16b v6, v7
    bsl v6.16b, v20.16b, v1.16b
    bsl v7.16b, v21.16b, v2.16b

    STORE_CHROMA_DATA_2 v6, v7, x0, 0
    STORE_CHROMA_DATA_2 v6, v7, x0, 1
    STORE_CHROMA_DATA_2 v6, v7, x0, 2
    STORE_CHROMA_DATA_2 v6, v7, x0, 3
    STORE_CHROMA_DATA_2 v6, v7, x0, 4
    STORE_CHROMA_DATA_2 v6, v7, x0, 5
    STORE_CHROMA_DATA_2 v6, v7, x0, 6
    STORE_CHROMA_DATA_2 v6, v7, x0, 7

    STORE_CHROMA_DATA_2 v6, v7, x1, 8
    STORE_CHROMA_DATA_2 v6, v7, x1, 9
    STORE_CHROMA_DATA_2 v6, v7, x1, 10
    STORE_CHROMA_DATA_2 v6, v7, x1, 11
    STORE_CHROMA_DATA_2 v6, v7, x1, 12
    STORE_CHROMA_DATA_2 v6, v7, x1, 13
    STORE_CHROMA_DATA_2 v6, v7, x1, 14
    STORE_CHROMA_DATA_2 v6, v7, x1, 15
DeblockChromaEq4H_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END

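// DeblockingBSCalcEnc_AArch64_neon: boundary-strength calculation for the encoder.
// Edges adjacent to 4x4 blocks with coded coefficients get bS = 2 (BS_NZC_CHECK);
// edges whose motion vectors differ by at least one integer pel get bS = 1
// (BS_MV_CHECK); the per-direction maxima are stored to the output buffer in x4.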
WELS_ASM_ARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon
    // Checking the nzc status
    BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status
    // For checking bS[I] = 2
    movi v0.16b, #0
    cmgt v16.16b, v16.16b, v0.16b
    cmgt v17.16b, v17.16b, v0.16b
    movi v0.16b, #2

    and v16.16b, v16.16b, v0.16b //v16 save the nzc check result all the time --- for dir is top
    and v17.16b, v17.16b, v0.16b //v17 save the nzc check result all the time --- for dir is left

    // Checking the mv status
    BS_MV_CHECK x1, x2, x3, v18, v19, v5, v6 //v18, v19 save the mv status
    // For checking bS[I] = 1
    movi v0.16b, #1
    and v18.16b, v18.16b, v0.16b //v18 save the mv check result all the time --- for dir is top
    and v19.16b, v19.16b, v0.16b //v19 save the mv check result all the time --- for dir is left
    // Check bS[I] is '1' or '2'
    umax v1.16b, v18.16b, v16.16b
    umax v0.16b, v19.16b, v17.16b
    st1 {v0.16b, v1.16b}, [x4]
WELS_ASM_ARCH64_FUNC_END

#endif