openh264/codec/common/arm/deblocking_neon.S
Martin Storsjö 23f57adaea Do full register loads instead of single-lane loads in DeblockLumaEq4H_neon
Instead of loading the registers one lane at a time, load full
registers and then transpose them.

This is faster, reducing the runtime of the function from about
506 cycles to 434 cycles (tested on a Cortex-A8).

This also avoids an issue that seems to be a CPU bug, present
on Sony Xperia T (CPU implementer 0x51, architecture 7, variant 0x1,
part 0x04d). On such a device, the "vswp q9, q10" could apparently
start executing before the previous
vld4.u8 {d20[x],d21[x],d22[x],d23[x]}, [r3], r1
loads had finished and written back their results. Changing the
"vswp q9, q10" into "vswp q10, q9", or into separate
"vswp d18, d20; vswp d19, d21" (or the other way around), seemed to
avoid the issue. It occurred only occasionally (a couple of times
per 100000 invocations or so).
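
As a sketch of the change (simplified from the function bodies below;
register numbers are illustrative): the old code filled one lane of four
d registers per row,

    vld4.u8 {d16[0],d17[0],d18[0],d19[0]}, [r3], r1  // one row -> one lane

repeated for all 16 rows, whereas the new code loads each row as a whole
register and transposes afterwards:

    vld1.u8 {d16}, [r3], r1  // one full 8-byte row per load
    ...
    vtrn.u32 d16, d20        // then 32-, 16- and 8-bit transpose passes
    vtrn.u16 d16, d18
    vtrn.u8  d16, d17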
2014-04-28 10:12:16 +03:00

/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef __APPLE__
// Set the condition flags according to whether the 128-bit value $0 | $1
// is all zero.
.macro JMP_IF_128BITS_IS_ZERO
vorr.s16 $2, $0, $1
vmov r3, r2, $2
orr r3, r3, r2
cmp r3, #0
.endm
// Build the basic deblocking filter mask:
// $6 = (|p0-q0| < alpha) && (|p1-p0| < beta) && (|q1-q0| < beta)
// In: $0 = p1, $1 = p0, $2 = q0, $3 = q1, $4 = alpha, $5 = beta. Clobbers $4.
.macro MASK_MATRIX
vabd.u8 $6, $1, $2
vcgt.u8 $6, $4, $6
vabd.u8 $4, $0, $1
vclt.u8 $4, $4, $5
vand.u8 $6, $6, $4
vabd.u8 $4, $3, $2
vclt.u8 $4, $4, $5
vand.u8 $6, $6, $4
.endm
// bS < 4: compute the filtered p1 (or q1, with reversed arguments).
// In: $0 = p2, $1 = p1, $2 = p0, $3 = q0, $4 = beta, $5 = -tc0, $6 = tc0,
// $7 = filter mask. Out: $8 = p1', $9 = (|p2-p0| < beta) as 0/1.
.macro DIFF_LUMA_LT4_P1_Q1
vabd.u8 $9, $0, $2
vclt.u8 $9, $9, $4
vrhadd.u8 $8, $2, $3
vhadd.u8 $8, $0, $8
vsub.s8 $8, $8, $1
vmax.s8 $8, $8, $5
vmin.s8 $8, $8, $6
vand.s8 $8, $8, $9
vand.s8 $8, $8, $7
vadd.u8 $8, $1, $8
vabs.s8 $9, $9
.endm
// bS < 4: $4 = ((q0-p0)*4 + (p1-q1) + 4) >> 3, the unclipped delta.
// In: $0 = p1, $1 = p0, $2 = q0, $3 = q1; $5 and $6 are 16-bit scratch.
.macro DIFF_LUMA_LT4_P0_Q0
vsubl.u8 $5, $0, $3
vsubl.u8 $6, $2, $1
vshl.s16 $6, $6, #2
vadd.s16 $5, $5, $6
vrshrn.s16 $4, $5, #3
.endm
// bS = 4 strong luma filter for one side of the edge.
// In: $0 = p3, $1 = p2, $2 = p1, $3 = p0, $4 = q0, $5 = q1, $6 = strong mask.
// Out: $0 = p1' = (p2+p1+p0+q0+2)>>2, $7 = p2' = (2*p3+3*p2+p1+p0+q0+4)>>3,
// $6 = p0' selected between the strong and the weak variant. Clobbers q4, q5.
.macro DIFF_LUMA_EQ4_P2P1P0
vaddl.u8 q4, $1, $2
vaddl.u8 q5, $3, $4
vadd.u16 q5, q4, q5
vaddl.u8 q4, $0, $1
vshl.u16 q4, q4, #1
vadd.u16 q4, q5, q4
vrshrn.u16 $0, q5, #2
vrshrn.u16 $7, q4, #3
vshl.u16 q5, q5, #1
vsubl.u8 q4, $5, $1
vadd.u16 q5, q4, q5
vaddl.u8 q4, $2, $5
vaddw.u8 q4, q4, $2
vaddw.u8 q4, q4, $3
vrshrn.u16 d10, q5, #3
vrshrn.u16 d8, q4, #2
vbsl.u8 $6, d10, d8
.endm
// Bitwise select: $3 = $2 ? $0 : $1 ($2 is copied first because vbsl
// overwrites its first operand).
.macro DIFF_LUMA_EQ4_MASK
vmov $3, $2
vbsl.u8 $3, $0, $1
.endm
// bS = 4 chroma filter: $7 = p0' = (2*p1+p0+q1+2)>>2,
// $8 = q0' = (p1+q0+2*q1+2)>>2.
// In: $0 = p1, $1 = p0, $2 = q0, $3 = q1; $4-$6 are 16-bit scratch.
.macro DIFF_CHROMA_EQ4_P0Q0
vaddl.u8 $4, $0, $3
vaddw.u8 $5, $4, $1
vaddw.u8 $6, $4, $2
vaddw.u8 $5, $5, $0
vaddw.u8 $6, $6, $3
vrshrn.u16 $7, $5, #2
vrshrn.u16 $8, $6, #2
.endm
.macro LOAD_CHROMA_DATA_4
vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
.macro STORE_CHROMA_DATA_4
vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
.macro LOAD_LUMA_DATA_3
vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
.macro STORE_LUMA_DATA_4
vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
.endm
.macro STORE_LUMA_DATA_3
vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
// Split the signed delta $0 into its positive part ($1) and its negated
// negative part ($0), both non-negative, for use with vqadd/vqsub.
.macro EXTRACT_DELTA_INTO_TWO_PART
vcge.s8 $1, $0, #0
vand $1, $0, $1
vsub.s8 $0, $1, $0
.endm
#else
// The same macros in GNU assembler syntax, using named arguments.
.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
vorr.s16 \arg2, \arg0, \arg1
vmov r3, r2, \arg2
orr r3, r3, r2
cmp r3, #0
.endm
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
vabd.u8 \arg6, \arg1, \arg2
vcgt.u8 \arg6, \arg4, \arg6
vabd.u8 \arg4, \arg0, \arg1
vclt.u8 \arg4, \arg4, \arg5
vand.u8 \arg6, \arg6, \arg4
vabd.u8 \arg4, \arg3, \arg2
vclt.u8 \arg4, \arg4, \arg5
vand.u8 \arg6, \arg6, \arg4
.endm
.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
vabd.u8 \arg9, \arg0, \arg2
vclt.u8 \arg9, \arg9, \arg4
vrhadd.u8 \arg8, \arg2, \arg3
vhadd.u8 \arg8, \arg0, \arg8
vsub.s8 \arg8, \arg8, \arg1
vmax.s8 \arg8, \arg8, \arg5
vmin.s8 \arg8, \arg8, \arg6
vand.s8 \arg8, \arg8, \arg9
vand.s8 \arg8, \arg8, \arg7
vadd.u8 \arg8, \arg1, \arg8
vabs.s8 \arg9, \arg9
.endm
.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
vsubl.u8 \arg5, \arg0, \arg3
vsubl.u8 \arg6, \arg2, \arg1
vshl.s16 \arg6, \arg6, #2
vadd.s16 \arg5, \arg5, \arg6
vrshrn.s16 \arg4, \arg5, #3
.endm
.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
vaddl.u8 q4, \arg1, \arg2
vaddl.u8 q5, \arg3, \arg4
vadd.u16 q5, q4, q5
vaddl.u8 q4, \arg0, \arg1
vshl.u16 q4, q4, #1
vadd.u16 q4, q5, q4
vrshrn.u16 \arg0, q5, #2
vrshrn.u16 \arg7, q4, #3
vshl.u16 q5, q5, #1
vsubl.u8 q4, \arg5, \arg1
vadd.u16 q5, q4, q5
vaddl.u8 q4, \arg2, \arg5
vaddw.u8 q4, q4, \arg2
vaddw.u8 q4, q4, \arg3
vrshrn.u16 d10, q5, #3
vrshrn.u16 d8, q4, #2
vbsl.u8 \arg6, d10, d8
.endm
.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
vmov \arg3, \arg2
vbsl.u8 \arg3, \arg0, \arg1
.endm
.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vaddl.u8 \arg4, \arg0, \arg3
vaddw.u8 \arg5, \arg4, \arg1
vaddw.u8 \arg6, \arg4, \arg2
vaddw.u8 \arg5, \arg5, \arg0
vaddw.u8 \arg6, \arg6, \arg3
vrshrn.u16 \arg7, \arg5, #2
vrshrn.u16 \arg8, \arg6, #2
.endm
.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
.endm
.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
.endm
.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
.endm
.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
.endm
.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
.endm
.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
vcge.s8 \arg1, \arg0, #0
vand \arg1, \arg0, \arg1
vsub.s8 \arg0, \arg1, \arg0
.endm
#endif
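// DeblockLumaLt4V_neon: luma deblocking across a horizontal edge, bS < 4.
// Register use (derived from the code): r0 = pix (the q0 row), r1 = stride,
// r2 = alpha, r3 = beta, [sp] = pointer to the four tc0 thresholds
// (one int8 per group of four pixels; negative values disable filtering).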
WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
vpush {q4-q7}
vdup.u8 q11, r2
vdup.u8 q9, r3
add r2, r1, r1, lsl #1
sub r2, r0, r2 // r2 = pix - 3*stride, the p2 row
vld1.u8 {q0}, [r2], r1 // q0 = p2
vld1.u8 {q3}, [r0], r1 // q3 = q0
vld1.u8 {q1}, [r2], r1 // q1 = p1
vld1.u8 {q4}, [r0], r1 // q4 = q1
vld1.u8 {q2}, [r2] // q2 = p0
vld1.u8 {q5}, [r0] // q5 = q2
sub r2, r2, r1 // r2 = the p1 row
ldr r3, [sp, #64] // tc0 pointer (original [sp]; vpush {q4-q7} moved sp by 64)
vld1.s8 {d31}, [r3]
vdup.s8 d28, d31[0] // expand the four tc0 values to one per pixel
vdup.s8 d30, d31[1]
vdup.s8 d29, d31[2]
vdup.s8 d31, d31[3]
vtrn.32 d28, d30
vtrn.32 d29, d31 // q14 = per-pixel tc0
vcge.s8 q10, q14, #0 // only filter where tc0 >= 0
MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
vand.u8 q10, q10, q15
veor q15, q15
vsub.i8 q15, q15, q14 // q15 = -tc0
DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 // q6 = p1'
vst1.u8 {q6}, [r2], r1
DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 // q7 = q1'
vabs.s8 q12, q12
vabs.s8 q13, q13
vadd.u8 q14, q14, q12 // tc = tc0 + (|p2-p0| < beta) + (|q2-q0| < beta)
vadd.u8 q14, q14, q13
veor q15, q15
vsub.i8 q15, q15, q14 // q15 = -tc
DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
vmax.s8 q8, q8, q15
vmin.s8 q8, q8, q14
vand.s8 q8, q8, q10
EXTRACT_DELTA_INTO_TWO_PART q8, q9
vqadd.u8 q2, q2, q9
vqsub.u8 q2, q2, q8
vst1.u8 {q2}, [r2], r1
vqsub.u8 q3, q3, q9
vqadd.u8 q3, q3, q8
vst1.u8 {q3}, [r2], r1
vst1.u8 {q7}, [r2]
vpop {q4-q7}
WELS_ASM_FUNC_END
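// DeblockLumaEq4V_neon: strong (bS = 4) luma deblocking across a horizontal
// edge. Register use: r0 = pix (the q0 row), r1 = stride, r2 = alpha,
// r3 = beta. Rows p3..q3 are read and p2..q2 rewritten where masked.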
WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
vpush {q4-q7}
vdup.u8 q5, r2
vdup.u8 q4, r3
sub r3, r0, r1, lsl #2 // r3 = pix - 4*stride, the p3 row
vld1.u8 {q8}, [r3], r1 // q8 = p3
vld1.u8 {q12}, [r0], r1 // q12 = q0
vld1.u8 {q9}, [r3], r1 // q9 = p2
vld1.u8 {q13}, [r0], r1 // q13 = q1
vld1.u8 {q10}, [r3], r1 // q10 = p1
vld1.u8 {q14}, [r0], r1 // q14 = q2
vld1.u8 {q11}, [r3] // q11 = p0
vld1.u8 {q15}, [r0] // q15 = q3
sub r3, r3, r1, lsl #1 // r3 = the p2 row (first row written back)
MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 // q6 = basic filter mask
mov r2, r2, lsr #2
add r2, r2, #2 // r2 = (alpha >> 2) + 2
vdup.u8 q5, r2
vabd.u8 q0, q11, q12
vclt.u8 q7, q0, q5 // q7 = |p0-q0| < (alpha >> 2) + 2
vabd.u8 q1, q9, q11
vclt.u8 q1, q1, q4 // q1 = |p2-p0| < beta
vand.s8 q1, q1, q7
vabd.u8 q2, q14, q12
vclt.u8 q2, q2, q4 // q2 = |q2-q0| < beta
vand.s8 q2, q2, q7
vand.u8 q7, q7, q6
vmov q3, q1 // save the p-side strong condition
DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1 // q8 = p1', q0 = p2', q1 = p0'
vand.u8 q3, q7, q3 // strong-filter mask for the p side
DIFF_LUMA_EQ4_MASK q0, q9, q3, q4 // p2 row
vst1.u8 {q4}, [r3], r1
DIFF_LUMA_EQ4_MASK q8, q10, q3, q4 // p1 row
vst1.u8 {q4}, [r3], r1
DIFF_LUMA_EQ4_MASK q1, q11, q6, q4 // p0 row
vst1.u8 {q4}, [r3], r1
vmov q0, q2 // save the q-side strong condition
DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7 // q15 = q1', q3 = q2', q2 = q0'
vand.u8 q0, q7, q0 // strong-filter mask for the q side
DIFF_LUMA_EQ4_MASK q2, q12, q6, q4 // q0 row
vst1.u8 {q4}, [r3], r1
DIFF_LUMA_EQ4_MASK q15, q13, q0, q4 // q1 row
vst1.u8 {q4}, [r3], r1
DIFF_LUMA_EQ4_MASK q3, q14, q0, q4 // q2 row
vst1.u8 {q4}, [r3], r1
vpop {q4-q7}
WELS_ASM_FUNC_END
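// DeblockLumaLt4H_neon: the bS < 4 luma filter across a vertical edge.
// Same arguments as DeblockLumaLt4V_neon; 16 rows of p2..q2 are gathered
// with vld3 lane loads at pix-3 and rearranged into column registers.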
WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
vpush {q4-q7}
vdup.u8 q11, r2
vdup.u8 q9, r3
sub r2, r0, #3
LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
vswp d1, d2
vswp d3, d4
vswp d1, d4 // now q0 = p2, q1 = p1, q2 = p0
vswp d7, d8
vswp d9, d10
vswp d7, d10 // now q3 = q0, q4 = q1, q5 = q2
sub r0, r0, r1, lsl #4
ldr r3, [sp, #64] // tc0 pointer (original [sp]; vpush {q4-q7} moved sp by 64)
vld1.s8 {d31}, [r3]
vdup.s8 d28, d31[0] // expand the four tc0 values to one per pixel
vdup.s8 d30, d31[1]
vdup.s8 d29, d31[2]
vdup.s8 d31, d31[3]
vtrn.32 d28, d30
vtrn.32 d29, d31 // q14 = per-pixel tc0
vcge.s8 q10, q14, #0 // only filter where tc0 >= 0
MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
vand.u8 q10, q10, q15
veor q15, q15
vsub.i8 q15, q15, q14 // q15 = -tc0
DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12 // q6 = p1'
DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13 // q7 = q1'
vabs.s8 q12, q12
vabs.s8 q13, q13
vadd.u8 q14, q14, q12 // tc = tc0 + (|p2-p0| < beta) + (|q2-q0| < beta)
vadd.u8 q14, q14, q13
veor q15, q15
vsub.i8 q15, q15, q14 // q15 = -tc
DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
vmax.s8 q8, q8, q15
vmin.s8 q8, q8, q14
vand.s8 q8, q8, q10
EXTRACT_DELTA_INTO_TWO_PART q8, q9
vqadd.u8 q2, q2, q9
vqsub.u8 q2, q2, q8
vqsub.u8 q3, q3, q9
vqadd.u8 q3, q3, q8
sub r0, #2 // store at pix-2: the filter touches p1..q1 only
add r2, r0, r1
lsl r1, #1 // two rows per store iteration (even via r0, odd via r2)
vmov q1, q6 // q1 = p1'
vmov q4, q7 // q4 = q1'
vswp q2, q3
vswp d3, d6
vswp d5, d8 // interleave the p1,p0,q0,q1 columns for the vst4 stores
STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
vpop {q4-q7}
WELS_ASM_FUNC_END
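// DeblockLumaEq4H_neon: the strong luma filter across a vertical edge.
// Same arguments as DeblockLumaEq4V_neon. Sixteen 8-byte rows are loaded
// at pix-4 as full registers and transposed in-register (see the commit
// message at the top of this file).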
WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
vpush {q4-q7}
vdup.u8 q5, r2
vdup.u8 q4, r3
sub r3, r0, #4 // pix -= 4: each 8-byte row covers p3..q3
vld1.u8 {d16}, [r3], r1 // load 16 full rows, transposed below
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r3], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r3], r1
vld1.u8 {d21}, [r3], r1
vld1.u8 {d22}, [r3], r1
vld1.u8 {d23}, [r3], r1
vld1.u8 {d24}, [r3], r1
vld1.u8 {d25}, [r3], r1
vld1.u8 {d26}, [r3], r1
vld1.u8 {d27}, [r3], r1
vld1.u8 {d28}, [r3], r1
vld1.u8 {d29}, [r3], r1
vld1.u8 {d30}, [r3], r1
vld1.u8 {d31}, [r3], r1
// transpose the 16x8 block: 32-bit, 16-bit and 8-bit trn passes plus swaps
vtrn.u32 d16, d20
vtrn.u32 d17, d21
vtrn.u32 d18, d22
vtrn.u32 d19, d23
vtrn.u32 d24, d28
vtrn.u32 d25, d29
vtrn.u32 d26, d30
vtrn.u32 d27, d31
vtrn.u16 d16, d18
vtrn.u16 d17, d19
vtrn.u16 d20, d22
vtrn.u16 d21, d23
vtrn.u16 d24, d26
vtrn.u16 d25, d27
vtrn.u16 d28, d30
vtrn.u16 d29, d31
vtrn.u8 d16, d17
vtrn.u8 d18, d19
vtrn.u8 d20, d21
vtrn.u8 d22, d23
vtrn.u8 d24, d25
vtrn.u8 d26, d27
vtrn.u8 d28, d29
vtrn.u8 d30, d31
vswp d17, d24
vswp d19, d26
vswp d21, d28
vswp d23, d30
vswp q12, q9
vswp q14, q11
vswp q12, q10
vswp q13, q11
MASK_MATRIX q10, q11, q12, q13, q5, q4, q6 // q6 = basic filter mask
mov r2, r2, lsr #2
add r2, r2, #2 // r2 = (alpha >> 2) + 2
vdup.u8 q5, r2
vabd.u8 q0, q11, q12
vclt.u8 q7, q0, q5 // q7 = |p0-q0| < (alpha >> 2) + 2
vabd.u8 q1, q9, q11
vclt.u8 q1, q1, q4 // q1 = |p2-p0| < beta
vand.s8 q1, q1, q7
vabd.u8 q2, q14, q12
vclt.u8 q2, q2, q4 // q2 = |q2-q0| < beta
vand.s8 q2, q2, q7
vand.u8 q7, q7, q6
vmov q3, q1 // save the p-side strong condition
DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1 // q8 = p1', q0 = p2', q1 = p0'
vand.u8 q3, q7, q3 // strong-filter mask for the p side
DIFF_LUMA_EQ4_MASK q0, q9, q3, q4 // q4 = final p2 column
vmov q9, q4
vbsl.u8 q3, q8, q10 // q3 = final p1 column
DIFF_LUMA_EQ4_MASK q1, q11, q6, q8 // q8 = final p0 column
vand.u8 q7, q7, q2 // strong-filter mask for the q side
DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1 // q15 = q1', q0 = q2', q2 = q0'
vbsl.u8 q6, q2, q12 // q6 = final q0 column
DIFF_LUMA_EQ4_MASK q15, q13, q7, q4 // q4 = final q1 column
vbsl.u8 q7, q0, q14 // q7 = final q2 column
// gather the six output columns (p2..q2) into q2-q7 and interleave them
// for the vst3 stores below
vmov q5, q6
vmov q2, q9
vmov q6, q4
vmov q4, q8
vswp d8, d6
vswp d5, d7
vswp d5, d8
vswp d14, d12
vswp d11, d13
vswp d11, d14
sub r3, r0, #3
STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
vpop {q4-q7}
WELS_ASM_FUNC_END
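// DeblockChromaLt4V_neon: chroma deblocking across a horizontal edge, bS < 4.
// Register use: r0 = pix (Cb), r1 = pix (Cr), r2 = stride, r3 = alpha,
// [sp] = beta, [sp+4] = pointer to the four tc0 thresholds.
// Cb data is kept in d0/d2/d4/d6, Cr in d1/d3/d5/d7.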
WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
vdup.u8 q11, r3
ldr r3, [sp, #0]
sub r0, r0, r2, lsl #1 // back up both planes to the p1 row
sub r1, r1, r2, lsl #1
vdup.u8 q9, r3
ldr r3, [sp, #4]
vld1.u8 {d0}, [r0], r2 // q0 = p1 (Cb in d0, Cr in d1)
vld1.u8 {d1}, [r1], r2
vld1.u8 {d2}, [r0], r2 // q1 = p0
vld1.u8 {d3}, [r1], r2
vld1.u8 {d4}, [r0], r2 // q2 = q0
vld1.u8 {d5}, [r1], r2
vld1.u8 {d6}, [r0] // q3 = q1
vld1.u8 {d7}, [r1]
sub r0, r0, r2, lsl #1 // rewind both planes to the p0 row
sub r1, r1, r2, lsl #1
vld1.s8 {d31}, [r3] // load the four tc0 thresholds
vmovl.u8 q14, d31 // widen them to one byte per halfword in d28
vshl.u64 d29, d28, #8
vorr d28, d28, d29 // duplicate each tc0 so one value covers two pixels
vmov d29, d28 // same thresholds for the Cr half
veor q15, q15
vsub.i8 q15, q15, q14 // q15 = -tc0
MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
vmax.s8 q8, q8, q15
vmin.s8 q8, q8, q14
vand.s8 q8, q8, q10
vcge.s8 q14, q14, #0 // only filter where tc0 >= 0
vand.s8 q8, q8, q14
EXTRACT_DELTA_INTO_TWO_PART q8, q10
vqadd.u8 q1, q1, q10
vqsub.u8 q1, q1, q8
vst1.u8 {d2}, [r0], r2
vst1.u8 {d3}, [r1], r2
vqsub.u8 q2, q2, q10
vqadd.u8 q2, q2, q8
vst1.u8 {d4}, [r0]
vst1.u8 {d5}, [r1]
WELS_ASM_FUNC_END
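// DeblockChromaEq4V_neon: strong (bS = 4) chroma deblocking across a
// horizontal edge. Register use: r0 = pix (Cb), r1 = pix (Cr), r2 = stride,
// r3 = alpha, [sp] = beta.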
WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
vpush {q4-q5}
vdup.u8 q11, r3
ldr r3, [sp, #32]
sub r0, r0, r2, lsl #1 // back up both planes to the p1 row
sub r1, r1, r2, lsl #1
vdup.u8 q9, r3
vld1.u8 {d0}, [r0], r2 // q0 = p1 (Cb in d0, Cr in d1)
vld1.u8 {d1}, [r1], r2
vld1.u8 {d2}, [r0], r2 // q1 = p0
vld1.u8 {d3}, [r1], r2
vld1.u8 {d4}, [r0], r2 // q2 = q0
vld1.u8 {d5}, [r1], r2
vld1.u8 {d6}, [r0] // q3 = q1
vld1.u8 {d7}, [r1]
sub r0, r0, r2, lsl #1 // rewind both planes to the p0 row
sub r1, r1, r2, lsl #1
MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
vmov q11, q10
DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q8, d30, d0 // Cb::p0' q0'
DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0'
vbsl.u8 q10, q15, q1
vst1.u8 {d20}, [r0], r2
vst1.u8 {d21}, [r1], r2
vbsl.u8 q11, q0, q2
vst1.u8 {d22}, [r0]
vst1.u8 {d23}, [r1]
vpop {q4-q5}
WELS_ASM_FUNC_END
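// DeblockChromaLt4H_neon: the bS < 4 chroma filter across a vertical edge.
// Same arguments as DeblockChromaLt4V_neon; p1..q1 columns are gathered
// with vld4 lane loads at pix-2 and transposed via vswp.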
WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
vdup.u8 q11, r3
ldr r3, [sp, #0]
sub r0, r0, #2
vdup.u8 q9, r3
ldr r3, [sp, #4]
sub r1, r1, #2
vld1.s8 {d31}, [r3]
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
vswp q1, q2
vswp d1, d2
vswp d6, d5 // regroup: q0 = p1, q1 = p0, q2 = q0, q3 = q1 (Cb low, Cr high)
vmovl.u8 q14, d31 // widen the tc0 thresholds to one byte per halfword
vshl.u64 d29, d28, #8
vorr d28, d28, d29 // duplicate each tc0 so one value covers two pixels
vmov d29, d28 // same thresholds for the Cr half
veor q15, q15
vsub.i8 q15, q15, q14 // q15 = -tc0
MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
vmax.s8 q8, q8, q15
vmin.s8 q8, q8, q14
vand.s8 q8, q8, q10
vcge.s8 q14, q14, #0 // only filter where tc0 >= 0
vand.s8 q8, q8, q14
EXTRACT_DELTA_INTO_TWO_PART q8, q10
vqadd.u8 q1, q1, q10
vqsub.u8 q1, q1, q8
vqsub.u8 q2, q2, q10
vqadd.u8 q2, q2, q8
sub r0, r0, r2, lsl #3
sub r1, r1, r2, lsl #3
vswp d1, d2
vswp d6, d5
vswp q1, q2 // back to per-plane order: Cb in d0-d3, Cr in d4-d7
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
WELS_ASM_FUNC_END
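// DeblockChromaEq4H_neon: the strong chroma filter across a vertical edge.
// Same arguments as DeblockChromaEq4V_neon.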
WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
vpush {q4-q5}
vdup.u8 q11, r3
ldr r3, [sp, #32]
sub r0, r0, #2
sub r1, r1, #2
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
vswp q1, q2
vswp d1, d2
vswp d6, d5 // regroup: q0 = p1, q1 = p0, q2 = q0, q3 = q1 (Cb low, Cr high)
vdup.u8 q9, r3
MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
vmov q11, q10
DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10
DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11
vbsl.u8 q10, q4, q1
vbsl.u8 q11, q5, q2
sub r0, r0, r2, lsl #3 // rewind to the first row (still at pix-2)
sub r1, r1, r2, lsl #3
vmov q1, q10
vmov q2, q11
vswp d1, d2
vswp d6, d5
vswp q1, q2
// Cb: d0-d3, Cr: d4-d7
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
vpop {q4-q5}
WELS_ASM_FUNC_END
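// WelsNonZeroCount_neon: normalizes the 24-entry non-zero-count array at r0
// in place, mapping every byte x to !!x (vceq #0, vmvn, vabs yield 0 or 1).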
WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
vld1.64 {d0-d2}, [r0]
vceq.s8 q0, q0, #0
vceq.s8 d2, d2, #0
vmvn q0, q0
vmvn d2, d2
vabs.s8 q0, q0
vabs.s8 d2, d2
vst1.64 {d0-d2}, [r0]
WELS_ASM_FUNC_END
#ifdef __APPLE__
.macro BS_NZC_CHECK
vld1.8 {d0,d1}, [$0]
/* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_nzc_check_jump0
sub r6, $0, $2, lsl #4
sub r6, r6, $2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
vadd.u8 $3, q0, q1
/* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_nzc_check_jump1
sub r6, $0, #21
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 $4, q0, q1
.endm
.macro BS_COMPARE_MV // in: $0 (clobbered), $1-$4 (const); out: $5, $6
mov r6, #4
vabd.s16 q8, $0, $1
vabd.s16 q9, $1, $2
vdup.s16 $0, r6
vabd.s16 q10, $2, $3
vabd.s16 q11, $3, $4
vcge.s16 q8, $0
vcge.s16 q9, $0
vcge.s16 q10, $0
vcge.s16 q11, $0
vpadd.i16 d16, d16, d17
vpadd.i16 d17, d18, d19
vpadd.i16 d18, d20, d21
vpadd.i16 d19, d22, d23
vaddhn.i16 $5, q8, q8
vaddhn.i16 $6, q9, q9
.endm
.macro BS_MV_CHECK
vldm $0, {q0,q1,q2,q3}
/* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_mv_check_jump0
sub r6, $0, $2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
/* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_mv_check_jump1
sub r6, $0, #52
add r7, r6, #16
vld1.32 d8[0], [r6]
add r6, r7, #16
vld1.32 d8[1], [r7]
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
.endm
#else
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
vld1.8 {d0,d1}, [\arg0]
/* Arrange the input data --- TOP */
ands r6, \arg1, #2
beq bs_nzc_check_jump0
sub r6, \arg0, \arg2, lsl #4
sub r6, r6, \arg2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
vadd.u8 \arg3, q0, q1
/* Arrange the input data --- LEFT */
ands r6, \arg1, #1
beq bs_nzc_check_jump1
sub r6, \arg0, #21
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 \arg4, q0, q1
.endm
.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6 // in: arg0 (clobbered), arg1-arg4 (const); out: arg5, arg6
mov r6, #4
vabd.s16 q8, \arg0, \arg1
vabd.s16 q9, \arg1, \arg2
vdup.s16 \arg0, r6
vabd.s16 q10, \arg2, \arg3
vabd.s16 q11, \arg3, \arg4
vcge.s16 q8, \arg0
vcge.s16 q9, \arg0
vcge.s16 q10, \arg0
vcge.s16 q11, \arg0
vpadd.i16 d16, d16, d17
vpadd.i16 d17, d18, d19
vpadd.i16 d18, d20, d21
vpadd.i16 d19, d22, d23
vaddhn.i16 \arg5, q8, q8
vaddhn.i16 \arg6, q9, q9
.endm
.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
vldm \arg0, {q0,q1,q2,q3}
/* Arrange the input data --- TOP */
ands r6, \arg1, #2
beq bs_mv_check_jump0
sub r6, \arg0, \arg2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, \arg3, \arg4
/* Arrange the input data --- LEFT */
ands r6, \arg1, #1
beq bs_mv_check_jump1
sub r6, \arg0, #52
add r7, r6, #16
vld1.32 d8[0], [r6]
add r6, r7, #16
vld1.32 d8[1], [r7]
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6
.endm
#endif
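// DeblockingBSCalcEnc_neon: boundary-strength (bS) calculation for one
// macroblock on the encoder side. Register use, as suggested by the code
// (the exact prototype is an assumption): r0 = non-zero-count table,
// r1 = motion-vector table, r2 = neighbor-availability flags (bit 1: top,
// bit 0: left), r3 = per-row table stride, [sp] = output array of 32 bS
// values.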
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
stmdb sp!, {r5-r7}
vpush {q4}
ldr r5, [sp, #28] // load the pointer to the output bS array into r5
/* Checking the nzc status */
BS_NZC_CHECK r0, r2, r3, q14, q15 // q14, q15 receive the nzc status
/* For checking bS[i] = 2 */
mov r6, #2
vcgt.s8 q14, q14, #0
vdup.u8 q0, r6
vcgt.s8 q15, q15, #0
vand.u8 q14, q14, q0 // q14 = nzc check result for the top direction
vand.u8 q15, q15, q0 // q15 = nzc check result for the left direction
/* Checking the mv status*/
BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27 // q12, q13 receive the mv status
/* For checking bS[i] = 1 */
mov r6, #1
vdup.u8 q0, r6
vand.u8 q12, q12, q0 // q12 = mv check result for the top direction
vand.u8 q13, q13, q0 // q13 = mv check result for the left direction
/* bS[i] = max of the nzc ('2') and mv ('1') results */
vmax.u8 q1, q12, q14
vmax.u8 q0, q13, q15
//vstm r5, {q0, q1}
vst1.32 {q0, q1}, [r5]
vpop {q4}
ldmia sp!, {r5-r7}
WELS_ASM_FUNC_END
#endif