1622 lines
47 KiB
ArmAsm
Executable File
1622 lines
47 KiB
ArmAsm
Executable File
/*!
|
|
* \copy
|
|
* Copyright (c) 2013, Cisco Systems
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
*
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in
|
|
* the documentation and/or other materials provided with the
|
|
* distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
*/
|
|
|
|
#ifdef HAVE_NEON
|
|
.text
|
|
#include "arm_arch_common_macro.S"
|
|
|
|
#ifdef APPLE_IOS
|
|
.macro AVERAGE_TWO_8BITS
    // Rounded average of two vectors of eight unsigned bytes.
    //   argument 0    : destination d register
    //   argument 1, 2 : source d registers
    // Scratch register: q13.
    vaddl.u8    q13, $1, $2             // widen to 16 bits so the sum cannot wrap
    vrshrn.u16  $0, q13, #1             // (sum + 1) >> 1, narrowed back to 8 bits
.endm
|
|
|
|
//h_filter(src) = (src[-2] + src[3]) - 5*(src[-1] + src[2]) + 20*(src[0] + src[1])
//result        = clip((h_filter(src) + 16) >> 5)
.macro FILTER_6TAG_8BITS
    // 6-tap filter over eight unsigned byte lanes, result narrowed back to bytes.
    //   argument 0..5 : d registers holding src[-2], src[-1], src[0], src[1], src[2], src[3]
    //   argument 6    : destination d register
    //   argument 7, 8 : q registers with the multipliers for the centre pair and the
    //                   inner pair (the visible callers pass 20 and 5 in every u16 lane)
    // Scratch registers: q12 (accumulator) and q13.
    vaddl.u8    q12, $5, $0             // q12  = src[-2] + src[3]
    vaddl.u8    q13, $3, $2             // q13  = src[0] + src[1]
    vmla.u16    q12, q13, $7            // q12 += (src[0] + src[1]) * argument 7
    vaddl.u8    q13, $4, $1             // q13  = src[-1] + src[2]
    vmls.s16    q12, q13, $8            // q12 -= (src[-1] + src[2]) * argument 8
    vqrshrun.s16 $6, q12, #5            // (q12 + 16) >> 5 with unsigned saturation to 8 bits
.endm
|
|
|
|
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
    // 6-tap filter followed by a rounded average with the src[0] input.
    //   argument 0..5 : d registers holding src[-2], src[-1], src[0], src[1], src[2], src[3]
    //   argument 6    : destination d register
    //   argument 7, 8 : q registers with the multipliers (visible callers pass 20 and 5)
    // Scratch registers: q12 (accumulator) and q13.
    vaddl.u8    q12, $5, $0             // q12  = src[-2] + src[3]
    vaddl.u8    q13, $3, $2             // q13  = src[0] + src[1]
    vmla.u16    q12, q13, $7            // q12 += (src[0] + src[1]) * argument 7
    vaddl.u8    q13, $4, $1             // q13  = src[-1] + src[2]
    vmls.s16    q12, q13, $8            // q12 -= (src[-1] + src[2]) * argument 8
    vqrshrun.s16 $6, q12, #5            // filtered = saturate_u8((q12 + 16) >> 5)
    vaddl.u8    q13, $6, $2             // q13  = filtered + src[0]
    vrshrn.u16  $6, q13, #1             // dst  = (filtered + src[0] + 1) >> 1
.endm
|
|
|
|
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
    // 6-tap filter followed by a rounded average with the src[1] input.
    //   argument 0..5 : d registers holding src[-2], src[-1], src[0], src[1], src[2], src[3]
    //   argument 6    : destination d register
    //   argument 7, 8 : q registers with the multipliers (visible callers pass 20 and 5)
    // Scratch registers: q12 (accumulator) and q13.
    vaddl.u8    q12, $5, $0             // q12  = src[-2] + src[3]
    vaddl.u8    q13, $3, $2             // q13  = src[0] + src[1]
    vmla.u16    q12, q13, $7            // q12 += (src[0] + src[1]) * argument 7
    vaddl.u8    q13, $4, $1             // q13  = src[-1] + src[2]
    vmls.s16    q12, q13, $8            // q12 -= (src[-1] + src[2]) * argument 8
    vqrshrun.s16 $6, q12, #5            // filtered = saturate_u8((q12 + 16) >> 5)
    vaddl.u8    q13, $6, $3             // q13  = filtered + src[1]
    vrshrn.u16  $6, q13, #1             // dst  = (filtered + src[1] + 1) >> 1
.endm
|
|
|
|
.macro FILTER_6TAG_8BITS_TO_16BITS
    // 6-tap filter over eight unsigned byte lanes, keeping the unscaled 16-bit sums.
    //   argument 0..5 : d registers holding src[-2], src[-1], src[0], src[1], src[2], src[3]
    //   argument 6    : destination q register (no rounding, shift or clipping is applied)
    //   argument 7, 8 : q registers with the multipliers for the centre and inner pairs
    // Scratch register: q13.
    vaddl.u8    $6, $5, $0              // dst  = src[-2] + src[3]
    vaddl.u8    q13, $3, $2             // q13  = src[0] + src[1]
    vmla.u16    $6, q13, $7             // dst += (src[0] + src[1]) * argument 7
    vaddl.u8    q13, $4, $1             // q13  = src[-1] + src[2]
    vmls.s16    $6, q13, $8             // dst -= (src[-1] + src[2]) * argument 8
.endm
|
|
|
|
.macro FILTER_3_IN_16BITS_TO_8BITS
    // Combines three signed 16-bit vectors a, b, c into (a - 5*b + 20*c) / 16
    // using shifts and adds only, then rounds, shifts by 6 and saturates to bytes.
    // The two intermediate arithmetic shifts truncate, so the /16 is approximate.
    //   argument 0 : a, q register, overwritten (used as the accumulator)
    //   argument 1 : b, q register
    //   argument 2 : c, q register
    //   argument 3 : destination d register
    vsub.s16    $0, $1                  // a - b
    vshr.s16    $0, $0, #2              // (a - b) / 4
    vsub.s16    $0, $1                  // (a - b) / 4 - b
    vadd.s16    $0, $2                  // (a - b) / 4 - b + c
    vshr.s16    $0, $0, #2              // ((a - b) / 4 - b + c) / 4
    vadd.s16    $0, $2                  // ... + c  =  (a - 5*b + 20*c) / 16
    vqrshrun.s16 $3, $0, #6             // (value + 32) >> 6 with unsigned saturation
.endm
|
|
|
|
.macro UNPACK_2_16BITS_TO_ABC
    // Builds the three pair sums of a 6-tap window from a run of 16-bit samples.
    //   argument 0 : q register, samples src[-2 .. 5]
    //   argument 1 : q register, the samples that follow (src[6 ..])
    //   argument 2 : out a = src[-2] + src[3]
    //   argument 3 : out b = src[-1] + src[2]
    //   argument 4 : out c = src[0]  + src[1]
    // Arguments 2 and 3 double as temporaries; the outputs must not overlap the
    // two inputs because the inputs are read until the last instruction.
    vext.16     $4, $0, $1, #2          // src[0]
    vext.16     $3, $0, $1, #3          // src[1]
    vadd.s16    $4, $4, $3              // c = src[0] + src[1]

    vext.16     $3, $0, $1, #1          // src[-1]
    vext.16     $2, $0, $1, #4          // src[2]
    vadd.s16    $3, $3, $2              // b = src[-1] + src[2]

    vext.16     $2, $0, $1, #5          // src[3]
    vadd.s16    $2, $2, $0              // a = src[-2] + src[3]
.endm
|
|
#else
|
|
.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
    // Rounded average of two vectors of eight unsigned bytes.
    //   arg0       : destination d register
    //   arg1, arg2 : source d registers
    // Scratch register: q13.
    vaddl.u8    q13, \arg1, \arg2       // widen to 16 bits so the sum cannot wrap
    vrshrn.u16  \arg0, q13, #1          // (sum + 1) >> 1, narrowed back to 8 bits
.endm
|
|
|
|
//h_filter(src) = (src[-2] + src[3]) - 5*(src[-1] + src[2]) + 20*(src[0] + src[1])
//result        = clip((h_filter(src) + 16) >> 5)
.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    // 6-tap filter over eight unsigned byte lanes, result narrowed back to bytes.
    //   arg0..arg5 : d registers holding src[-2], src[-1], src[0], src[1], src[2], src[3]
    //   arg6       : destination d register
    //   arg7, arg8 : q registers with the multipliers for the centre pair and the
    //                inner pair (the visible callers pass 20 and 5 in every u16 lane)
    // Scratch registers: q12 (accumulator) and q13.
    vaddl.u8    q12, \arg5, \arg0       // q12  = src[-2] + src[3]
    vaddl.u8    q13, \arg3, \arg2       // q13  = src[0] + src[1]
    vmla.u16    q12, q13, \arg7         // q12 += (src[0] + src[1]) * arg7
    vaddl.u8    q13, \arg4, \arg1       // q13  = src[-1] + src[2]
    vmls.s16    q12, q13, \arg8         // q12 -= (src[-1] + src[2]) * arg8
    vqrshrun.s16 \arg6, q12, #5         // (q12 + 16) >> 5 with unsigned saturation to 8 bits
.endm
|
|
|
|
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    // 6-tap filter followed by a rounded average with the src[0] input (arg2).
    //   arg0..arg5 : d registers holding src[-2], src[-1], src[0], src[1], src[2], src[3]
    //   arg6       : destination d register
    //   arg7, arg8 : q registers with the multipliers (visible callers pass 20 and 5)
    // Scratch registers: q12 (accumulator) and q13.
    vaddl.u8    q12, \arg5, \arg0       // q12  = src[-2] + src[3]
    vaddl.u8    q13, \arg3, \arg2       // q13  = src[0] + src[1]
    vmla.u16    q12, q13, \arg7         // q12 += (src[0] + src[1]) * arg7
    vaddl.u8    q13, \arg4, \arg1       // q13  = src[-1] + src[2]
    vmls.s16    q12, q13, \arg8         // q12 -= (src[-1] + src[2]) * arg8
    vqrshrun.s16 \arg6, q12, #5         // filtered = saturate_u8((q12 + 16) >> 5)
    vaddl.u8    q13, \arg6, \arg2       // q13  = filtered + src[0]
    vrshrn.u16  \arg6, q13, #1          // dst  = (filtered + src[0] + 1) >> 1
.endm
|
|
|
|
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    // 6-tap filter followed by a rounded average with the src[1] input (arg3).
    //   arg0..arg5 : d registers holding src[-2], src[-1], src[0], src[1], src[2], src[3]
    //   arg6       : destination d register
    //   arg7, arg8 : q registers with the multipliers (visible callers pass 20 and 5)
    // Scratch registers: q12 (accumulator) and q13.
    vaddl.u8    q12, \arg5, \arg0       // q12  = src[-2] + src[3]
    vaddl.u8    q13, \arg3, \arg2       // q13  = src[0] + src[1]
    vmla.u16    q12, q13, \arg7         // q12 += (src[0] + src[1]) * arg7
    vaddl.u8    q13, \arg4, \arg1       // q13  = src[-1] + src[2]
    vmls.s16    q12, q13, \arg8         // q12 -= (src[-1] + src[2]) * arg8
    vqrshrun.s16 \arg6, q12, #5         // filtered = saturate_u8((q12 + 16) >> 5)
    vaddl.u8    q13, \arg6, \arg3       // q13  = filtered + src[1]
    vrshrn.u16  \arg6, q13, #1          // dst  = (filtered + src[1] + 1) >> 1
.endm
|
|
|
|
.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    // 6-tap filter over eight unsigned byte lanes, keeping the unscaled 16-bit sums.
    //   arg0..arg5 : d registers holding src[-2], src[-1], src[0], src[1], src[2], src[3]
    //   arg6       : destination q register (no rounding, shift or clipping is applied)
    //   arg7, arg8 : q registers with the multipliers for the centre and inner pairs
    // Scratch register: q13.
    vaddl.u8    \arg6, \arg5, \arg0     // dst  = src[-2] + src[3]
    vaddl.u8    q13, \arg3, \arg2       // q13  = src[0] + src[1]
    vmla.u16    \arg6, q13, \arg7       // dst += (src[0] + src[1]) * arg7
    vaddl.u8    q13, \arg4, \arg1       // q13  = src[-1] + src[2]
    vmls.s16    \arg6, q13, \arg8       // dst -= (src[-1] + src[2]) * arg8
.endm
|
|
|
|
.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
    // Combines three signed 16-bit vectors a, b, c into (a - 5*b + 20*c) / 16
    // using shifts and adds only, then rounds, shifts by 6 and saturates to bytes.
    // The two intermediate arithmetic shifts truncate, so the /16 is approximate.
    //   arg0 : a, q register, overwritten (used as the accumulator)
    //   arg1 : b, q register
    //   arg2 : c, q register
    //   arg3 : destination d register
    vsub.s16    \arg0, \arg1            // a - b
    vshr.s16    \arg0, \arg0, #2        // (a - b) / 4
    vsub.s16    \arg0, \arg1            // (a - b) / 4 - b
    vadd.s16    \arg0, \arg2            // (a - b) / 4 - b + c
    vshr.s16    \arg0, \arg0, #2        // ((a - b) / 4 - b + c) / 4
    vadd.s16    \arg0, \arg2            // ... + c  =  (a - 5*b + 20*c) / 16
    vqrshrun.s16 \arg3, \arg0, #6       // (value + 32) >> 6 with unsigned saturation
.endm
|
|
|
|
.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
    // Builds the three pair sums of a 6-tap window from a run of 16-bit samples.
    //   arg0 : q register, samples src[-2 .. 5]
    //   arg1 : q register, the samples that follow (src[6 ..])
    //   arg2 : out a = src[-2] + src[3]
    //   arg3 : out b = src[-1] + src[2]
    //   arg4 : out c = src[0]  + src[1]
    // arg2 and arg3 double as temporaries; the outputs must not overlap the two
    // inputs because the inputs are read until the last instruction.
    vext.16     \arg4, \arg0, \arg1, #2     // src[0]
    vext.16     \arg3, \arg0, \arg1, #3     // src[1]
    vadd.s16    \arg4, \arg4, \arg3         // c = src[0] + src[1]

    vext.16     \arg3, \arg0, \arg1, #1     // src[-1]
    vext.16     \arg2, \arg0, \arg1, #4     // src[2]
    vadd.s16    \arg3, \arg3, \arg2         // b = src[-1] + src[2]

    vext.16     \arg2, \arg0, \arg1, #5     // src[3]
    vadd.s16    \arg2, \arg2, \arg0         // a = src[-2] + src[3]
.endm
|
|
#endif
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter, 16 output bytes per row, one row per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The loop only exits when the counter reaches exactly zero, so a height of 0
//   is not handled.
// NEON registers written: q0-q3, q8-q10, q12-q15. The callee-saved range d8-d15
// (q4-q7) is deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)

    sub         r0, r0, #2              // r0 -> src[-2], leftmost tap of the first output
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane

w16_h_mc_luma_loop:
    vld1.u8     {d0,d1,d2}, [r0], r1    // 24 bytes loaded, 21 (16+5) used; q0 = src[-2]
    pld         [r0]
    pld         [r0, #16]

    vext.8      q2, q0, q1, #1          // q2  = src[-1]
    vext.8      q3, q0, q1, #2          // q3  = src[0]
    vext.8      q8, q0, q1, #3          // q8  = src[1]
    vext.8      q9, q0, q1, #4          // q9  = src[2]
    vext.8      q10, q0, q1, #5         // q10 = src[3]

    FILTER_6TAG_8BITS   d0, d4, d6, d16, d18, d20, d2, q14, q15    // outputs 0..7  -> d2
    FILTER_6TAG_8BITS   d1, d5, d7, d17, d19, d21, d3, q14, q15    // outputs 8..15 -> d3

    vst1.u8     {d2, d3}, [r2], r3      // write 16 bytes
    subs        r4, r4, #1
    bne         w16_h_mc_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter, 8 output bytes per row, one row per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
// NEON registers written: d0-d6, q12-q15.
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
    push        {r4}
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)
    sub         r0, r0, #2              // r0 -> src[-2]

w8_h_mc_luma_loop:
    vld1.u8     {d0,d1}, [r0], r1       // 16 bytes loaded, 13 (8+5) used; d0 = src[-2]
    pld         [r0]

    vext.8      d6, d0, d1, #5          // d6 = src[3]
    vext.8      d5, d0, d1, #4          // d5 = src[2]
    vext.8      d4, d0, d1, #3          // d4 = src[1]
    vext.8      d3, d0, d1, #2          // d3 = src[0]
    vext.8      d2, d0, d1, #1          // d2 = src[-1]

    FILTER_6TAG_8BITS   d0, d2, d3, d4, d5, d6, d1, q14, q15

    vst1.u8     {d1}, [r2], r3          // write 8 bytes
    subs        r4, r4, #1
    bne         w8_h_mc_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter, 4 output bytes per row, two rows per loop iteration
// (both rows are packed into one 8-byte vector and filtered together).
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The counter drops by 2 per iteration and the loop exits only at exactly zero,
//   so height has to be a positive even number.
// NEON registers written: q0-q3, q8, q12-q15. The callee-saved range d8-d15 is
// deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
    push        {r4, r5, r6}
    ldr         r6, [sp, #12]           // r6 = height (above the three saved registers)

    sub         r0, r0, #2              // r0 -> src[-2]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane

w4_h_mc_luma_loop:
    vld1.u8     {d0, d1}, [r0], r1      // only 9 (4+5) bytes used; d0: 1st row src[-2:5]
    pld         [r0]
    vld1.u8     {d2, d3}, [r0], r1      // d2: 2nd row src[-2:5]
    pld         [r0]

    vext.8      d4, d0, d1, #1          // d4: 1st row src[-1:6]
    vext.8      d5, d2, d3, #1          // d5: 2nd row src[-1:6]
    vext.8      q3, q2, q2, #1          // d6/d7: rows 1/2 src[0:6] plus one filler byte
    vext.8      q8, q2, q2, #2          // d16/d17: rows 1/2 src[1:6] plus two filler bytes

    vtrn.32     q3, q8                  // d6: 1st row [0:3]+[1:4]; d7: 2nd row [0:3]+[1:4]
    vtrn.32     d6, d7                  // d6: [0:3] of both rows; d7: [1:4] of both rows
    vtrn.32     d0, d2                  // d0: [-2:1] of both rows; d2: [2:5] of both rows
    vtrn.32     d4, d5                  // d4: [-1:2] of both rows; d5: [3:6] of both rows

    FILTER_6TAG_8BITS   d0, d4, d6, d7, d2, d5, d1, q14, q15

    vmov        r4, r5, d1              // r4 = 1st row result, r5 = 2nd row result
    str         r4, [r2], r3
    str         r5, [r2], r3

    subs        r6, r6, #2
    bne         w4_h_mc_luma_loop

    pop         {r4, r5, r6}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter averaged (with rounding) against the src[0] pixel,
// 16 output bytes per row, one row per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The loop only exits when the counter reaches exactly zero, so a height of 0
//   is not handled.
// NEON registers written: q0-q3, q8-q10, q12-q15. The callee-saved range d8-d15
// (q4-q7) is deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)

    sub         r0, r0, #2              // r0 -> src[-2]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane

w16_xy_10_mc_luma_loop:
    vld1.u8     {d0,d1,d2}, [r0], r1    // 24 bytes loaded, 21 (16+5) used; q0 = src[-2]
    pld         [r0]
    pld         [r0, #16]

    vext.8      q2, q0, q1, #1          // q2  = src[-1]
    vext.8      q3, q0, q1, #2          // q3  = src[0]
    vext.8      q8, q0, q1, #3          // q8  = src[1]
    vext.8      q9, q0, q1, #4          // q9  = src[2]
    vext.8      q10, q0, q1, #5         // q10 = src[3]

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d4, d6, d16, d18, d20, d2, q14, q15    // outputs 0..7  -> d2
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d5, d7, d17, d19, d21, d3, q14, q15    // outputs 8..15 -> d3

    vst1.u8     {d2, d3}, [r2], r3      // write 16 bytes
    subs        r4, r4, #1
    bne         w16_xy_10_mc_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter averaged (with rounding) against the src[0] pixel,
// 8 output bytes per row, one row per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
// NEON registers written: d0-d6, q12-q15.
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
    push        {r4}
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)
    sub         r0, r0, #2              // r0 -> src[-2]

w8_xy_10_mc_luma_loop:
    vld1.u8     {d0,d1}, [r0], r1       // 16 bytes loaded, 13 (8+5) used; d0 = src[-2]
    pld         [r0]

    vext.8      d6, d0, d1, #5          // d6 = src[3]
    vext.8      d5, d0, d1, #4          // d5 = src[2]
    vext.8      d4, d0, d1, #3          // d4 = src[1]
    vext.8      d3, d0, d1, #2          // d3 = src[0]
    vext.8      d2, d0, d1, #1          // d2 = src[-1]

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d3, d4, d5, d6, d1, q14, q15

    vst1.u8     {d1}, [r2], r3          // write 8 bytes
    subs        r4, r4, #1
    bne         w8_xy_10_mc_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter averaged (with rounding) against the src[0] pixel,
// 4 output bytes per row, two rows per loop iteration (both rows are packed
// into one 8-byte vector and filtered together).
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The counter drops by 2 per iteration and the loop exits only at exactly zero,
//   so height has to be a positive even number.
// NEON registers written: q0-q3, q8, q12-q15. The callee-saved range d8-d15 is
// deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
    push        {r4, r5, r6}
    ldr         r6, [sp, #12]           // r6 = height (above the three saved registers)

    sub         r0, r0, #2              // r0 -> src[-2]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane

w4_xy_10_mc_luma_loop:
    vld1.u8     {d0, d1}, [r0], r1      // only 9 (4+5) bytes used; d0: 1st row src[-2:5]
    pld         [r0]
    vld1.u8     {d2, d3}, [r0], r1      // d2: 2nd row src[-2:5]
    pld         [r0]

    vext.8      d4, d0, d1, #1          // d4: 1st row src[-1:6]
    vext.8      d5, d2, d3, #1          // d5: 2nd row src[-1:6]
    vext.8      q3, q2, q2, #1          // d6/d7: rows 1/2 src[0:6] plus one filler byte
    vext.8      q8, q2, q2, #2          // d16/d17: rows 1/2 src[1:6] plus two filler bytes

    vtrn.32     q3, q8                  // d6: 1st row [0:3]+[1:4]; d7: 2nd row [0:3]+[1:4]
    vtrn.32     d6, d7                  // d6: [0:3] of both rows; d7: [1:4] of both rows
    vtrn.32     d0, d2                  // d0: [-2:1] of both rows; d2: [2:5] of both rows
    vtrn.32     d4, d5                  // d4: [-1:2] of both rows; d5: [3:6] of both rows

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d4, d6, d7, d2, d5, d1, q14, q15

    vmov        r4, r5, d1              // r4 = 1st row result, r5 = 2nd row result
    str         r4, [r2], r3
    str         r5, [r2], r3

    subs        r6, r6, #2
    bne         w4_xy_10_mc_luma_loop

    pop         {r4, r5, r6}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter averaged (with rounding) against the src[1] pixel,
// 16 output bytes per row, one row per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The loop only exits when the counter reaches exactly zero, so a height of 0
//   is not handled.
// NEON registers written: q0-q3, q8-q10, q12-q15. The callee-saved range d8-d15
// (q4-q7) is deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)

    sub         r0, r0, #2              // r0 -> src[-2]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane

w16_xy_30_mc_luma_loop:
    vld1.u8     {d0,d1,d2}, [r0], r1    // 24 bytes loaded, 21 (16+5) used; q0 = src[-2]
    pld         [r0]
    pld         [r0, #16]

    vext.8      q2, q0, q1, #1          // q2  = src[-1]
    vext.8      q3, q0, q1, #2          // q3  = src[0]
    vext.8      q8, q0, q1, #3          // q8  = src[1]
    vext.8      q9, q0, q1, #4          // q9  = src[2]
    vext.8      q10, q0, q1, #5         // q10 = src[3]

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d4, d6, d16, d18, d20, d2, q14, q15    // outputs 0..7  -> d2
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d5, d7, d17, d19, d21, d3, q14, q15    // outputs 8..15 -> d3

    vst1.u8     {d2, d3}, [r2], r3      // write 16 bytes
    subs        r4, r4, #1
    bne         w16_xy_30_mc_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter averaged (with rounding) against the src[1] pixel,
// 8 output bytes per row, one row per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
// NEON registers written: d0-d6, q12-q15.
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
    push        {r4}
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)
    sub         r0, r0, #2              // r0 -> src[-2]

w8_xy_30_mc_luma_loop:
    vld1.u8     {d0,d1}, [r0], r1       // 16 bytes loaded, 13 (8+5) used; d0 = src[-2]
    pld         [r0]

    vext.8      d6, d0, d1, #5          // d6 = src[3]
    vext.8      d5, d0, d1, #4          // d5 = src[2]
    vext.8      d4, d0, d1, #3          // d4 = src[1]
    vext.8      d3, d0, d1, #2          // d3 = src[0]
    vext.8      d2, d0, d1, #1          // d2 = src[-1]

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d3, d4, d5, d6, d1, q14, q15

    vst1.u8     {d1}, [r2], r3          // write 8 bytes
    subs        r4, r4, #1
    bne         w8_xy_30_mc_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Horizontal 6-tap filter averaged (with rounding) against the src[1] pixel,
// 4 output bytes per row, two rows per loop iteration (both rows are packed
// into one 8-byte vector and filtered together).
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The counter drops by 2 per iteration and the loop exits only at exactly zero,
//   so height has to be a positive even number.
// NEON registers written: q0-q3, q8, q12-q15. The callee-saved range d8-d15 is
// deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
    push        {r4, r5, r6}
    ldr         r6, [sp, #12]           // r6 = height (above the three saved registers)

    sub         r0, r0, #2              // r0 -> src[-2]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vshr.u16    q15, q14, #2            // 5 in every u16 lane

w4_xy_30_mc_luma_loop:
    vld1.u8     {d0, d1}, [r0], r1      // only 9 (4+5) bytes used; d0: 1st row src[-2:5]
    pld         [r0]
    vld1.u8     {d2, d3}, [r0], r1      // d2: 2nd row src[-2:5]
    pld         [r0]

    vext.8      d4, d0, d1, #1          // d4: 1st row src[-1:6]
    vext.8      d5, d2, d3, #1          // d5: 2nd row src[-1:6]
    vext.8      q3, q2, q2, #1          // d6/d7: rows 1/2 src[0:6] plus one filler byte
    vext.8      q8, q2, q2, #2          // d16/d17: rows 1/2 src[1:6] plus two filler bytes

    vtrn.32     q3, q8                  // d6: 1st row [0:3]+[1:4]; d7: 2nd row [0:3]+[1:4]
    vtrn.32     d6, d7                  // d6: [0:3] of both rows; d7: [1:4] of both rows
    vtrn.32     d0, d2                  // d0: [-2:1] of both rows; d2: [2:5] of both rows
    vtrn.32     d4, d5                  // d4: [-1:2] of both rows; d5: [3:6] of both rows

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d4, d6, d7, d2, d5, d1, q14, q15

    vmov        r4, r5, d1              // r4 = 1st row result, r5 = 2nd row result
    str         r4, [r2], r3
    str         r5, [r2], r3

    subs        r6, r6, #2
    bne         w4_xy_30_mc_luma_loop

    pop         {r4, r5, r6}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Vertical 6-tap filter averaged (with rounding) against the src[0] row,
// 16 bytes wide, eight output rows per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The counter drops by 8 per iteration and the loop exits only at exactly zero,
//   so height has to be a positive multiple of 8.
// Row window: q0, q1, q2, q3, q8, q9 hold six consecutive source rows and are
// reused in rotation; q10 (d20/d21) receives each output row.
// NEON registers written: q0-q3, q8-q10, q12-q15. The callee-saved range d8-d15
// (q4-q7) is deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)

    sub         r0, r0, r1, lsl #1      // r0 -> src[-2*src_stride]
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vld1.u8     {q0}, [r0], r1          // q0 = src[-2]
    vld1.u8     {q1}, [r0], r1          // q1 = src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    vld1.u8     {q2}, [r0], r1          // q2 = src[0]
    vld1.u8     {q3}, [r0], r1          // q3 = src[1]
    vld1.u8     {q8}, [r0], r1          // q8 = src[2]

w16_xy_01_luma_loop:

    vld1.u8     {q9}, [r0], r1          // q9 = src[3]

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d4, d6, d16, d18, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d3, d5, d7, d17, d19, d21, q14, q15
    vld1.u8     {q0}, [r0], r1          // next source row replaces the oldest one
    vst1.u8     {q10}, [r2], r3         // write 1st 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d4, d6, d16, d18, d0, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d5, d7, d17, d19, d1, d21, q14, q15
    vld1.u8     {q1}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 2nd 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d4, d6, d16, d18, d0, d2, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d5, d7, d17, d19, d1, d3, d21, q14, q15
    vld1.u8     {q2}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 3rd 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d6, d16, d18, d0, d2, d4, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d7, d17, d19, d1, d3, d5, d21, q14, q15
    vld1.u8     {q3}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 4th 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d16, d18, d0, d2, d4, d6, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d17, d19, d1, d3, d5, d7, d21, q14, q15
    vld1.u8     {q8}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 5th 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d18, d0, d2, d4, d6, d16, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d19, d1, d3, d5, d7, d17, d21, q14, q15
    vld1.u8     {q9}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 6th 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d4, d6, d16, d18, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d3, d5, d7, d17, d19, d21, q14, q15
    vld1.u8     {q0}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 7th 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d4, d6, d16, d18, d0, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d5, d7, d17, d19, d1, d21, q14, q15
    vst1.u8     {q10}, [r2], r3         // write 8th 16 bytes

    // The five newest rows sit in q2, q3, q8, q9, q0 (oldest first);
    // move them back to q0, q1, q2, q3, q8 for the next iteration.
    vswp        q0, q8
    vswp        q0, q2
    vmov        q1, q3
    vmov        q3, q9

    subs        r4, r4, #8
    bne         w16_xy_01_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
// Vertical 6-tap filter averaged (with rounding) against the src[0] row,
// 8 bytes wide, four output rows per loop iteration.
//   As read from the code: r0 = source pointer stepped by r1 per row,
//   r2 = destination pointer stepped by r3 per row, first stack word = row count.
//   The counter drops by 4 per iteration and the loop exits only at exactly zero,
//   so the row count has to be a positive multiple of 4.
// Row window: d0..d5 hold six consecutive source rows, d16 receives each output.
// Every iteration loads one row ahead, so the last iteration reads one source
// row that is never filtered.
// NEON registers written: d0-d5, d16, q12-q15. The callee-saved range d8-d15 is
// deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = row count (stack argument above the saved r4)

    sub         r0, r0, r1, lsl #1      // r0 -> src[-2*src_stride]
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vld1.u8     {d0}, [r0], r1          // d0 = src[-2]
    vld1.u8     {d1}, [r0], r1          // d1 = src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    vld1.u8     {d2}, [r0], r1          // d2 = src[0]
    vld1.u8     {d3}, [r0], r1          // d3 = src[1]

    vld1.u8     {d4}, [r0], r1          // d4 = src[2]
    vld1.u8     {d5}, [r0], r1          // d5 = src[3]

w8_xy_01_mc_luma_loop:

    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d1, d2, d3, d4, d5, d16, q14, q15
    vld1.u8     {d0}, [r0], r1          // next source row replaces the oldest one
    vst1.u8     {d16}, [r2], r3         // write 1st 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d2, d3, d4, d5, d0, d16, q14, q15
    vld1.u8     {d1}, [r0], r1
    vst1.u8     {d16}, [r2], r3         // write 2nd 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d3, d4, d5, d0, d1, d16, q14, q15
    vld1.u8     {d2}, [r0], r1
    vst1.u8     {d16}, [r2], r3         // write 3rd 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d4, d5, d0, d1, d2, d16, q14, q15
    vld1.u8     {d3}, [r0], r1          // read-ahead for the next iteration
    vst1.u8     {d16}, [r2], r3         // write 4th 8 bytes

    // Rows are now ordered d4, d5, d0, d1, d2, d3; rotate back to d0..d5.
    vswp        q0, q2
    vswp        q1, q2

    subs        r4, r4, #4
    bne         w8_xy_01_mc_luma_loop

    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
// Vertical 6-tap filter averaged (with rounding) against the src[0] row,
// 4 bytes wide, four output rows per loop iteration.
//   As read from the code: r0 = source pointer stepped by r1 per row,
//   r2 = destination pointer stepped by r3 per row, first stack word = row count.
//   The counter drops by 4 per iteration and the loop exits only at exactly zero,
//   so the row count has to be a positive multiple of 4.
// Each source row is fetched as one 32-bit word with ldr, and two vertically
// adjacent rows are packed per d register so one filter pass yields two rows:
//   d0 = rows (-2,-1), d1 = (-1,0), d2 = (0,1), d3 = (1,2), d4 = (2,3), d5 = (3,4)
// NEON registers written: d0-d5, d16, q12-q15. The callee-saved range d8-d15 is
// deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
    push        {r4, r5, r6, r7}
    sub         r0, r0, r1, lsl #1      // r0 -> src[-2*src_stride]
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    ldr         r4, [r0], r1            // r4 = src[-2]
    ldr         r5, [r0], r1            // r5 = src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    ldr         r6, [r0], r1            // r6 = src[0]
    ldr         r7, [r0], r1            // r7 = src[1]

    vmov        d0, r4, r5
    vmov        d1, r5, r6
    vmov        d2, r6, r7

    ldr         r4, [r0], r1            // r4 = src[2]
    vmov        d3, r7, r4
    ldr         r7, [sp, #16]           // r7 = row count (above the four saved registers)

w4_xy_01_mc_luma_loop:

    // r4 still holds src[2] from the prologue or the previous iteration
    ldr         r5, [r0], r1            // r5 = src[3]
    ldr         r6, [r0], r1            // r6 = src[4]
    vmov        d4, r4, r5
    vmov        d5, r5, r6              // r6 is kept for the second half

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d1, d2, d3, d4, d5, d16, q14, q15
    vmov        r4, r5, d16
    str         r4, [r2], r3            // write 1st 4 bytes
    str         r5, [r2], r3            // write 2nd 4 bytes

    ldr         r5, [r0], r1            // r5 = src[5]
    ldr         r4, [r0], r1            // r4 = src[6]
    vmov        d0, r6, r5
    vmov        d1, r5, r4              // r4 is kept for the next iteration

    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d3, d4, d5, d0, d1, d16, q14, q15
    vmov        r5, r6, d16
    str         r5, [r2], r3            // write 3rd 4 bytes
    str         r6, [r2], r3            // write 4th 4 bytes

    // Window is now d4, d5, d0, d1; rotate it back into d0, d1, d2, d3.
    vmov        q1, q0
    vmov        q0, q2

    subs        r7, r7, #4
    bne         w4_xy_01_mc_luma_loop

    pop         {r4, r5, r6, r7}
WELS_ASM_FUNC_END
|
|
|
|
// Vertical 6-tap filter averaged (with rounding) against the src[1] row,
// 16 bytes wide, eight output rows per loop iteration.
//   As read from the code: r0 = source pointer stepped by r1 per row,
//   r2 = destination pointer stepped by r3 per row, first stack word = row count.
//   The counter drops by 8 per iteration and the loop exits only at exactly zero,
//   so the row count has to be a positive multiple of 8.
// Row window: q0, q1, q2, q3, q8, q9 hold six consecutive source rows and are
// reused in rotation; q10 (d20/d21) receives each output row.
// NEON registers written: q0-q3, q8-q10, q12-q15. The callee-saved range d8-d15
// (q4-q7) is deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = row count (stack argument above the saved r4)

    sub         r0, r0, r1, lsl #1      // r0 -> src[-2*src_stride]
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vld1.u8     {q0}, [r0], r1          // q0 = src[-2]
    vld1.u8     {q1}, [r0], r1          // q1 = src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    vld1.u8     {q2}, [r0], r1          // q2 = src[0]
    vld1.u8     {q3}, [r0], r1          // q3 = src[1]
    vld1.u8     {q8}, [r0], r1          // q8 = src[2]

w16_xy_03_luma_loop:

    vld1.u8     {q9}, [r0], r1          // q9 = src[3]

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d4, d6, d16, d18, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d3, d5, d7, d17, d19, d21, q14, q15
    vld1.u8     {q0}, [r0], r1          // next source row replaces the oldest one
    vst1.u8     {q10}, [r2], r3         // write 1st 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d4, d6, d16, d18, d0, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d5, d7, d17, d19, d1, d21, q14, q15
    vld1.u8     {q1}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 2nd 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d4, d6, d16, d18, d0, d2, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d5, d7, d17, d19, d1, d3, d21, q14, q15
    vld1.u8     {q2}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 3rd 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d6, d16, d18, d0, d2, d4, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d7, d17, d19, d1, d3, d5, d21, q14, q15
    vld1.u8     {q3}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 4th 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d16, d18, d0, d2, d4, d6, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d17, d19, d1, d3, d5, d7, d21, q14, q15
    vld1.u8     {q8}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 5th 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d18, d0, d2, d4, d6, d16, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d19, d1, d3, d5, d7, d17, d21, q14, q15
    vld1.u8     {q9}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 6th 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d4, d6, d16, d18, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d3, d5, d7, d17, d19, d21, q14, q15
    vld1.u8     {q0}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 7th 16 bytes

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d4, d6, d16, d18, d0, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d5, d7, d17, d19, d1, d21, q14, q15
    vst1.u8     {q10}, [r2], r3         // write 8th 16 bytes

    // The five newest rows sit in q2, q3, q8, q9, q0 (oldest first);
    // move them back to q0, q1, q2, q3, q8 for the next iteration.
    vswp        q0, q8
    vswp        q0, q2
    vmov        q1, q3
    vmov        q3, q9

    subs        r4, r4, #8
    bne         w16_xy_03_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
// Vertical 6-tap filter averaged (with rounding) against the src[1] row,
// 8 bytes wide, four output rows per loop iteration.
//   As read from the code: r0 = source pointer stepped by r1 per row,
//   r2 = destination pointer stepped by r3 per row, first stack word = row count.
//   The counter drops by 4 per iteration and the loop exits only at exactly zero,
//   so the row count has to be a positive multiple of 4.
// Row window: d0..d5 hold six consecutive source rows, d16 receives each output.
// Every iteration loads one row ahead, so the last iteration reads one source
// row that is never filtered.
// NEON registers written: d0-d5, d16, q12-q15. The callee-saved range d8-d15 is
// deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = row count (stack argument above the saved r4)

    sub         r0, r0, r1, lsl #1      // r0 -> src[-2*src_stride]
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vld1.u8     {d0}, [r0], r1          // d0 = src[-2]
    vld1.u8     {d1}, [r0], r1          // d1 = src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    vld1.u8     {d2}, [r0], r1          // d2 = src[0]
    vld1.u8     {d3}, [r0], r1          // d3 = src[1]

    vld1.u8     {d4}, [r0], r1          // d4 = src[2]
    vld1.u8     {d5}, [r0], r1          // d5 = src[3]

w8_xy_03_mc_luma_loop:

    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d1, d2, d3, d4, d5, d16, q14, q15
    vld1.u8     {d0}, [r0], r1          // next source row replaces the oldest one
    vst1.u8     {d16}, [r2], r3         // write 1st 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d2, d3, d4, d5, d0, d16, q14, q15
    vld1.u8     {d1}, [r0], r1
    vst1.u8     {d16}, [r2], r3         // write 2nd 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d3, d4, d5, d0, d1, d16, q14, q15
    vld1.u8     {d2}, [r0], r1
    vst1.u8     {d16}, [r2], r3         // write 3rd 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d4, d5, d0, d1, d2, d16, q14, q15
    vld1.u8     {d3}, [r0], r1          // read-ahead for the next iteration
    vst1.u8     {d16}, [r2], r3         // write 4th 8 bytes

    // Rows are now ordered d4, d5, d0, d1, d2, d3; rotate back to d0..d5.
    vswp        q0, q2
    vswp        q1, q2

    subs        r4, r4, #4
    bne         w8_xy_03_mc_luma_loop

    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
// Vertical 6-tap filter averaged (with rounding) against the src[1] row,
// 4 bytes wide, four output rows per loop iteration.
//   As read from the code: r0 = source pointer stepped by r1 per row,
//   r2 = destination pointer stepped by r3 per row, first stack word = row count.
//   The counter drops by 4 per iteration and the loop exits only at exactly zero,
//   so the row count has to be a positive multiple of 4.
// Each source row is fetched as one 32-bit word with ldr, and two vertically
// adjacent rows are packed per d register so one filter pass yields two rows:
//   d0 = rows (-2,-1), d1 = (-1,0), d2 = (0,1), d3 = (1,2), d4 = (2,3), d5 = (3,4)
// NEON registers written: d0-d5, d16, q12-q15. The callee-saved range d8-d15 is
// deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
    push        {r4, r5, r6, r7}
    sub         r0, r0, r1, lsl #1      // r0 -> src[-2*src_stride]
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    ldr         r4, [r0], r1            // r4 = src[-2]
    ldr         r5, [r0], r1            // r5 = src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    ldr         r6, [r0], r1            // r6 = src[0]
    ldr         r7, [r0], r1            // r7 = src[1]

    vmov        d0, r4, r5
    vmov        d1, r5, r6
    vmov        d2, r6, r7

    ldr         r4, [r0], r1            // r4 = src[2]
    vmov        d3, r7, r4
    ldr         r7, [sp, #16]           // r7 = row count (above the four saved registers)

w4_xy_03_mc_luma_loop:

    // r4 still holds src[2] from the prologue or the previous iteration
    ldr         r5, [r0], r1            // r5 = src[3]
    ldr         r6, [r0], r1            // r6 = src[4]
    vmov        d4, r4, r5
    vmov        d5, r5, r6              // r6 is kept for the second half

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d1, d2, d3, d4, d5, d16, q14, q15
    vmov        r4, r5, d16
    str         r4, [r2], r3            // write 1st 4 bytes
    str         r5, [r2], r3            // write 2nd 4 bytes

    ldr         r5, [r0], r1            // r5 = src[5]
    ldr         r4, [r0], r1            // r4 = src[6]
    vmov        d0, r6, r5
    vmov        d1, r5, r4              // r4 is kept for the next iteration

    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d3, d4, d5, d0, d1, d16, q14, q15
    vmov        r5, r6, d16
    str         r5, [r2], r3            // write 3rd 4 bytes
    str         r6, [r2], r3            // write 4th 4 bytes

    // Window is now d4, d5, d0, d1; rotate it back into d0, d1, d2, d3.
    vmov        q1, q0
    vmov        q0, q2

    subs        r7, r7, #4
    bne         w4_xy_03_mc_luma_loop

    pop         {r4, r5, r6, r7}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Vertical 6-tap filter, 16 bytes wide, eight output rows per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The counter drops by 8 per iteration and the loop exits only at exactly zero,
//   so height has to be a positive multiple of 8.
// Row window: q0, q1, q2, q3, q8, q9 hold six consecutive source rows and are
// reused in rotation; q10 (d20/d21) receives each output row.
// NEON registers written: q0-q3, q8-q10, q12-q15. The callee-saved range d8-d15
// (q4-q7) is deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)

    sub         r0, r0, r1, lsl #1      // r0 -> src[-2*src_stride]
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vld1.u8     {q0}, [r0], r1          // q0 = src[-2]
    vld1.u8     {q1}, [r0], r1          // q1 = src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    vld1.u8     {q2}, [r0], r1          // q2 = src[0]
    vld1.u8     {q3}, [r0], r1          // q3 = src[1]
    vld1.u8     {q8}, [r0], r1          // q8 = src[2]

w16_v_mc_luma_loop:

    vld1.u8     {q9}, [r0], r1          // q9 = src[3]

    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
    vld1.u8     {q0}, [r0], r1          // next source row replaces the oldest one
    vst1.u8     {q10}, [r2], r3         // write 1st 16 bytes

    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
    vld1.u8     {q1}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 2nd 16 bytes

    FILTER_6TAG_8BITS   d4, d6, d16, d18, d0, d2, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS   d5, d7, d17, d19, d1, d3, d21, q14, q15
    vld1.u8     {q2}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 3rd 16 bytes

    FILTER_6TAG_8BITS   d6, d16, d18, d0, d2, d4, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS   d7, d17, d19, d1, d3, d5, d21, q14, q15
    vld1.u8     {q3}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 4th 16 bytes

    FILTER_6TAG_8BITS   d16, d18, d0, d2, d4, d6, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS   d17, d19, d1, d3, d5, d7, d21, q14, q15
    vld1.u8     {q8}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 5th 16 bytes

    FILTER_6TAG_8BITS   d18, d0, d2, d4, d6, d16, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS   d19, d1, d3, d5, d7, d17, d21, q14, q15
    vld1.u8     {q9}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 6th 16 bytes

    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
    vld1.u8     {q0}, [r0], r1
    vst1.u8     {q10}, [r2], r3         // write 7th 16 bytes

    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
    pld         [r0]
    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
    vst1.u8     {q10}, [r2], r3         // write 8th 16 bytes

    // The five newest rows sit in q2, q3, q8, q9, q0 (oldest first);
    // move them back to q0, q1, q2, q3, q8 for the next iteration.
    vswp        q0, q8
    vswp        q0, q2
    vmov        q1, q3
    vmov        q3, q9

    subs        r4, r4, #8
    bne         w16_v_mc_luma_loop
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Vertical 6-tap filter, 8 bytes wide, four output rows per loop iteration.
//   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, first stack word = height
//   The counter drops by 4 per iteration and the loop exits only at exactly zero,
//   so height has to be a positive multiple of 4.
// Row window: d0..d5 hold six consecutive source rows, d16 receives each output.
// Every iteration loads one row ahead, so the last iteration reads one source
// row that is never filtered.
// NEON registers written: d0-d5, d16, q12-q15. The callee-saved range d8-d15 is
// deliberately not touched because nothing here saves or restores it.
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = height (stack argument sits above the saved r4)

    sub         r0, r0, r1, lsl #1      // r0 -> src[-2*src_stride]
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // 20 in every u16 lane
    vld1.u8     {d0}, [r0], r1          // d0 = src[-2]
    vld1.u8     {d1}, [r0], r1          // d1 = src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5 in every u16 lane
    vld1.u8     {d2}, [r0], r1          // d2 = src[0]
    vld1.u8     {d3}, [r0], r1          // d3 = src[1]

    vld1.u8     {d4}, [r0], r1          // d4 = src[2]
    vld1.u8     {d5}, [r0], r1          // d5 = src[3]

w8_v_mc_luma_loop:

    pld         [r0]
    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
    vld1.u8     {d0}, [r0], r1          // next source row replaces the oldest one
    vst1.u8     {d16}, [r2], r3         // write 1st 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS   d1, d2, d3, d4, d5, d0, d16, q14, q15
    vld1.u8     {d1}, [r0], r1
    vst1.u8     {d16}, [r2], r3         // write 2nd 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS   d2, d3, d4, d5, d0, d1, d16, q14, q15
    vld1.u8     {d2}, [r0], r1
    vst1.u8     {d16}, [r2], r3         // write 3rd 8 bytes

    pld         [r0]
    FILTER_6TAG_8BITS   d3, d4, d5, d0, d1, d2, d16, q14, q15
    vld1.u8     {d3}, [r0], r1          // read-ahead for the next iteration
    vst1.u8     {d16}, [r2], r3         // write 4th 8 bytes

    // Rows are now ordered d4, d5, d0, d1, d2, d3; rotate back to d0..d5.
    vswp        q0, q2
    vswp        q1, q2

    subs        r4, r4, #4
    bne         w8_v_mc_luma_loop

    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Vertical 6-tap luma filter, 4 pixels per row (same formula as the 8/16 wide
// variants: taps 1,-5,20,20,-5,1, rounding +16, shift 5, clipped to 8 bits).
// Two 4-byte rows are packed into each d register so that one
// FILTER_6TAG_8BITS invocation yields two output rows.
// In:      r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, [sp] = height
// Loop:    4 output rows per pass; height must be a non-zero multiple of 4.
// Scratch: r4-r7 (saved), d0-d5 = packed row pairs, d16 = result,
//          q12/q13 = working registers of FILTER_6TAG_8BITS, q14 = 20, q15 = 5.
// Rows are fetched with word loads (ldr) from src, which need not be 4-byte
// aligned; this relies on unaligned word access being enabled.
// The result lives in d16 (not d12): d8-d15 are callee-saved under the ARM
// procedure call standard and this routine does not spill them.
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
    push        {r4, r5, r6, r7}
    sub         r0, r0, r1, lsl #1      // r0 -> row -2 (src - 2*src_stride)
    pld         [r0]
    pld         [r0, r1]
    vmov.u16    q14, #0x0014            // q14 = 20 in every 16-bit lane
    ldr         r4, [r0], r1            // r4 = row -2
    ldr         r5, [r0], r1            // r5 = row -1

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // q15 = 20 >> 2 = 5
    ldr         r6, [r0], r1            // r6 = row 0
    ldr         r7, [r0], r1            // r7 = row 1

    vmov        d0, r4, r5              // d0 = {row -2, row -1}
    vmov        d1, r5, r6              // d1 = {row -1, row 0}
    vmov        d2, r6, r7              // d2 = {row 0, row 1}

    ldr         r4, [r0], r1            // r4 = row 2 (kept live into the loop)
    vmov        d3, r7, r4              // d3 = {row 1, row 2}
    ldr         r7, [sp, #16]           // r7 = height (above the four saved registers)

w4_v_mc_luma_loop:

    // On entry r4 holds the newest row already packed into d3.
    ldr         r5, [r0], r1            // r5 = row 3
    ldr         r6, [r0], r1            // r6 = row 4 (kept for d0 below)
    vmov        d4, r4, r5              // d4 = {row 2, row 3}
    vmov        d5, r5, r6              // d5 = {row 3, row 4}

    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
    vmov        r4, r5, d16
    str         r4, [r2], r3            // store output row 0
    str         r5, [r2], r3            // store output row 1

    ldr         r5, [r0], r1            // r5 = row 5
    ldr         r4, [r0], r1            // r4 = row 6 (kept for the next pass)
    vmov        d0, r6, r5              // d0 = {row 4, row 5}
    vmov        d1, r5, r4              // d1 = {row 5, row 6}

    FILTER_6TAG_8BITS   d2, d3, d4, d5, d0, d1, d16, q14, q15
    vmov        r5, r6, d16
    str         r5, [r2], r3            // store output row 2
    str         r6, [r2], r3            // store output row 3

    // Slide the window down four rows: d4,d5,d0,d1 --> d0,d1,d2,d3.
    vmov        q1, q0
    vmov        q0, q2

    subs        r7, r7, #4              // four rows done
    bne         w4_v_mc_luma_loop

    pop         {r4, r5, r6, r7}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Two-pass (vertical then horizontal) luma interpolation, 16 pixels per row.
// Each source row is read as 24 bytes starting at src[-2] (21 are used: 16 + 5).
// The vertical 6-tap pass produces 16-bit intermediates, which the horizontal
// pass (UNPACK_2_16BITS_TO_ABC + FILTER_3_IN_16BITS_TO_8BITS) narrows to 8 bits.
// In:      r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, [sp] = height
// Loop:    4 output rows per pass; height must be a non-zero multiple of 4.
// Scratch: r4 (saved); d0-d17 = window of six source rows (three d registers
//          each); q9-q13 = intermediates; q14 = 20, q15 = 5.
// q4-q7 (d8-d15) are callee-saved under the ARM procedure call standard, and
// every NEON register is in use here, so they are spilled with vpush/vpop.
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = height; read before vpush moves sp
    vpush       {q4-q7}                 // preserve callee-saved d8-d15

    sub         r0, r0, #2              // step back two columns
    sub         r0, r0, r1, lsl #1      // ... and two rows: r0 -> src[-2*src_stride - 2]
    pld         [r0]
    pld         [r0, r1]

    vmov.u16    q14, #0x0014            // q14 = 20 in every 16-bit lane
    vld1.u8     {d0-d2}, [r0], r1       // row -2
    vld1.u8     {d3-d5}, [r0], r1       // row -1

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // q15 = 20 >> 2 = 5

    vld1.u8     {d6-d8}, [r0], r1       // row 0
    vld1.u8     {d9-d11}, [r0], r1      // row 1
    pld         [r0]
    pld         [r0, r1]
    vld1.u8     {d12-d14}, [r0], r1     // row 2

w16_hv_mc_luma_loop:

    vld1.u8     {d15-d17}, [r0], r1     // row 3
    // ---- output row 0 ----
    pld         [r0]
    // vertical pass on source bytes 0-7 and 8-15 -> q9, q10
    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7, d10, d13, d16, q10, q14, q15
    // horizontal pass -> left 8 output pixels
    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d0
    // vertical pass on source bytes 16-23 (only 5 of them are meaningful) -> q11
    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8, d11, d14, d17, q11, q14, q15
    // horizontal pass -> right 8 output pixels
    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d1
    vst1.u8     {q0}, [r2], r3          // store 16 pixels

    vld1.u8     {d0-d2}, [r0], r1       // row 4 replaces row -2
    // ---- output row 1 ----
    pld         [r0]
    FILTER_6TAG_8BITS_TO_16BITS     d3, d6, d9, d12, d15, d0, q9, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d4, d7, d10, d13, d16, d1, q10, q14, q15
    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d3
    FILTER_6TAG_8BITS_TO_16BITS     d5, d8, d11, d14, d17, d2, q11, q14, q15
    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d4
    vst1.u8     {d3, d4}, [r2], r3      // store 16 pixels

    vld1.u8     {d3-d5}, [r0], r1       // row 5 replaces row -1
    // ---- output row 2 ----
    pld         [r0]
    FILTER_6TAG_8BITS_TO_16BITS     d6, d9, d12, d15, d0, d3, q9, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d7, d10, d13, d16, d1, d4, q10, q14, q15
    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d6
    FILTER_6TAG_8BITS_TO_16BITS     d8, d11, d14, d17, d2, d5, q11, q14, q15
    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d7
    vst1.u8     {d6, d7}, [r2], r3      // store 16 pixels

    vld1.u8     {d6-d8}, [r0], r1       // row 6 replaces row 0
    // ---- output row 3 ----
    pld         [r0]
    FILTER_6TAG_8BITS_TO_16BITS     d9, d12, d15, d0, d3, d6, q9, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d10, d13, d16, d1, d4, d7, q10, q14, q15
    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d9
    FILTER_6TAG_8BITS_TO_16BITS     d11, d14, d17, d2, d5, d8, q11, q14, q15
    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d10
    vst1.u8     {d9, d10}, [r2], r3     // store 16 pixels

    // Rows 2..6 now sit in d12-d17 and d0-d8; rotate them into d0-d14 so the
    // next pass sees them as rows -2..2. d20 is a free temporary at this point.
    vswp        q0, q6
    vswp        q6, q3
    vmov        q5, q2
    vmov        q2, q8

    vmov        d20, d8
    vmov        q4, q1
    vmov        q1, q7
    vmov        d14, d20

    subs        r4, r4, #4              // four rows done
    bne         w16_hv_mc_luma_loop

    vpop        {q4-q7}                 // restore callee-saved d8-d15
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Two-pass (vertical then horizontal) luma interpolation, 8 pixels per row.
// Each source row is read as 16 bytes starting at src[-2] (13 are used: 8 + 5).
// The vertical 6-tap pass produces 16-bit intermediates in q6/q7, which the
// horizontal pass narrows to 8 output bytes.
// In:      r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, [sp] = height
// Loop:    4 output rows per pass; height must be a non-zero multiple of 4.
// Scratch: r4 (saved); q0-q5 = window of six source rows; q6/q7 and q11-q13 =
//          intermediates; q14 = 20, q15 = 5.
// q4-q7 (d8-d15) are callee-saved under the ARM procedure call standard, so
// they are spilled with vpush/vpop around the body.
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = height; read before vpush moves sp
    vpush       {q4-q7}                 // preserve callee-saved d8-d15

    sub         r0, r0, #2              // step back two columns
    sub         r0, r0, r1, lsl #1      // ... and two rows: r0 -> src[-2*src_stride - 2]
    pld         [r0]
    pld         [r0, r1]

    vmov.u16    q14, #0x0014            // q14 = 20 in every 16-bit lane
    vld1.u8     {q0}, [r0], r1          // row -2
    vld1.u8     {q1}, [r0], r1          // row -1

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // q15 = 20 >> 2 = 5

    vld1.u8     {q2}, [r0], r1          // row 0
    vld1.u8     {q3}, [r0], r1          // row 1
    pld         [r0]
    pld         [r0, r1]
    vld1.u8     {q4}, [r0], r1          // row 2

w8_hv_mc_luma_loop:

    vld1.u8     {q5}, [r0], r1          // row 3
    // ---- output row 0 ----
    pld         [r0]
    // vertical pass: source bytes 0-7 -> q6, bytes 8-15 (5 meaningful) -> q7
    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d10, q6, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d11, q7, q14, q15
    // horizontal pass -> 8 output pixels in d12
    UNPACK_2_16BITS_TO_ABC  q6, q7, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d12
    vst1.u8     {d12}, [r2], r3         // store 8 pixels

    vld1.u8     {q0}, [r0], r1          // row 4 replaces row -2
    // ---- output row 1 ----
    pld         [r0]
    FILTER_6TAG_8BITS_TO_16BITS     d2, d4, d6, d8, d10, d0, q6, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d3, d5, d7, d9, d11, d1, q7, q14, q15
    UNPACK_2_16BITS_TO_ABC  q6, q7, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d12
    vst1.u8     {d12}, [r2], r3         // store 8 pixels

    vld1.u8     {q1}, [r0], r1          // row 5 replaces row -1
    // ---- output row 2 ----
    pld         [r0]
    FILTER_6TAG_8BITS_TO_16BITS     d4, d6, d8, d10, d0, d2, q6, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d5, d7, d9, d11, d1, d3, q7, q14, q15
    UNPACK_2_16BITS_TO_ABC  q6, q7, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d12
    vst1.u8     {d12}, [r2], r3         // store 8 pixels

    vld1.u8     {q2}, [r0], r1          // row 6 replaces row 0
    // ---- output row 3 ----
    pld         [r0]
    FILTER_6TAG_8BITS_TO_16BITS     d6, d8, d10, d0, d2, d4, q6, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d7, d9, d11, d1, d3, d5, q7, q14, q15
    UNPACK_2_16BITS_TO_ABC  q6, q7, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d12
    vst1.u8     {d12}, [r2], r3         // store 8 pixels

    // Rows 2..6 are in q4,q5,q0,q1,q2; rotate them into q0..q4.
    vswp        q0, q4
    vswp        q2, q4
    vmov        q3, q1
    vmov        q1, q5

    subs        r4, r4, #4              // four rows done
    bne         w8_hv_mc_luma_loop

    vpop        {q4-q7}                 // restore callee-saved d8-d15
    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Two-pass (vertical then horizontal) luma interpolation, 4 pixels per row.
// Each source row is read as 16 bytes starting at src[-2] (9 are used: 4 + 5).
// Two output rows are processed together: their 16-bit intermediates are
// packed into the low/high halves of q11-q13 so that a single
// FILTER_3_IN_16BITS_TO_8BITS yields both rows (4 bytes each) in d22.
// In:      r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, [sp] = height
// Loop:    4 output rows per pass; height must be a non-zero multiple of 4.
// Scratch: r4-r6 (saved); q0-q6 = source rows; q7-q13 = intermediates;
//          q14 = 20, q15 = 5.
// q4-q7 (d8-d15) are callee-saved under the ARM procedure call standard, so
// they are spilled with vpush/vpop around the body.
WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
    push        {r4, r5, r6}
    ldr         r6, [sp, #12]           // r6 = height; read before vpush moves sp
    vpush       {q4-q7}                 // preserve callee-saved d8-d15

    sub         r0, r0, #2              // step back two columns
    sub         r0, r0, r1, lsl #1      // ... and two rows: r0 -> src[-2*src_stride - 2]
    pld         [r0]
    pld         [r0, r1]

    vmov.u16    q14, #0x0014            // q14 = 20 in every 16-bit lane
    vld1.u8     {q0}, [r0], r1          // row -2
    vld1.u8     {q1}, [r0], r1          // row -1

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // q15 = 20 >> 2 = 5

    vld1.u8     {q2}, [r0], r1          // row 0
    vld1.u8     {q3}, [r0], r1          // row 1
    pld         [r0]
    pld         [r0, r1]
    vld1.u8     {q4}, [r0], r1          // row 2

w4_hv_mc_luma_loop:

    vld1.u8     {q5}, [r0], r1          // row 3
    vld1.u8     {q6}, [r0], r1          // row 4

    // ---- output rows 0 and 1 ----
    pld         [r0]
    pld         [r0, r1]
    // vertical pass for output row 0 -> q7 (bytes 0-7), q8 (byte 8 onwards)
    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d10, q7, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d11, q8, q14, q15

    // vertical pass for output row 1 -> q9, q10
    FILTER_6TAG_8BITS_TO_16BITS     d2, d4, d6, d8, d10, d12, q9, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d3, d5, d7, d9, d11, d13, q10, q14, q15
    // horizontal pass operands; q0 (row -2) is dead by now and is reused
    UNPACK_2_16BITS_TO_ABC  q7, q8, q11, q12, q13
    UNPACK_2_16BITS_TO_ABC  q9, q10, q0, q7, q8

    // pack row 1 operands into the high halves next to row 0
    vmov        d23, d0
    vmov        d25, d14
    vmov        d27, d16

    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d22
    vmov        r4, r5, d22
    str         r4, [r2], r3            // store output row 0 (4 pixels)
    str         r5, [r2], r3            // store output row 1 (4 pixels)

    // ---- output rows 2 and 3 ----
    vld1.u8     {q0}, [r0], r1          // row 5
    vld1.u8     {q1}, [r0], r1          // row 6
    pld         [r0]
    pld         [r0, r1]
    // vertical pass for output row 2 -> q7, q8
    FILTER_6TAG_8BITS_TO_16BITS     d4, d6, d8, d10, d12, d0, q7, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d5, d7, d9, d11, d13, d1, q8, q14, q15

    // vertical pass for output row 3 -> q9, q10
    FILTER_6TAG_8BITS_TO_16BITS     d6, d8, d10, d12, d0, d2, q9, q14, q15
    FILTER_6TAG_8BITS_TO_16BITS     d7, d9, d11, d13, d1, d3, q10, q14, q15
    // horizontal pass operands; q2 (row 0) is dead by now and is reused
    UNPACK_2_16BITS_TO_ABC  q7, q8, q11, q12, q13
    UNPACK_2_16BITS_TO_ABC  q9, q10, q2, q7, q8

    vmov        d23, d4
    vmov        d25, d14
    vmov        d27, d16

    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d22
    vmov        r4, r5, d22
    str         r4, [r2], r3            // store output row 2 (4 pixels)
    str         r5, [r2], r3            // store output row 3 (4 pixels)

    // Rows 2..6 are in q4,q5,q6,q0,q1; rotate them into q0..q4.
    vswp        q4, q0
    vmov        q3, q4
    vmov        q4, q1
    vmov        q1, q5
    vmov        q2, q6

    subs        r6, r6, #4              // four rows done
    bne         w4_hv_mc_luma_loop

    vpop        {q4-q7}                 // restore callee-saved d8-d15
    pop         {r4, r5, r6}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Straight block copy, 16 bytes per row.
// In:   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, [sp] = height
// Loop: 2 rows per pass; exits only when the counter reaches exactly 0, so
//       height must be a non-zero even number.
WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = remaining rows
w16_copy_loop:
    vld1.u8     {q0}, [r0], r1          // first row of the pair
    subs        r4, r4, #2              // NEON loads/stores leave the flags alone,
                                        // so Z survives until the branch below
    vld1.u8     {q1}, [r0], r1          // second row of the pair
    vst1.u8     {q0}, [r2], r3
    vst1.u8     {q1}, [r2], r3
    bne         w16_copy_loop

    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Straight block copy, 8 bytes per row.
// In:   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, [sp] = height
// Loop: 2 rows per pass; height must be a non-zero even number.
WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = remaining rows
w8_copy_loop:
    vld1.u8     {d0}, [r0], r1          // first row of the pair
    vld1.u8     {d1}, [r0], r1          // second row of the pair
    vst1.u8     {d0}, [r2], r3
    vst1.u8     {d1}, [r2], r3
    subs        r4, r4, #2              // two rows done
    bne         w8_copy_loop

    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t height
// Straight block copy, 4 bytes per row, done with core-register word moves.
// In:   r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride, [sp] = height
// Loop: 2 rows per pass; height must be a non-zero even number.
// The word loads/stores use whatever alignment src/dst have.
WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
    push        {r4, r5, r6}
    ldr         r4, [sp, #12]           // r4 = remaining rows (above three saved registers)
w4_copy_loop:
    ldr         r5, [r0], r1            // first row of the pair
    ldr         r6, [r0], r1            // second row of the pair
    str         r5, [r2], r3
    str         r6, [r2], r3

    subs        r4, r4, #2              // two rows done
    bne         w4_copy_loop

    pop         {r4, r5, r6}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* dst, int32_t dst_stride, uint8_t* srcA, uint8_t* srcB, int32_t height
// Rounded average of two 16-pixel-wide blocks: dst = (srcA + srcB + 1) >> 1.
// In:      r0 = dst, r1 = dst_stride, r2 = srcA, r3 = srcB, [sp] = height
// srcA and srcB are walked with a 16-byte post-increment, i.e. their rows are
// read as contiguous 16-byte lines; only dst uses an explicit stride.
// Loop:    4 rows per pass; height must be a non-zero multiple of 4.
// Scratch: r4 (saved); q0-q3 and q8-q11 = row data; q13 = working register of
//          AVERAGE_TWO_8BITS.
// Rows 2 and 3 use q8-q11 rather than q4-q7: d8-d15 are callee-saved under the
// ARM procedure call standard and this routine does not spill them.
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
    push        {r4}
    ldr         r4, [sp, #4]            // r4 = remaining rows
w16_pix_avg_loop:
    vld1.u8     {q0}, [r2]!             // srcA row 0
    vld1.u8     {q1}, [r3]!             // srcB row 0
    vld1.u8     {q2}, [r2]!             // srcA row 1
    vld1.u8     {q3}, [r3]!             // srcB row 1

    vld1.u8     {q8}, [r2]!             // srcA row 2
    vld1.u8     {q9}, [r3]!             // srcB row 2
    vld1.u8     {q10}, [r2]!            // srcA row 3
    vld1.u8     {q11}, [r3]!            // srcB row 3

    AVERAGE_TWO_8BITS   d0, d0, d2
    AVERAGE_TWO_8BITS   d1, d1, d3
    vst1.u8     {q0}, [r0], r1          // store row 0

    AVERAGE_TWO_8BITS   d4, d4, d6
    AVERAGE_TWO_8BITS   d5, d5, d7
    vst1.u8     {q2}, [r0], r1          // store row 1

    AVERAGE_TWO_8BITS   d16, d16, d18
    AVERAGE_TWO_8BITS   d17, d17, d19
    vst1.u8     {q8}, [r0], r1          // store row 2

    AVERAGE_TWO_8BITS   d20, d20, d22
    AVERAGE_TWO_8BITS   d21, d21, d23
    vst1.u8     {q10}, [r0], r1         // store row 3

    subs        r4, r4, #4              // four rows done
    bne         w16_pix_avg_loop

    pop         {r4}
WELS_ASM_FUNC_END
|
|
|
|
// Rounded average of two 8-pixel-wide blocks: dst = (srcA + srcB + 1) >> 1.
// In:      r0 = dst, r1 = dst_stride, r2 = srcA, r3 = srcB, [sp] = height
//          (same argument order as PixelAvgWidthEq16_neon, judging by register use)
// Both sources are stepped by a fixed 16 bytes per row (r5); only dst uses an
// explicit stride.
// Loop:    4 rows per pass; height must be a non-zero multiple of 4.
// Scratch: r4, r5 (saved); d0-d7 = row data; q13 = working register of
//          AVERAGE_TWO_8BITS.
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
    push        {r4, r5}
    ldr         r4, [sp, #8]            // r4 = remaining rows (above two saved registers)
    mov         r5, #16                 // r5 = source row pitch in bytes
w8_pix_avg_loop:

    // rows 0 and 1
    vld1.u8     {d0}, [r2], r5          // srcA row 0
    vld1.u8     {d2}, [r3], r5          // srcB row 0
    vld1.u8     {d1}, [r2], r5          // srcA row 1
    vld1.u8     {d3}, [r3], r5          // srcB row 1

    AVERAGE_TWO_8BITS   d0, d0, d2
    AVERAGE_TWO_8BITS   d1, d1, d3
    vst1.u8     {d0}, [r0], r1
    vst1.u8     {d1}, [r0], r1

    // rows 2 and 3
    vld1.u8     {d4}, [r2], r5          // srcA row 2
    vld1.u8     {d6}, [r3], r5          // srcB row 2
    vld1.u8     {d5}, [r2], r5          // srcA row 3
    vld1.u8     {d7}, [r3], r5          // srcB row 3

    AVERAGE_TWO_8BITS   d4, d4, d6
    AVERAGE_TWO_8BITS   d5, d5, d7
    vst1.u8     {d4}, [r0], r1
    vst1.u8     {d5}, [r0], r1

    subs        r4, r4, #4              // four rows done
    bne         w8_pix_avg_loop

    pop         {r4, r5}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* dst, int32_t dst_stride, uint8_t* srcA, uint8_t* srcB, int32_t height
// Rounded average of two 4-pixel-wide blocks: dst = (srcA + srcB + 1) >> 1.
// In:      r0 = dst, r1 = dst_stride, r2 = srcA, r3 = srcB, [sp] = height
//          (r0/r1 are the store pointer and stride; r2/r3 are only read)
// Both sources are read at a fixed 16-byte row pitch; only dst uses an
// explicit stride.
// Loop:    2 rows per pass; height must be a non-zero even number.
// Scratch: r4-r8 (saved); d0, d1 = packed row pairs; q13 = working register
//          of AVERAGE_TWO_8BITS.
WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
    push        {r4-r8}
    ldr         r4, [sp, #20]           // r4 = remaining rows (above five saved registers)
w4_pix_avg_loop:

    ldr         r5, [r2]                // srcA row 0
    ldr         r6, [r2, #16]           // srcA row 1
    ldr         r7, [r3]                // srcB row 0
    ldr         r8, [r3, #16]           // srcB row 1
    add         r2, r2, #32             // advance both sources by two rows
    add         r3, r3, #32

    vmov        d0, r5, r6              // d0 = {srcA row 0, srcA row 1}
    vmov        d1, r7, r8              // d1 = {srcB row 0, srcB row 1}
    AVERAGE_TWO_8BITS   d0, d0, d1
    vmov        r5, r6, d0

    str         r5, [r0], r1            // store row 0
    str         r6, [r0], r1            // store row 1

    subs        r4, r4, #2              // two rows done
    bne         w4_pix_avg_loop

    pop         {r4-r8}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t* weights, int32_t height
// Bilinear chroma interpolation, 8 pixels per row:
//   dst[x] = (cA*src[x] + cB*src[x+1] + cC*src[x+stride] + cD*src[x+stride+1] + 32) >> 6
// with
//   cA = (8 - dx) * (8 - dy);
//   cB = dx * (8 - dy);
//   cC = (8 - dx) * dy;
//   cD = dx * dy
// In:      r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride,
//          [sp] = weights, [sp, #4] = height
// The four coefficients are taken from the first four BYTES at weights
// (the load fetches 8 bytes; lanes 0-3 are used).
// Each source row is fetched as 16 bytes, of which 9 are needed.
// Loop:    2 rows per pass; height must be a non-zero even number.
// Scratch: r4, r5 (saved); q0-q3; d28-d31 = broadcast cA..cD.
// Only the general case is implemented; there is no vertical-only or
// horizontal-only fast path.
WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon

    push        {r4, r5}
    ldr         r4, [sp, #8]            // r4 = weights
    ldr         r5, [sp, #12]           // r5 = remaining rows
    vld1.u8     {d31}, [r4]             // packed cA, cB, cC, cD
    vld1.u8     {q0}, [r0], r1          // d0 = top row, src[x]

    vdup.u8     d28, d31[0]             // cA
    vdup.u8     d29, d31[1]             // cB
    vdup.u8     d30, d31[2]             // cC
    vdup.u8     d31, d31[3]             // cD (d31 itself is overwritten last)

    vext.u8     d1, d0, d1, #1          // d1 = top row shifted by one, src[x+1]

w8_mc_chroma_loop:                      // two output rows per pass
    vld1.u8     {q1}, [r0], r1          // d2 = row +1
    vld1.u8     {q2}, [r0], r1          // d4 = row +2
    vext.u8     d3, d2, d3, #1          // d3 = row +1 shifted by one
    vext.u8     d5, d4, d5, #1          // d5 = row +2 shifted by one

    // first output row: rows 0 and +1
    vmull.u8    q3, d0, d28             //  cA * src[x]
    vmlal.u8    q3, d1, d29             // + cB * src[x+1]
    vmlal.u8    q3, d2, d30             // + cC * src[x+stride]
    vmlal.u8    q3, d3, d31             // + cD * src[x+stride+1]
    vrshrn.u16  d6, q3, #6              // (sum + 32) >> 6
    vst1.u8     {d6}, [r2], r3

    // second output row: rows +1 and +2
    vmull.u8    q3, d2, d28             //  cA * src[x+stride]
    vmlal.u8    q3, d3, d29             // + cB * src[x+stride+1]
    vmlal.u8    q3, d4, d30             // + cC * src[x+2*stride]
    vmlal.u8    q3, d5, d31             // + cD * src[x+2*stride+1]
    vrshrn.u16  d6, q3, #6              // (sum + 32) >> 6
    vst1.u8     {d6}, [r2], r3

    vmov        q0, q2                  // row +2 becomes the top row of the next pass
    subs        r5, r5, #2              // two rows done
    bne         w8_mc_chroma_loop

    pop         {r4, r5}
WELS_ASM_FUNC_END
|
|
|
|
//uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t* weights, int32_t height
// Bilinear chroma interpolation, 4 pixels per row:
//   dst[x] = (cA*src[x] + cB*src[x+1] + cC*src[x+stride] + cD*src[x+stride+1] + 32) >> 6
// In:      r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride,
//          [sp] = weights, [sp, #4] = height
// The four coefficients are taken from the first four BYTES at weights
// (the load fetches 8 bytes; lanes 0-3 are used).
// Two output rows are computed at once by packing 4 pixels of two adjacent
// source rows into one d register.
// Each source row is fetched as 8 bytes, of which 5 are needed.
// Loop:    2 rows per pass; height must be a non-zero even number.
// Scratch: r4-r6 (saved); q0-q3; d28-d31 = broadcast cA..cD.
// Only the general case is implemented; there is no vertical-only or
// horizontal-only fast path.
WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon

    push        {r4, r5, r6}
    ldr         r4, [sp, #12]           // r4 = weights
    ldr         r6, [sp, #16]           // r6 = remaining rows
    vld1.u8     {d31}, [r4]             // packed cA, cB, cC, cD

    vdup.u8     d28, d31[0]             // cA
    vdup.u8     d29, d31[1]             // cB
    vdup.u8     d30, d31[2]             // cC
    vdup.u8     d31, d31[3]             // cD (d31 itself is overwritten last)

w4_mc_chroma_loop:                      // two output rows per pass
    vld1.u8     {d0}, [r0], r1          // a = row 0
    vld1.u8     {d2}, [r0], r1          // b = row +1
    vld1.u8     {d4}, [r0]              // c = row +2 (no advance: it is row 0 of the next pass)

    vshr.u64    d1, d0, #8              // a shifted by one pixel
    vshr.u64    d3, d2, #8              // b shifted by one pixel
    vshr.u64    d5, d4, #8              // c shifted by one pixel

    vmov        q3, q1                  // keep a copy of b / b+1 for the lower pair
    vtrn.32     q0, q1                  // d0 = {a[0:3], b[0:3]}, d1 = {a[1:4], b[1:4]}
    vtrn.32     q3, q2                  // d6 = {b[0:3], c[0:3]}, d7 = {b[1:4], c[1:4]}

    vmull.u8    q1, d0, d28             //  cA * src[x]
    vmlal.u8    q1, d1, d29             // + cB * src[x+1]
    vmlal.u8    q1, d6, d30             // + cC * src[x+stride]
    vmlal.u8    q1, d7, d31             // + cD * src[x+stride+1]

    vrshrn.u16  d2, q1, #6              // (sum + 32) >> 6
    vmov        r4, r5, d2
    str         r4, [r2], r3            // store first output row
    str         r5, [r2], r3            // store second output row

    subs        r6, r6, #2              // two rows done
    bne         w4_mc_chroma_loop

    pop         {r4, r5, r6}
WELS_ASM_FUNC_END
|
|
#endif
|