mv mc_neon.S to common,add MC arm code to encoder
This commit is contained in:
parent
14f5518e6a
commit
71467f948a
@ -8,6 +8,7 @@
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
4C34067D18C5C94C00DFA14A /* expand_picture_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */; };
|
||||
4C34069B18C96CB000DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34069A18C96CB000DFA14A /* mc_neon.S */; };
|
||||
4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
|
||||
4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; };
|
||||
4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
|
||||
@ -48,6 +49,7 @@
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture_neon.S; sourceTree = "<group>"; };
|
||||
4C34069A18C96CB000DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
|
||||
4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
||||
4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
@ -150,6 +152,7 @@
|
||||
4CE4472F18BC61650017DF25 /* common */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
4C34069A18C96CB000DFA14A /* mc_neon.S */,
|
||||
4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */,
|
||||
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
|
||||
4CE447BC18C085320017DF25 /* deblocking_neon.S */,
|
||||
@ -256,6 +259,7 @@
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
4C34069B18C96CB000DFA14A /* mc_neon.S in Sources */,
|
||||
4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */,
|
||||
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
|
||||
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
|
||||
|
@ -38,7 +38,6 @@
|
||||
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
|
||||
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
|
||||
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
|
||||
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXContainerItemProxy section */
|
||||
@ -132,7 +131,6 @@
|
||||
4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
|
||||
4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
|
||||
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
|
||||
4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
|
||||
/* Begin PBXFrameworksBuildPhase section */
|
||||
@ -325,7 +323,6 @@
|
||||
children = (
|
||||
4CE447A718BC6BE90017DF25 /* block_add_neon.S */,
|
||||
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */,
|
||||
4CE447AA18BC6BE90017DF25 /* mc_neon.S */,
|
||||
);
|
||||
path = arm;
|
||||
sourceTree = "<group>";
|
||||
@ -411,7 +408,6 @@
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */,
|
||||
4CE4469B18BC5EAB0017DF25 /* pic_queue.cpp in Sources */,
|
||||
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */,
|
||||
4CE4469318BC5EAB0017DF25 /* fmo.cpp in Sources */,
|
||||
|
@ -9,7 +9,6 @@
|
||||
/* Begin PBXBuildFile section */
|
||||
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
|
||||
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
|
||||
4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066818C57D0400DFA14A /* mc_neon.S */; };
|
||||
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
|
||||
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; };
|
||||
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; };
|
||||
@ -79,7 +78,6 @@
|
||||
/* Begin PBXFileReference section */
|
||||
4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
|
||||
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
|
||||
4C34066818C57D0400DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
|
||||
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
|
||||
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
|
||||
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
|
||||
@ -206,7 +204,6 @@
|
||||
children = (
|
||||
4C34066618C57D0400DFA14A /* intra_pred_neon.S */,
|
||||
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */,
|
||||
4C34066818C57D0400DFA14A /* mc_neon.S */,
|
||||
4C34066918C57D0400DFA14A /* memory_neon.S */,
|
||||
4C34066A18C57D0400DFA14A /* pixel_neon.S */,
|
||||
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */,
|
||||
@ -492,7 +489,6 @@
|
||||
4CE4472A18BC605C0017DF25 /* utils.cpp in Sources */,
|
||||
4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,
|
||||
4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,
|
||||
4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */,
|
||||
4CE4472D18BC605C0017DF25 /* welsCodecTrace.cpp in Sources */,
|
||||
4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,
|
||||
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */,
|
||||
|
@ -82,6 +82,18 @@ void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* p
|
||||
void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
|
||||
|
||||
// Prototypes of the NEON routines implemented in mc_neon.S that this change adds.
// PixStrideAvg*: combine rows read from pSrcA and pSrcB (each with its own stride)
// into pDst; the assembly handles four rows per loop pass.
void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
|
||||
|
||||
// Horizontal filter variants that produce one extra column (17 or 9 pixels per row).
void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
|
||||
void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
|
||||
|
||||
// Vertical filter variants that produce one extra row (iHeight is expected to be 17 or 9).
void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
|
||||
void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
|
||||
|
||||
// Vertical-then-horizontal filter variants with one extra column and one extra row.
void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
|
||||
void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
|
||||
#endif
|
||||
|
||||
#if defined(X86_ASM)
|
||||
|
@ -53,6 +53,17 @@
|
||||
// }
|
||||
.endm
|
||||
|
||||
// Horizontal 6-tap filter for the single extra output pixel of a 17/9-wide row.
// Positional arguments, as passed by the call sites in this file:
//   $0  d register holding the source bytes; byte 0 receives the result
//   $1  d register with four 16-bit multipliers (callers load 0x0014FFFB00010000,
//       i.e. lanes 0, 1, -5, 20)
//   $2  working d register; callers pass the low half of $3, so the vmul below
//       consumes the sums that vaddl has just written into $3
//   $3  working q register
//   $4  q register that callers pass as the one whose low half is $0
.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
|
||||
// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
|
||||
vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
|
||||
vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
|
||||
vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
|
||||
vpadd.s16 $0, $0, $0 // pairwise add: four products -> two partial sums
|
||||
vpadd.s16 $0, $0, $0 // pairwise add again: lane 0 now holds the full sum
|
||||
vqrshrun.s16 $0, $4, #5 // (+16)>>5 with unsigned saturation, narrowed to bytes
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
|
||||
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
|
||||
vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
|
||||
@ -115,6 +126,24 @@
|
||||
vadd.s16 $2, $0 //a=src[-2]+src[3]
|
||||
// }
|
||||
.endm
|
||||
|
||||
// Horizontal filter over 16-bit intermediates for the single extra output pixel.
// Positional arguments, as passed by the call sites in this file:
//   $0  destination d register; byte 0 receives the result
//   $1  low d half of $3 (overwritten)
//   $2  high d half of $3 (overwritten)
//   $3  q register holding the 16-bit intermediates
.macro UNPACK_1_IN_8x16BITS_TO_8BITS
|
||||
// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
|
||||
vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
|
||||
vrev64.16 $1, $1 // reverse the four low lanes so mirrored taps line up
|
||||
vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
|
||||
vshr.s64 $1, $2, #16 // move B down into lane 0 of $1
|
||||
vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
|
||||
|
||||
|
||||
vsub.s16 $0, $0, $1 //a-b
|
||||
vshr.s16 $0, $0, #2 //(a-b)/4
|
||||
vsub.s16 $0, $0, $1 //(a-b)/4-b
|
||||
vadd.s16 $0, $0, $2 //(a-b)/4-b+c
|
||||
vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
|
||||
vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
|
||||
vqrshrun.s16 $0, $3, #6 //(+32)>>6
|
||||
// }
|
||||
.endm
|
||||
#else
|
||||
.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
|
||||
// { // input:dst_d, src_d A and B; working: q13
|
||||
@ -134,6 +163,17 @@
|
||||
// }
|
||||
.endm
|
||||
|
||||
// Horizontal 6-tap filter for the single extra output pixel of a 17/9-wide row
// (GNU-assembler flavour of the macro). Arguments, as passed by the call sites
// in this file:
//   arg0  d register holding the source bytes; byte 0 receives the result
//   arg1  d register with four 16-bit multipliers (callers load 0x0014FFFB00010000,
//         i.e. lanes 0, 1, -5, 20)
//   arg2  working d register; callers pass the low half of arg3, so the vmul below
//         consumes the sums that vaddl has just written into arg3
//   arg3  working q register
//   arg4  q register that callers pass as the one whose low half is arg0
//   arg5  declared but not referenced in the body
.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
|
||||
// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
|
||||
vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
|
||||
vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
|
||||
vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
|
||||
vpadd.s16 \arg0, \arg0, \arg0 // pairwise add: four products -> two partial sums
|
||||
vpadd.s16 \arg0, \arg0, \arg0 // pairwise add again: lane 0 now holds the full sum
|
||||
vqrshrun.s16 \arg0, \arg4, #5 // (+16)>>5 with unsigned saturation, narrowed to bytes
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
|
||||
vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
|
||||
@ -196,6 +236,24 @@
|
||||
vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
|
||||
// }
|
||||
.endm
|
||||
|
||||
// Horizontal filter over 16-bit intermediates for the single extra output pixel
// (GNU-assembler flavour of the macro). Arguments, as passed by the call sites
// in this file:
//   arg0  destination d register; byte 0 receives the result
//   arg1  low d half of arg3 (overwritten)
//   arg2  high d half of arg3 (overwritten)
//   arg3  q register holding the 16-bit intermediates
.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
|
||||
// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
|
||||
vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
|
||||
vrev64.16 \arg1, \arg1 // reverse the four low lanes so mirrored taps line up
|
||||
vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
|
||||
vshr.s64 \arg1, \arg2, #16 // move B down into lane 0 of arg1
|
||||
vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
|
||||
|
||||
|
||||
vsub.s16 \arg0, \arg0, \arg1 //a-b
|
||||
vshr.s16 \arg0, \arg0, #2 //(a-b)/4
|
||||
vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
|
||||
vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
|
||||
vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
|
||||
vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
|
||||
vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
|
||||
// }
|
||||
.endm
|
||||
#endif
|
||||
|
||||
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
|
||||
@ -1599,4 +1657,544 @@ w4_mc_chroma_loop: // each two pxl row
|
||||
|
||||
pop {r4, r5, r6}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
// McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight)
// Horizontal filtering of rows that are 17 pixels wide, one row per loop pass.
// Per the C prototype: r0 = pSrc, r1 = iSrcStride, r2 = pDst, r3 = iDstStride;
// iHeight is the fifth argument and is read from the stack into r4.
// The first 16 pixels go through FILTER_6TAG_8BITS (defined outside this excerpt),
// the 17th through FILTER_SINGLE_TAG_8BITS.
// NOTE(review): q4-q7 (d8-d15) are overwritten here without being saved; AAPCS
// lists them as callee-saved, so confirm that callers do not depend on them.
WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
|
||||
push {r4-r5}
|
||||
mov r4, #20
|
||||
mov r5, #1
|
||||
sub r4, r4, r4, lsl #(16-2) // r4 = 20 - (20 << 14) = 0xFFFB0014
|
||||
lsl r5, #16 // r5 = 0x00010000
|
||||
ror r4, #16 // r4 = 0x0014FFFB
|
||||
vmov d3, r5, r4 // 0x0014FFFB00010000
|
||||
|
||||
|
||||
sub r3, #16 // the 16-byte store below post-increments r2 by 16 already
|
||||
ldr r4, [sp, #8] // iHeight: first stack argument, above the two pushed registers
|
||||
|
||||
|
||||
sub r0, #2
|
||||
vmov.u16 q14, #0x0014 // 20
|
||||
vshr.u16 q15, q14, #2 // 5
|
||||
|
||||
|
||||
w17_h_mc_luma_loop:
|
||||
vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
|
||||
|
||||
|
||||
vext.8 q2, q0, q1, #1 //q2=src[-1]
|
||||
vext.8 q3, q0, q1, #2 //q3=src[0]
|
||||
vext.8 q4, q0, q1, #3 //q4=src[1]
|
||||
vext.8 q5, q0, q1, #4 //q5=src[2]
|
||||
vext.8 q6, q0, q1, #5 //q6=src[3]
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d14, q14, q15
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d15, q14, q15
|
||||
|
||||
|
||||
vst1.u8 {d14, d15}, [r2]! //write [0:15] Byte
|
||||
|
||||
|
||||
vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
|
||||
FILTER_SINGLE_TAG_8BITS d2, d3, d14, q7, q1
|
||||
|
||||
|
||||
vst1.u8 {d2[0]}, [r2], r3 //write byte [16] (the 17th), then step to the next row
|
||||
|
||||
|
||||
sub r4, #1
|
||||
cmp r4, #0
|
||||
bne w17_h_mc_luma_loop
|
||||
pop {r4-r5}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight)
// Horizontal filtering of rows that are 9 pixels wide, one row per loop pass.
// Per the C prototype: r0 = pSrc, r1 = iSrcStride, r2 = pDst, r3 = iDstStride;
// iHeight is the fifth argument and is read from the stack into r4.
// The first 8 pixels go through FILTER_6TAG_8BITS (defined outside this excerpt),
// the 9th through FILTER_SINGLE_TAG_8BITS.
// NOTE(review): d8, d14 and d15 are overwritten here without being saved; AAPCS
// lists d8-d15 as callee-saved, so confirm that callers do not depend on them.
WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
|
||||
push {r4-r5}
|
||||
mov r4, #20
|
||||
mov r5, #1
|
||||
sub r4, r4, r4, lsl #(16-2) // r4 = 20 - (20 << 14) = 0xFFFB0014
|
||||
lsl r5, #16 // r5 = 0x00010000
|
||||
ror r4, #16 // r4 = 0x0014FFFB
|
||||
vmov d7, r5, r4 // 0x0014FFFB00010000
|
||||
|
||||
|
||||
sub r3, #8 // the 8-byte store below post-increments r2 by 8 already
|
||||
ldr r4, [sp, #8] // iHeight: first stack argument, above the two pushed registers
|
||||
|
||||
|
||||
sub r0, #2
|
||||
vmov.u16 q14, #0x0014 // 20
|
||||
vshr.u16 q15, q14, #2 // 5
|
||||
|
||||
|
||||
w9_h_mc_luma_loop:
|
||||
vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
|
||||
pld [r0]
|
||||
|
||||
|
||||
vext.8 d2, d0, d1, #1 //d2=src[-1]
|
||||
vext.8 d3, d0, d1, #2 //d3=src[0]
|
||||
vext.8 d4, d0, d1, #3 //d4=src[1]
|
||||
vext.8 d5, d0, d1, #4 //d5=src[2]
|
||||
vext.8 d6, d0, d1, #5 //d6=src[3]
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d8, q14, q15
|
||||
|
||||
|
||||
sub r4, #1
|
||||
vst1.u8 {d8}, [r2]! //write [0:7] Byte
|
||||
|
||||
|
||||
vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
|
||||
FILTER_SINGLE_TAG_8BITS d2, d7, d14, q7, q1
|
||||
vst1.u8 {d2[0]}, [r2], r3 //write byte [8] (the 9th), then step to the next row
|
||||
|
||||
|
||||
cmp r4, #0
|
||||
bne w9_h_mc_luma_loop
|
||||
pop {r4-r5}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight)
// Vertical filtering of 16-byte-wide rows using FILTER_6TAG_8BITS (defined outside
// this excerpt). Per the C prototype: r0 = pSrc, r1 = iSrcStride, r2 = pDst,
// r3 = iDstStride; iHeight is the fifth argument and is read from the stack into r4.
// The loop is unrolled eight rows deep and exits when the counter equals 1, after
// which one final row is produced, so iHeight must be of the form 8*k+1.
// NOTE(review): q4-q6 (d8-d13) are overwritten here without being saved; AAPCS
// lists d8-d15 as callee-saved, so confirm that callers do not depend on them.
WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
|
||||
push {r4}
|
||||
ldr r4, [sp, #4] // iHeight: first stack argument, above the pushed r4
|
||||
|
||||
|
||||
sub r0, r0, r1, lsl #1 //src[-2*src_stride]
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
vmov.u16 q14, #0x0014 // 20
|
||||
vld1.u8 {q0}, [r0], r1 //q0=src[-2]
|
||||
vld1.u8 {q1}, [r0], r1 //q1=src[-1]
|
||||
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
vshr.u16 q15, q14, #2 // 5
|
||||
vld1.u8 {q2}, [r0], r1 //q2=src[0]
|
||||
vld1.u8 {q3}, [r0], r1 //q3=src[1]
|
||||
vld1.u8 {q4}, [r0], r1 //q4=src[2]
|
||||
|
||||
|
||||
w17_v_mc_luma_loop:
|
||||
|
||||
|
||||
vld1.u8 {q5}, [r0], r1 //q5=src[3]
|
||||
|
||||
|
||||
// Each step below filters six consecutive rows held in q0-q5 (low halves, then
// high halves) into q6, and reloads the oldest register with the next source row.
FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
|
||||
vld1.u8 {q0}, [r0], r1 //read 2nd row
|
||||
vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
|
||||
vld1.u8 {q1}, [r0], r1 //read 3rd row
|
||||
vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15
|
||||
vld1.u8 {q2}, [r0], r1 //read 4th row
|
||||
vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15
|
||||
vld1.u8 {q3}, [r0], r1 //read 5th row
|
||||
vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15
|
||||
vld1.u8 {q4}, [r0], r1 //read 6th row
|
||||
vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15
|
||||
vld1.u8 {q5}, [r0], r1 //read 7th row
|
||||
vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
|
||||
vld1.u8 {q0}, [r0], r1 //read 8th row
|
||||
vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
|
||||
vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
|
||||
|
||||
|
||||
// Rotate the five most recent rows back into q0-q4 for the next pass.
//q2, q3, q4, q5, q0 --> q0~q4
|
||||
vswp q0, q4
|
||||
vswp q0, q2
|
||||
vmov q1, q3
|
||||
vmov q3, q5 //q0~q4
|
||||
|
||||
|
||||
sub r4, #8
|
||||
cmp r4, #1 // one row is left for the tail below
|
||||
bne w17_v_mc_luma_loop
|
||||
// the last 16Bytes
|
||||
vld1.u8 {q5}, [r0], r1 //q5=src[3]
|
||||
FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
|
||||
FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
|
||||
vst1.u8 {q6}, [r2], r3 //write the final 16Byte row
|
||||
|
||||
|
||||
pop {r4}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight)
// Vertical filtering of 8-byte-wide rows using FILTER_6TAG_8BITS (defined outside
// this excerpt). Per the C prototype: r0 = pSrc, r1 = iSrcStride, r2 = pDst,
// r3 = iDstStride; iHeight is the fifth argument and is read from the stack into r4.
// The loop is unrolled four rows deep and exits when the counter equals 1, after
// which one final row is produced, so iHeight must be of the form 4*k+1.
// NOTE(review): d12 is overwritten here without being saved; AAPCS lists d8-d15
// as callee-saved, so confirm that callers do not depend on it.
WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
|
||||
push {r4}
|
||||
ldr r4, [sp, #4] // iHeight: first stack argument, above the pushed r4
|
||||
|
||||
|
||||
sub r0, r0, r1, lsl #1 //src[-2*src_stride]
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
vmov.u16 q14, #0x0014 // 20
|
||||
vld1.u8 {d0}, [r0], r1 //d0=src[-2]
|
||||
vld1.u8 {d1}, [r0], r1 //d1=src[-1]
|
||||
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
vshr.u16 q15, q14, #2 // 5
|
||||
vld1.u8 {d2}, [r0], r1 //d2=src[0]
|
||||
vld1.u8 {d3}, [r0], r1 //d3=src[1]
|
||||
|
||||
|
||||
vld1.u8 {d4}, [r0], r1 //d4=src[2]
|
||||
vld1.u8 {d5}, [r0], r1 //d5=src[3]
|
||||
|
||||
|
||||
w9_v_mc_luma_loop:
|
||||
|
||||
|
||||
// Each step filters the six rows held in d0-d5 into d12 and reloads the oldest
// register with the next source row.
pld [r0]
|
||||
FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
|
||||
vld1.u8 {d0}, [r0], r1 //read 2nd row
|
||||
vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
|
||||
|
||||
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15
|
||||
vld1.u8 {d1}, [r0], r1 //read 3rd row
|
||||
vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
|
||||
|
||||
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
|
||||
vld1.u8 {d2}, [r0], r1 //read 4th row
|
||||
vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
|
||||
|
||||
|
||||
pld [r0]
|
||||
FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15
|
||||
vld1.u8 {d3}, [r0], r1 //read 5th row
|
||||
vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
|
||||
|
||||
|
||||
// Rotate the six live rows back into d0-d5 for the next pass.
//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
|
||||
vswp q0, q2
|
||||
vswp q1, q2
|
||||
|
||||
|
||||
sub r4, #4
|
||||
cmp r4, #1 // one row is left for the tail below
|
||||
bne w9_v_mc_luma_loop
|
||||
|
||||
|
||||
FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
|
||||
vst1.u8 {d12}, [r2], r3 //write last 8Byte
|
||||
|
||||
|
||||
pop {r4}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight)
// Applies the vertical filter and then the horizontal filter to produce rows of
// 17 output pixels. Per the C prototype: r0 = pSrc, r1 = iSrcStride, r2 = pDst,
// r3 = iDstStride; iHeight is the fifth argument and is read from the stack into r4.
// Each loop pass emits four rows and the loop exits when the counter equals 1,
// after which one final row is emitted, so iHeight must be of the form 4*k+1.
// Every row is written as 16 bytes with post-increment ("[r2]!") followed by the
// single 17th byte with "[r2], r3"; r3 is reduced by 16 up front so that this
// pair of stores advances pDst by exactly iDstStride.
// NOTE(review): d8-d15 are overwritten here without being saved; AAPCS lists
// them as callee-saved, so confirm that callers do not depend on them.
WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
    push        {r4}
    ldr         r4, [sp, #4]            // iHeight: first stack argument, above the pushed r4

    sub         r0, #2                  //src[-2]
    sub         r0, r0, r1, lsl #1      //src[-2*src_stride-2]
    pld         [r0]
    pld         [r0, r1]

    vmov.u16    q14, #0x0014            // 20
    vld1.u8     {d0-d2}, [r0], r1       //use 21(17+5), =src[-2]
    vld1.u8     {d3-d5}, [r0], r1       //use 21(17+5), =src[-1]

    pld         [r0]
    pld         [r0, r1]
    vshr.u16    q15, q14, #2            // 5

    vld1.u8     {d6-d8}, [r0], r1       //use 21(17+5), =src[0]
    vld1.u8     {d9-d11}, [r0], r1      //use 21(17+5), =src[1]
    pld         [r0]
    pld         [r0, r1]
    vld1.u8     {d12-d14}, [r0], r1     //use 21(17+5), =src[2]
    sub         r3, #16                 // the 16-byte store of each row already advances r2 by 16

w17_hv_mc_luma_loop:

    vld1.u8     {d15-d17}, [r0], r1     //use 21(17+5), =src[3]
    //the 1st row
    pld         [r0]
    // vertical filter into q9/q10
    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15  // 8 avail
    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7,d10, d13, d16,q10, q14, q15  // 8 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d0   //output to q0[0]
    // vertical filter into q10/q11
    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8,d11, d14, d17,q11, q14, q15  // only 6 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d1    //output to q0[1]
    vst1.u8     {d0, d1}, [r2]!         //write bytes [0:15]
    UNPACK_1_IN_8x16BITS_TO_8BITS   d2, d22, d23, q11   //output to d2[0]
    vst1.u8     {d2[0]}, [r2], r3       //write byte [16] (the 17th), then step to the next row

    vld1.u8     {d0-d2}, [r0], r1       //read 2nd row
    //the 2nd row
    pld         [r0]
    // vertical filter into q9/q10
    FILTER_6TAG_8BITS_TO_16BITS     d3, d6, d9, d12, d15, d0, q9, q14, q15  // 8 avail
    FILTER_6TAG_8BITS_TO_16BITS     d4, d7,d10, d13, d16, d1,q10, q14, q15  // 8 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d3   //output to d3
    // vertical filter into q10/q11
    FILTER_6TAG_8BITS_TO_16BITS     d5, d8,d11, d14, d17, d2,q11, q14, q15  // only 6 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d4    //output to d4
    vst1.u8     {d3, d4}, [r2]!         //write bytes [0:15]
    UNPACK_1_IN_8x16BITS_TO_8BITS   d5, d22, d23, q11   //output to d5[0]
    vst1.u8     {d5[0]}, [r2], r3       //write byte [16] (the 17th), then step to the next row

    vld1.u8     {d3-d5}, [r0], r1       //read 3rd row
    //the 3rd row
    pld         [r0]
    // vertical filter into q9/q10
    FILTER_6TAG_8BITS_TO_16BITS     d6, d9, d12, d15, d0, d3, q9, q14, q15  // 8 avail
    FILTER_6TAG_8BITS_TO_16BITS     d7,d10, d13, d16, d1, d4,q10, q14, q15  // 8 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d6   //output to d6
    // vertical filter into q10/q11
    FILTER_6TAG_8BITS_TO_16BITS     d8,d11, d14, d17, d2, d5,q11, q14, q15  // only 6 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d7    //output to d7
    vst1.u8     {d6, d7}, [r2]!         //write bytes [0:15]
    UNPACK_1_IN_8x16BITS_TO_8BITS   d8, d22, d23, q11   //output to d8[0]
    vst1.u8     {d8[0]}, [r2], r3       //write byte [16] (the 17th), then step to the next row

    vld1.u8     {d6-d8}, [r0], r1       //read 4th row
    //the 4th row
    pld         [r0]
    // vertical filter into q9/q10
    FILTER_6TAG_8BITS_TO_16BITS     d9, d12, d15, d0, d3, d6, q9, q14, q15  // 8 avail
    FILTER_6TAG_8BITS_TO_16BITS     d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d9   //output to d9
    // vertical filter into q10/q11
    FILTER_6TAG_8BITS_TO_16BITS     d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d10   //output to d10
    // Post-increment by 16 here, exactly as rows 1-3 do. Using "[r2], r3" for this
    // store would put the 17th byte at offset iDstStride-16 instead of 16 and move
    // r2 by 2*(iDstStride-16) for this row.
    vst1.u8     {d9, d10}, [r2]!        //write bytes [0:15]
    UNPACK_1_IN_8x16BITS_TO_8BITS   d11, d22, d23, q11  //output to d11[0]
    vst1.u8     {d11[0]}, [r2], r3      //write byte [16] (the 17th), then step to the next row

    // Rotate the five most recent source rows back into d0-d14 for the next pass.
    //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
    vswp        q0, q6
    vswp        q6, q3
    vmov        q5, q2
    vmov        q2, q8

    vmov        d20,d8                  // park d8 while q4 is overwritten
    vmov        q4, q1
    vmov        q1, q7
    vmov        d14,d20

    sub         r4, #4
    cmp         r4, #1                  // one row is left for the tail below
    bne         w17_hv_mc_luma_loop
    //the last row
    vld1.u8     {d15-d17}, [r0], r1     //use 21(17+5), =src[3]
    // vertical filter into q9/q10
    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15  // 8 avail
    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7,d10, d13, d16,q10, q14, q15  // 8 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q9, q10, q11, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q11, q12, q13, d0   //output to q0[0]
    // vertical filter into q10/q11
    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8,d11, d14, d17,q11, q14, q15  // only 6 avail
    // horizontal filter
    UNPACK_2_16BITS_TO_ABC          q10, q11, q9, q12, q13
    FILTER_3_IN_16BITS_TO_8BITS     q9, q12, q13, d1    //output to q0[1]
    vst1.u8     {q0}, [r2]!             //write bytes [0:15]
    UNPACK_1_IN_8x16BITS_TO_8BITS   d2, d22, d23, q11   //output to d2[0]
    vst1.u8     {d2[0]}, [r2], r3       //write byte [16] (the 17th)

    pop         {r4}
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight)
// Applies the vertical filter and then the horizontal filter to produce rows of
// 9 output pixels. Per the C prototype: r0 = pSrc, r1 = iSrcStride, r2 = pDst,
// r3 = iDstStride; iHeight is the fifth argument and is read from the stack into r4.
// Each loop pass emits four rows and the loop exits when the counter equals 1,
// after which one final row is emitted, so iHeight must be of the form 4*k+1.
// Every row is written as 8 bytes with post-increment followed by the single 9th
// byte with "[r2], r3"; r3 is reduced by 8 up front so the pair of stores advances
// pDst by exactly iDstStride.
// NOTE(review): q4-q7 (d8-d15) are overwritten here without being saved; AAPCS
// lists them as callee-saved, so confirm that callers do not depend on them.
WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
|
||||
push {r4}
|
||||
ldr r4, [sp, #4] // iHeight: first stack argument, above the pushed r4
|
||||
|
||||
|
||||
sub r0, #2 //src[-2]
|
||||
sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
|
||||
|
||||
vmov.u16 q14, #0x0014 // 20
|
||||
vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
|
||||
vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
|
||||
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
vshr.u16 q15, q14, #2 // 5
|
||||
|
||||
|
||||
vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
|
||||
vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
|
||||
sub r3, #8 // the 8-byte store of each row already advances r2 by 8
|
||||
|
||||
|
||||
w9_hv_mc_luma_loop:
|
||||
|
||||
|
||||
vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3]
|
||||
//the 1st row
|
||||
pld [r0]
|
||||
// vertical filter into q6/q7
|
||||
FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
|
||||
FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail
|
||||
// horizontal filter
|
||||
UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
|
||||
FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
|
||||
vst1.u8 d12, [r2]! //write 8Byte
|
||||
UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
|
||||
vst1.u8 {d13[0]}, [r2], r3 //write byte [8] (the 9th), then step to the next row
|
||||
|
||||
|
||||
vld1.u8 {q0}, [r0], r1 //read 2nd row
|
||||
//the 2nd row
|
||||
pld [r0]
|
||||
// vertical filter into q6/q7
|
||||
FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail
|
||||
FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 6 avail
|
||||
// horizontal filter
|
||||
UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
|
||||
FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
|
||||
vst1.u8 d12, [r2]! //write 8Byte
|
||||
UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
|
||||
vst1.u8 {d13[0]}, [r2], r3 //write byte [8] (the 9th), then step to the next row
|
||||
|
||||
|
||||
vld1.u8 {q1}, [r0], r1 //read 3rd row
|
||||
//the 3rd row
|
||||
pld [r0]
|
||||
// vertical filter into q6/q7
|
||||
FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail
|
||||
FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 6 avail
|
||||
// horizontal filter
|
||||
UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
|
||||
FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
|
||||
vst1.u8 d12, [r2]! //write 8Byte
|
||||
UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
|
||||
vst1.u8 {d13[0]}, [r2], r3 //write byte [8] (the 9th), then step to the next row
|
||||
|
||||
|
||||
vld1.u8 {q2}, [r0], r1 //read 4th row
|
||||
//the 4th row
|
||||
pld [r0]
|
||||
// vertical filter into q6/q7
|
||||
FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail
|
||||
FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 6 avail
|
||||
// horizontal filter
|
||||
UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
|
||||
FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
|
||||
vst1.u8 d12, [r2]! //write 8Byte
|
||||
UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
|
||||
vst1.u8 {d13[0]}, [r2], r3 //write byte [8] (the 9th), then step to the next row
|
||||
|
||||
|
||||
// Rotate the five most recent source rows back into q0-q4 for the next pass.
//q4~q5, q0~q2, --> q0~q4
|
||||
vswp q0, q4
|
||||
vswp q2, q4
|
||||
vmov q3, q1
|
||||
vmov q1, q5
|
||||
|
||||
|
||||
sub r4, #4
|
||||
cmp r4, #1 // one row is left for the tail below
|
||||
bne w9_hv_mc_luma_loop
|
||||
//the last row
|
||||
vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3]
|
||||
// vertical filter into q6/q7
|
||||
FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
|
||||
FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail
|
||||
// horizontal filter
|
||||
UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
|
||||
FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
|
||||
vst1.u8 d12, [r2]! //write 8Byte
|
||||
UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
|
||||
vst1.u8 {d13[0]}, [r2], r3 //write byte [8] (the 9th)
|
||||
pop {r4}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// PixStrideAvgWidthEq16_neon(pDst, iDstStride, pSrcA, iSrcStrideA, pSrcB, iSrcStrideB, iHeight)
// Combines 16-byte rows of pSrcA and pSrcB with AVERAGE_TWO_8BITS (defined outside
// this excerpt; presumably a per-byte average) and stores the result to pDst.
// Per the C prototype: r0 = pDst, r1 = iDstStride, r2 = pSrcA, r3 = iSrcStrideA;
// pSrcB, iSrcStrideB and iHeight come from the stack into r4, r5 and r6.
// Four rows are handled per pass and the loop exits only when the counter reaches
// exactly 0, so iHeight has to be a positive multiple of 4.
// NOTE(review): q4-q7 (d8-d15) are overwritten here without being saved; AAPCS
// lists them as callee-saved, so confirm that callers do not depend on them.
WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
|
||||
push {r4, r5, r6}
|
||||
ldr r4, [sp, #12] // pSrcB: first stack argument, above the three pushed registers
|
||||
ldr r5, [sp, #16] // iSrcStrideB
|
||||
ldr r6, [sp, #20] // iHeight
|
||||
|
||||
|
||||
enc_w16_pix_avg_loop:
|
||||
vld1.u8 {q0}, [r2], r3
|
||||
vld1.u8 {q1}, [r4], r5
|
||||
vld1.u8 {q2}, [r2], r3
|
||||
vld1.u8 {q3}, [r4], r5
|
||||
|
||||
|
||||
vld1.u8 {q4}, [r2], r3
|
||||
vld1.u8 {q5}, [r4], r5
|
||||
vld1.u8 {q6}, [r2], r3
|
||||
vld1.u8 {q7}, [r4], r5
|
||||
|
||||
|
||||
// Even q registers hold the pSrcA rows, odd ones the matching pSrcB rows; each
// result overwrites the pSrcA register, one d half at a time.
AVERAGE_TWO_8BITS d0, d0, d2
|
||||
AVERAGE_TWO_8BITS d1, d1, d3
|
||||
vst1.u8 {q0}, [r0], r1
|
||||
|
||||
|
||||
AVERAGE_TWO_8BITS d4, d4, d6
|
||||
AVERAGE_TWO_8BITS d5, d5, d7
|
||||
vst1.u8 {q2}, [r0], r1
|
||||
|
||||
|
||||
AVERAGE_TWO_8BITS d8, d8, d10
|
||||
AVERAGE_TWO_8BITS d9, d9, d11
|
||||
vst1.u8 {q4}, [r0], r1
|
||||
|
||||
|
||||
AVERAGE_TWO_8BITS d12, d12, d14
|
||||
AVERAGE_TWO_8BITS d13, d13, d15
|
||||
vst1.u8 {q6}, [r0], r1
|
||||
|
||||
|
||||
sub r6, #4
|
||||
cmp r6, #0
|
||||
bne enc_w16_pix_avg_loop
|
||||
|
||||
|
||||
pop {r4, r5, r6}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
|
||||
// PixStrideAvgWidthEq8_neon(pDst, iDstStride, pSrcA, iSrcStrideA, pSrcB, iSrcStrideB, iHeight)
// Combines 8-byte rows of pSrcA and pSrcB with AVERAGE_TWO_8BITS (defined outside
// this excerpt; presumably a per-byte average) and stores the result to pDst.
// Per the C prototype: r0 = pDst, r1 = iDstStride, r2 = pSrcA, r3 = iSrcStrideA;
// pSrcB, iSrcStrideB and iHeight come from the stack into r4, r5 and r6.
// Four rows (two pairs) are handled per pass and the loop exits only when the
// counter reaches exactly 0, so iHeight has to be a positive multiple of 4.
WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
|
||||
push {r4, r5, r6}
|
||||
ldr r4, [sp, #12] // pSrcB: first stack argument, above the three pushed registers
|
||||
ldr r5, [sp, #16] // iSrcStrideB
|
||||
ldr r6, [sp, #20] // iHeight
|
||||
enc_w8_pix_avg_loop:
|
||||
|
||||
|
||||
vld1.u8 {d0}, [r2], r3
|
||||
vld1.u8 {d2}, [r4], r5
|
||||
vld1.u8 {d1}, [r2], r3
|
||||
vld1.u8 {d3}, [r4], r5
|
||||
|
||||
|
||||
// d0/d1 hold two pSrcA rows and d2/d3 the matching pSrcB rows; each result
// overwrites the pSrcA register.
AVERAGE_TWO_8BITS d0, d0, d2
|
||||
AVERAGE_TWO_8BITS d1, d1, d3
|
||||
vst1.u8 {d0}, [r0], r1
|
||||
vst1.u8 {d1}, [r0], r1
|
||||
|
||||
|
||||
vld1.u8 {d4}, [r2], r3
|
||||
vld1.u8 {d6}, [r4], r5
|
||||
vld1.u8 {d5}, [r2], r3
|
||||
vld1.u8 {d7}, [r4], r5
|
||||
|
||||
|
||||
AVERAGE_TWO_8BITS d4, d4, d6
|
||||
AVERAGE_TWO_8BITS d5, d5, d7
|
||||
vst1.u8 {d4}, [r0], r1
|
||||
vst1.u8 {d5}, [r0], r1
|
||||
|
||||
|
||||
sub r6, #4
|
||||
cmp r6, #0
|
||||
bne enc_w8_pix_avg_loop
|
||||
|
||||
|
||||
pop {r4, r5, r6}
|
||||
WELS_ASM_FUNC_END
|
||||
|
||||
#endif
|
@ -480,6 +480,99 @@ void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int
|
||||
}
|
||||
|
||||
#endif //X86_ASM
|
||||
|
||||
//***************************************************************************//
|
||||
// NEON implementation //
|
||||
//***************************************************************************//
|
||||
#if defined(HAVE_NEON)
|
||||
void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight) {
|
||||
if (iWidth == 17)
|
||||
McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight){
|
||||
if (iWidth == 16)
|
||||
McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 8)
|
||||
McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
|
||||
int32_t iWidth, int32_t iHeight){
|
||||
if (iWidth == 17)
|
||||
McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else //if (iWidth == 9)
|
||||
McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
|
||||
ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
|
||||
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
|
||||
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
|
||||
PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
|
||||
}
|
||||
void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
|
||||
SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
|
||||
const int32_t kiD8x = sMv.iMvX&0x07;
|
||||
const int32_t kiD8y = sMv.iMvY&0x07;
|
||||
if (0 == kiD8x && 0 == kiD8y) {
|
||||
if(8 == iWidth)
|
||||
McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
else // iWidth == 4
|
||||
McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
|
||||
}
|
||||
else {
|
||||
if(8 == iWidth)
|
||||
McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
else //if(4 == iWidth)
|
||||
McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef void (*PixelAvgFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t, int32_t);
|
||||
void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
static PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c};
|
||||
@ -498,6 +591,14 @@ void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
|
||||
};
|
||||
#endif
|
||||
#if defined(HAVE_NEON)
|
||||
static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_neon[16] = { //[x][y]
|
||||
McCopyWidthEq16_neon, McHorVer10WidthEq16_neon, McHorVer20WidthEq16_neon, McHorVer30WidthEq16_neon,
|
||||
McHorVer01WidthEq16_neon, EncMcHorVer11_neon, EncMcHorVer21_neon, EncMcHorVer31_neon,
|
||||
McHorVer02WidthEq16_neon, EncMcHorVer12_neon, McHorVer22WidthEq16_neon, EncMcHorVer32_neon,
|
||||
McHorVer03WidthEq16_neon, EncMcHorVer13_neon, EncMcHorVer23_neon, EncMcHorVer33_neon
|
||||
};
|
||||
#endif
|
||||
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
|
||||
@ -538,5 +639,17 @@ void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
}
|
||||
|
||||
#endif //(X86_ASM)
|
||||
|
||||
#if defined(HAVE_NEON)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_neon;
|
||||
pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_neon;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_neon;
|
||||
pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_neon;
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//heigh+1:8/16
|
||||
pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user