Partly add arm asm code to encoder.
This commit is contained in:
@@ -7,6 +7,7 @@
|
|||||||
objects = {
|
objects = {
|
||||||
|
|
||||||
/* Begin PBXBuildFile section */
|
/* Begin PBXBuildFile section */
|
||||||
|
4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture.S */; };
|
||||||
4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
|
4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
|
||||||
4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; };
|
4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; };
|
||||||
4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
|
4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
|
||||||
@@ -46,6 +47,7 @@
|
|||||||
/* End PBXCopyFilesBuildPhase section */
|
/* End PBXCopyFilesBuildPhase section */
|
||||||
|
|
||||||
/* Begin PBXFileReference section */
|
/* Begin PBXFileReference section */
|
||||||
|
4C34067C18C5C94C00DFA14A /* expand_picture.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture.S; sourceTree = "<group>"; };
|
||||||
4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; };
|
4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
||||||
4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
@@ -148,6 +150,7 @@
|
|||||||
4CE4472F18BC61650017DF25 /* common */ = {
|
4CE4472F18BC61650017DF25 /* common */ = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
|
4C34067C18C5C94C00DFA14A /* expand_picture.S */,
|
||||||
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
|
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
|
||||||
4CE447BC18C085320017DF25 /* deblocking_neon.S */,
|
4CE447BC18C085320017DF25 /* deblocking_neon.S */,
|
||||||
4CE4473118BC61650017DF25 /* cpu.cpp */,
|
4CE4473118BC61650017DF25 /* cpu.cpp */,
|
||||||
@@ -257,6 +260,7 @@
|
|||||||
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
|
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
|
||||||
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
|
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
|
||||||
4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
|
4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
|
||||||
|
4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */,
|
||||||
4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
|
4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
|
||||||
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
|
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
|
||||||
4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,
|
4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,
|
||||||
|
|||||||
@@ -7,6 +7,12 @@
|
|||||||
objects = {
|
objects = {
|
||||||
|
|
||||||
/* Begin PBXBuildFile section */
|
/* Begin PBXBuildFile section */
|
||||||
|
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
|
||||||
|
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
|
||||||
|
4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066818C57D0400DFA14A /* mc_neon.S */; };
|
||||||
|
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
|
||||||
|
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; };
|
||||||
|
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; };
|
||||||
4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
|
4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
|
||||||
4CE4432318B6FFA00017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4432218B6FFA00017DF25 /* XCTest.framework */; };
|
4CE4432318B6FFA00017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4432218B6FFA00017DF25 /* XCTest.framework */; };
|
||||||
4CE4432418B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
|
4CE4432418B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
|
||||||
@@ -71,6 +77,12 @@
|
|||||||
/* End PBXCopyFilesBuildPhase section */
|
/* End PBXCopyFilesBuildPhase section */
|
||||||
|
|
||||||
/* Begin PBXFileReference section */
|
/* Begin PBXFileReference section */
|
||||||
|
4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
|
||||||
|
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
|
||||||
|
4C34066818C57D0400DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
|
||||||
|
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
|
||||||
|
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
|
||||||
|
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
|
||||||
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
|
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
||||||
4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
@@ -189,6 +201,19 @@
|
|||||||
/* End PBXFrameworksBuildPhase section */
|
/* End PBXFrameworksBuildPhase section */
|
||||||
|
|
||||||
/* Begin PBXGroup section */
|
/* Begin PBXGroup section */
|
||||||
|
4C34066418C57D0400DFA14A /* arm */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
4C34066618C57D0400DFA14A /* intra_pred_neon.S */,
|
||||||
|
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */,
|
||||||
|
4C34066818C57D0400DFA14A /* mc_neon.S */,
|
||||||
|
4C34066918C57D0400DFA14A /* memory_neon.S */,
|
||||||
|
4C34066A18C57D0400DFA14A /* pixel_neon.S */,
|
||||||
|
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */,
|
||||||
|
);
|
||||||
|
path = arm;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
4CE4430818B6FFA00017DF25 = {
|
4CE4430818B6FFA00017DF25 = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
@@ -249,6 +274,7 @@
|
|||||||
4CE446A118BC605B0017DF25 /* core */ = {
|
4CE446A118BC605B0017DF25 /* core */ = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
|
4C34066418C57D0400DFA14A /* arm */,
|
||||||
4CE446A918BC605C0017DF25 /* inc */,
|
4CE446A918BC605C0017DF25 /* inc */,
|
||||||
4CE446DC18BC605C0017DF25 /* src */,
|
4CE446DC18BC605C0017DF25 /* src */,
|
||||||
);
|
);
|
||||||
@@ -466,14 +492,18 @@
|
|||||||
4CE4472A18BC605C0017DF25 /* utils.cpp in Sources */,
|
4CE4472A18BC605C0017DF25 /* utils.cpp in Sources */,
|
||||||
4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,
|
4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,
|
||||||
4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,
|
4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,
|
||||||
|
4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */,
|
||||||
4CE4472D18BC605C0017DF25 /* welsCodecTrace.cpp in Sources */,
|
4CE4472D18BC605C0017DF25 /* welsCodecTrace.cpp in Sources */,
|
||||||
4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,
|
4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,
|
||||||
|
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */,
|
||||||
4CE4471F18BC605C0017DF25 /* ref_list_mgr_svc.cpp in Sources */,
|
4CE4471F18BC605C0017DF25 /* ref_list_mgr_svc.cpp in Sources */,
|
||||||
4CE4472218BC605C0017DF25 /* slice_multi_threading.cpp in Sources */,
|
4CE4472218BC605C0017DF25 /* slice_multi_threading.cpp in Sources */,
|
||||||
4CE4471518BC605C0017DF25 /* expand_pic.cpp in Sources */,
|
4CE4471518BC605C0017DF25 /* expand_pic.cpp in Sources */,
|
||||||
|
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */,
|
||||||
4CE4470F18BC605C0017DF25 /* deblocking.cpp in Sources */,
|
4CE4470F18BC605C0017DF25 /* deblocking.cpp in Sources */,
|
||||||
4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */,
|
4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */,
|
||||||
4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */,
|
4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */,
|
||||||
|
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */,
|
||||||
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
|
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
|
||||||
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
|
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
|
||||||
4CE4471718BC605C0017DF25 /* mc.cpp in Sources */,
|
4CE4471718BC605C0017DF25 /* mc.cpp in Sources */,
|
||||||
@@ -484,12 +514,14 @@
|
|||||||
4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */,
|
4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */,
|
||||||
4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */,
|
4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */,
|
||||||
4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */,
|
4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */,
|
||||||
|
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */,
|
||||||
4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */,
|
4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */,
|
||||||
4CE4472618BC605C0017DF25 /* svc_encode_slice.cpp in Sources */,
|
4CE4472618BC605C0017DF25 /* svc_encode_slice.cpp in Sources */,
|
||||||
4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */,
|
4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */,
|
||||||
4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */,
|
4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */,
|
||||||
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */,
|
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */,
|
||||||
4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */,
|
4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */,
|
||||||
|
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */,
|
||||||
);
|
);
|
||||||
runOnlyForDeploymentPostprocessing = 0;
|
runOnlyForDeploymentPostprocessing = 0;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -809,4 +809,232 @@ WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
|
|||||||
vst1.64 {d0-d2}, [r0]
|
vst1.64 {d0-d2}, [r0]
|
||||||
WELS_ASM_FUNC_END
|
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#ifdef APPLE_IOS
|
||||||
|
|
||||||
|
// BS_NZC_CHECK (Apple assembler form, positional arguments)
//   $0 : address of the 16 nzc bytes of the current macroblock
//   $1 : neighbour flags; bit 1 gates the load of the top neighbour,
//        bit 0 gates the load of the left neighbour
//   $2 : multiplier used to step back to the macroblock above
//        (presumably the MB stride -- confirm against the caller)
//   $3 : q register receiving current + above sums   (top direction)
//   $4 : q register receiving current + left sums    (left direction)
// Clobbers r6, r7, q0, q1 and the flags. When a gate bit is clear the
// neighbour lanes of d3 are not reloaded and keep their previous contents.
.macro BS_NZC_CHECK
    vld1.8      {d0, d1}, [$0]

    // ---- Arrange the input data: TOP ----
    ands        r6, $1, #2
    beq         bs_nzc_check_top_ready

    // r6 = $0 - 24 * $2 + 12 : four bytes taken from the block above
    sub         r6, $0, $2, lsl #4
    sub         r6, r6, $2, lsl #3
    add         r6, r6, #12
    vld1.32     {d3[1]}, [r6]

bs_nzc_check_top_ready:
    // q1 = { d3[1] , first 12 bytes of q0 } : every entry paired with the one 4 bytes earlier
    vext.8      q1, q1, q0, #12
    vadd.u8     $3, q0, q1

    // ---- Arrange the input data: LEFT ----
    ands        r6, $1, #1
    beq         bs_nzc_check_left_ready

    // Gather single bytes at $0-21, $0-17, $0-13 and $0-9 into d3[4..7]
    sub         r6, $0, #21
    add         r7, r6, #4
    vld1.8      {d3[4]}, [r6]
    add         r6, r7, #4
    vld1.8      {d3[5]}, [r7]
    add         r7, r6, #4
    vld1.8      {d3[6]}, [r6]
    vld1.8      {d3[7]}, [r7]

bs_nzc_check_left_ready:
    // Two byte zips transpose the 4x4 byte block held in d0:d1
    vzip.8      d0, d1
    vzip.8      d0, d1
    vext.8      q1, q1, q0, #12
    vadd.u8     $4, q0, q1
.endm
|
||||||
|
|
||||||
|
// BS_COMPARE_MV (Apple assembler form, positional arguments)
//   in : $0, $1 (const), $2 (const), $3 (const), $4 (const) -- five q registers
//        of signed 16-bit values; consecutive registers are compared pairwise
//   out: $5, $6 -- d registers; each byte is 0xFF when either 16-bit lane of the
//        corresponding pair differs by 4 or more, otherwise 0
// $0 is overwritten with the threshold. Clobbers r6 and q5-q8; note that
// q5-q7 alias d10-d15, which the AAPCS treats as callee-saved.
.macro BS_COMPARE_MV
    mov         r6, #4                  // threshold
    vabd.s16    q5, $0, $1
    vabd.s16    q6, $1, $2
    vdup.16     $0, r6                  // $0 is dead from here on, reuse it for the threshold
    vabd.s16    q7, $2, $3
    vabd.s16    q8, $3, $4

    // Turn every absolute difference into an all-ones / all-zeros mask
    vcge.s16    q5, q5, $0
    vcge.s16    q6, q6, $0
    vcge.s16    q7, q7, $0
    vcge.s16    q8, q8, $0

    // Fold adjacent 16-bit lanes; the order matters because sources are overwritten in place
    vpadd.i16   d10, d10, d11
    vpadd.i16   d11, d12, d13
    vpadd.i16   d12, d14, d15
    vpadd.i16   d13, d16, d17

    // Doubling and keeping the high byte narrows every non-zero lane to 0xFF
    vaddhn.i16  $5, q5, q5
    vaddhn.i16  $6, q6, q6
.endm
|
||||||
|
|
||||||
|
// BS_MV_CHECK (Apple assembler form, positional arguments)
//   $0     : address of 64 bytes of motion data for the current macroblock
//   $1     : neighbour flags; bit 1 gates the top load, bit 0 the left load
//   $2     : multiplier used to step back to the macroblock above
//            (presumably the MB stride -- confirm against the caller)
//   $3, $4 : d registers receiving the top-direction comparison result
//   $5, $6 : d registers receiving the left-direction comparison result
// Clobbers r6, r7, q0-q4 plus everything BS_COMPARE_MV clobbers (q5-q8).
// When a gate bit is clear q4 is not reloaded and keeps its previous contents.
.macro BS_MV_CHECK
    vldm        $0, {q0, q1, q2, q3}

    // ---- Arrange the input data: TOP ----
    ands        r6, $1, #2
    beq         bs_mv_check_top_ready

    // r6 = $0 - 64 * $2 + 48 : last 16 bytes of the block above
    sub         r6, $0, $2, lsl #6
    add         r6, r6, #48
    vld1.8      {q4}, [r6]

bs_mv_check_top_ready:
    BS_COMPARE_MV  q4, q0, q1, q2, q3, $3, $4

    // ---- Arrange the input data: LEFT ----
    ands        r6, $1, #1
    beq         bs_mv_check_left_ready

    // Gather 32-bit words at $0-52, $0-36, $0-20 and $0-4 into q4
    sub         r6, $0, #52
    add         r7, r6, #16
    vld1.32     {d8[0]}, [r6]
    add         r6, r7, #16
    vld1.32     {d8[1]}, [r7]
    add         r7, r6, #16
    vld1.32     {d9[0]}, [r6]
    vld1.32     {d9[1]}, [r7]

bs_mv_check_left_ready:
    // 4x4 transpose of 32-bit words across q0-q3
    vzip.32     q1, q3
    vzip.32     q0, q2
    vzip.32     q2, q3
    vzip.32     q0, q1
    BS_COMPARE_MV  q4, q0, q1, q2, q3, $5, $6
.endm
|
||||||
|
#else
|
||||||
|
|
||||||
|
// BS_NZC_CHECK (GNU assembler form, named arguments)
//   arg0 : address of the 16 nzc bytes of the current macroblock
//   arg1 : neighbour flags; bit 1 gates the load of the top neighbour,
//          bit 0 gates the load of the left neighbour
//   arg2 : multiplier used to step back to the macroblock above
//          (presumably the MB stride -- confirm against the caller)
//   arg3 : q register receiving current + above sums   (top direction)
//   arg4 : q register receiving current + left sums    (left direction)
// Clobbers r6, r7, q0, q1 and the flags. When a gate bit is clear the
// neighbour lanes of d3 are not reloaded and keep their previous contents.
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
    vld1.8      {d0, d1}, [\arg0]

    // ---- Arrange the input data: TOP ----
    ands        r6, \arg1, #2
    beq         bs_nzc_check_top_ready

    // r6 = arg0 - 24 * arg2 + 12 : four bytes taken from the block above
    sub         r6, \arg0, \arg2, lsl #4
    sub         r6, r6, \arg2, lsl #3
    add         r6, r6, #12
    vld1.32     {d3[1]}, [r6]

bs_nzc_check_top_ready:
    // q1 = { d3[1] , first 12 bytes of q0 } : every entry paired with the one 4 bytes earlier
    vext.8      q1, q1, q0, #12
    vadd.u8     \arg3, q0, q1

    // ---- Arrange the input data: LEFT ----
    ands        r6, \arg1, #1
    beq         bs_nzc_check_left_ready

    // Gather single bytes at arg0-21, arg0-17, arg0-13 and arg0-9 into d3[4..7]
    sub         r6, \arg0, #21
    add         r7, r6, #4
    vld1.8      {d3[4]}, [r6]
    add         r6, r7, #4
    vld1.8      {d3[5]}, [r7]
    add         r7, r6, #4
    vld1.8      {d3[6]}, [r6]
    vld1.8      {d3[7]}, [r7]

bs_nzc_check_left_ready:
    // Two byte zips transpose the 4x4 byte block held in d0:d1
    vzip.8      d0, d1
    vzip.8      d0, d1
    vext.8      q1, q1, q0, #12
    vadd.u8     \arg4, q0, q1
.endm
|
||||||
|
|
||||||
|
// BS_COMPARE_MV (GNU assembler form, named arguments)
//   in : arg0, arg1 (const), arg2 (const), arg3 (const), arg4 (const) -- five q
//        registers of signed 16-bit values; consecutive registers are compared pairwise
//   out: arg5, arg6 -- d registers; each byte is 0xFF when either 16-bit lane of the
//        corresponding pair differs by 4 or more, otherwise 0
// arg0 is overwritten with the threshold. Clobbers r6 and q5-q8; note that
// q5-q7 alias d10-d15, which the AAPCS treats as callee-saved.
.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6
    mov         r6, #4                  // threshold
    vabd.s16    q5, \arg0, \arg1
    vabd.s16    q6, \arg1, \arg2
    vdup.16     \arg0, r6               // arg0 is dead from here on, reuse it for the threshold
    vabd.s16    q7, \arg2, \arg3
    vabd.s16    q8, \arg3, \arg4

    // Turn every absolute difference into an all-ones / all-zeros mask
    vcge.s16    q5, q5, \arg0
    vcge.s16    q6, q6, \arg0
    vcge.s16    q7, q7, \arg0
    vcge.s16    q8, q8, \arg0

    // Fold adjacent 16-bit lanes; the order matters because sources are overwritten in place
    vpadd.i16   d10, d10, d11
    vpadd.i16   d11, d12, d13
    vpadd.i16   d12, d14, d15
    vpadd.i16   d13, d16, d17

    // Doubling and keeping the high byte narrows every non-zero lane to 0xFF
    vaddhn.i16  \arg5, q5, q5
    vaddhn.i16  \arg6, q6, q6
.endm
|
||||||
|
|
||||||
|
// BS_MV_CHECK (GNU assembler form, named arguments)
//   arg0       : address of 64 bytes of motion data for the current macroblock
//   arg1       : neighbour flags; bit 1 gates the top load, bit 0 the left load
//   arg2       : multiplier used to step back to the macroblock above
//                (presumably the MB stride -- confirm against the caller)
//   arg3, arg4 : d registers receiving the top-direction comparison result
//   arg5, arg6 : d registers receiving the left-direction comparison result
// Clobbers r6, r7, q0-q4 plus everything BS_COMPARE_MV clobbers (q5-q8).
// When a gate bit is clear q4 is not reloaded and keeps its previous contents.
.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
    vldm        \arg0, {q0, q1, q2, q3}

    // ---- Arrange the input data: TOP ----
    ands        r6, \arg1, #2
    beq         bs_mv_check_top_ready

    // r6 = arg0 - 64 * arg2 + 48 : last 16 bytes of the block above
    sub         r6, \arg0, \arg2, lsl #6
    add         r6, r6, #48
    vld1.8      {q4}, [r6]

bs_mv_check_top_ready:
    BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg3, \arg4

    // ---- Arrange the input data: LEFT ----
    ands        r6, \arg1, #1
    beq         bs_mv_check_left_ready

    // Gather 32-bit words at arg0-52, arg0-36, arg0-20 and arg0-4 into q4
    sub         r6, \arg0, #52
    add         r7, r6, #16
    vld1.32     {d8[0]}, [r6]
    add         r6, r7, #16
    vld1.32     {d8[1]}, [r7]
    add         r7, r6, #16
    vld1.32     {d9[0]}, [r6]
    vld1.32     {d9[1]}, [r7]

bs_mv_check_left_ready:
    // 4x4 transpose of 32-bit words across q0-q3
    vzip.32     q1, q3
    vzip.32     q0, q2
    vzip.32     q2, q3
    vzip.32     q0, q1
    BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg5, \arg6
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// DeblockingBSCalcEnc_neon
// Computes boundary-strength values (0, 1 or 2) for one macroblock and stores
// 32 bytes at the output pointer: the left-direction results first, then the
// top-direction results.
//   r0      : address handed to BS_NZC_CHECK (nzc bytes of the current MB)
//   r1      : address handed to BS_MV_CHECK  (motion data of the current MB)
//   r2      : neighbour flags (bit 1 = top, bit 0 = left, as tested by the macros)
//   r3      : multiplier the macros use to reach the MB above
//             (presumably the MB stride -- confirm against the C caller)
//   [stack] : first stack argument = output pointer
// BS_MV_CHECK / BS_COMPARE_MV overwrite q4-q7 (d8-d15). Those registers are
// callee-saved under the AAPCS, so they are preserved around the body.
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon

    stmdb       sp!, {r5-r7}
    vpush       {q4-q7}                 // preserve callee-saved d8-d15

    // 12 bytes of core registers + 64 bytes of NEON registers were pushed,
    // so the first stack argument now lives at sp + 76.
    ldr         r5, [sp, #76]           // r5 = output (BS) pointer

    /* Checking the nzc status */
    BS_NZC_CHECK r0, r2, r3, q14, q15   // q14 (top), q15 (left) hold the nzc sums

    /* For checking bS[I] = 2 */
    mov         r6, #2
    vcgt.s8     q14, q14, #0
    vdup.u8     q0, r6
    vcgt.s8     q15, q15, #0

    vand.u8     q14, q14, q0            // q14 = 2 where the nzc sum is non-zero, top direction
    vand.u8     q15, q15, q0            // q15 = 2 where the nzc sum is non-zero, left direction

    /* Checking the mv status */
    BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27  // q12 (top), q13 (left) hold the mv masks

    /* For checking bS[I] = 1 */
    mov         r6, #1
    vdup.u8     q0, r6

    vand.u8     q12, q12, q0            // q12 = 1 where the mv check fired, top direction
    vand.u8     q13, q13, q0            // q13 = 1 where the mv check fired, left direction

    /* bS[I] is the larger of the mv result (1) and the nzc result (2) */
    vmax.u8     q1, q12, q14
    vmax.u8     q0, q13, q15

    vst1.32     {q0, q1}, [r5]

    vpop        {q4-q7}
    ldmia       sp!, {r5-r7}
WELS_ASM_FUNC_END
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
137
codec/common/expand_picture.S
Executable file
137
codec/common/expand_picture.S
Executable file
@@ -0,0 +1,137 @@
|
|||||||
|
/*!
|
||||||
|
* \copy
|
||||||
|
* Copyright (c) 2013, Cisco Systems
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
.text
|
||||||
|
#include "arm_arch_common_macro.S"
|
||||||
|
|
||||||
|
|
||||||
|
// void ExpandPictureLuma_neon(uint8_t *pDst, const int32_t kiStride,
//                             const int32_t kiPicW, const int32_t kiPicH);
//   r0 = pDst, r1 = kiStride, r2 = kiPicW, r3 = kiPicH
// Pads the picture with a 32-pixel border: every row is extended left and
// right by replicating its edge pixel, then the (already widened) first and
// last rows are copied 32 times upwards and downwards.
// The vertical pass advances 16 bytes per step and stops only when the
// counter reaches exactly zero, so kiPicW + 64 must be a multiple of 16.
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
    push        {r4-r8}

    mov         r7, r0                  // r7 : first pixel of the current row
    add         r4, r0, r2
    sub         r4, r4, #1              // r4 : last pixel of the current row
    mov         r8, r3                  // r8 : rows left to process

    // ---- left / right borders, one row per iteration ----
_expand_picture_luma_pad_sides:
    add         r6, r4, #1              // right border starts just past the row
    sub         r5, r7, #32             // left border starts 32 bytes before it

    vld1.8      {d2[], d3[]}, [r4], r1  // broadcast right edge pixel, step to next row
    vld1.8      {d0[], d1[]}, [r7], r1  // broadcast left edge pixel, step to next row

    vst1.8      {q0}, [r5]!
    vst1.8      {q0}, [r5]
    vst1.8      {q1}, [r6]!
    vst1.8      {q1}, [r6]
    subs        r8, r8, #1
    bne         _expand_picture_luma_pad_sides

    // ---- top / bottom borders, one 16-byte column strip per iteration ----
    add         r2, r2, #64             // width including both side borders
    sub         r0, r0, #32             // start of the widened first row
    mla         r4, r1, r3, r0
    sub         r4, r4, r1              // start of the widened last row

_expand_picture_luma_next_strip:
    mov         r5, #32
    mls         r5, r5, r1, r0          // r5 = r0 - 32 * stride : topmost border row
    add         r6, r4, r1              // first border row below the picture
    vld1.8      {q0}, [r0]!
    vld1.8      {q1}, [r4]!

    mov         r8, #32                 // 32 border rows above and below
_expand_picture_luma_fill_strip:
    vst1.8      {q0}, [r5], r1
    vst1.8      {q1}, [r6], r1
    subs        r8, r8, #1
    bne         _expand_picture_luma_fill_strip

    subs        r2, r2, #16
    bne         _expand_picture_luma_next_strip

    pop         {r4-r8}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// void ExpandPictureChroma_neon(uint8_t *pDst, const int32_t kiStride,
//                               const int32_t kiPicW, const int32_t kiPicH);
//   r0 = pDst, r1 = kiStride, r2 = kiPicW, r3 = kiPicH
// Pads the plane with a 16-pixel border: every row is extended left and
// right by replicating its edge pixel, then the (already widened) first and
// last rows are copied 16 times upwards and downwards.
// The vertical pass advances 16 bytes per step and stops only when the
// counter reaches exactly zero, so kiPicW + 32 must be a multiple of 16.
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
    push        {r4-r8}

    mov         r7, r0                  // r7 : first pixel of the current row
    add         r4, r0, r2
    sub         r4, r4, #1              // r4 : last pixel of the current row
    mov         r8, r3                  // r8 : rows left to process

    // ---- left / right borders, one row per iteration ----
_expand_picture_chroma_pad_sides:
    add         r6, r4, #1              // right border starts just past the row
    sub         r5, r7, #16             // left border starts 16 bytes before it

    vld1.8      {d2[], d3[]}, [r4], r1  // broadcast right edge pixel, step to next row
    vld1.8      {d0[], d1[]}, [r7], r1  // broadcast left edge pixel, step to next row

    vst1.8      {q0}, [r5]
    vst1.8      {q1}, [r6]
    subs        r8, r8, #1
    bne         _expand_picture_chroma_pad_sides

    // ---- top / bottom borders, one 16-byte column strip per iteration ----
    add         r2, r2, #32             // width including both side borders
    sub         r0, r0, #16             // start of the widened first row
    mla         r4, r1, r3, r0
    sub         r4, r4, r1              // start of the widened last row

_expand_picture_chroma_next_strip:
    mov         r5, #16
    mls         r5, r5, r1, r0          // r5 = r0 - 16 * stride : topmost border row
    add         r6, r4, r1              // first border row below the picture
    vld1.8      {q0}, [r0]!
    vld1.8      {q1}, [r4]!

    mov         r8, #16                 // 16 border rows above and below
_expand_picture_chroma_fill_strip:
    vst1.8      {q0}, [r5], r1
    vst1.8      {q1}, [r6], r1
    subs        r8, r8, #1
    bne         _expand_picture_chroma_fill_strip

    subs        r2, r2, #16
    bne         _expand_picture_chroma_next_strip

    pop         {r4-r8}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -61,6 +61,10 @@ void ExpandPictureChromaUnalign_sse2 (uint8_t* pDst,
|
|||||||
const int32_t kiPicH);
|
const int32_t kiPicH);
|
||||||
#endif//X86_ASM
|
#endif//X86_ASM
|
||||||
|
|
||||||
|
#if defined(HAVE_NEON)
|
||||||
|
void ExpandPictureLuma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
|
||||||
|
void ExpandPictureChroma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
|
||||||
|
#endif
|
||||||
#if defined(__cplusplus)
|
#if defined(__cplusplus)
|
||||||
}
|
}
|
||||||
#endif//__cplusplus
|
#endif//__cplusplus
|
||||||
|
|||||||
BIN
codec/encoder/core/arm/.DS_Store
vendored
Normal file
BIN
codec/encoder/core/arm/.DS_Store
vendored
Normal file
Binary file not shown.
648
codec/encoder/core/arm/intra_pred_neon.S
Executable file
648
codec/encoder/core/arm/intra_pred_neon.S
Executable file
@@ -0,0 +1,648 @@
|
|||||||
|
/*!
|
||||||
|
* \copy
|
||||||
|
* Copyright (c) 2013, Cisco Systems
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
.text
|
||||||
|
|
||||||
|
#include "arm_arch_common_macro.S"
|
||||||
|
|
||||||
|
#ifdef APPLE_IOS
|
||||||
|
//Global macro
|
||||||
|
// GET_8BYTE_DATA (Apple assembler form, positional arguments)
//   $0 : d register that receives eight bytes, one per lane (lane 0 first)
//   $1 : address register; advanced by $2 after every byte
//   $2 : register holding the byte step between consecutive loads
// On exit $1 has been advanced by 8 * $2.
.macro GET_8BYTE_DATA
    vld1.8      {$0[0]}, [$1], $2
    vld1.8      {$0[1]}, [$1], $2
    vld1.8      {$0[2]}, [$1], $2
    vld1.8      {$0[3]}, [$1], $2
    vld1.8      {$0[4]}, [$1], $2
    vld1.8      {$0[5]}, [$1], $2
    vld1.8      {$0[6]}, [$1], $2
    vld1.8      {$0[7]}, [$1], $2
.endm
|
||||||
|
#else
|
||||||
|
//Global macro
|
||||||
|
// GET_8BYTE_DATA (GNU assembler form, named arguments)
//   arg0 : d register that receives eight bytes, one per lane (lane 0 first)
//   arg1 : address register; advanced by arg2 after every byte
//   arg2 : register holding the byte step between consecutive loads
// On exit arg1 has been advanced by 8 * arg2.
.macro GET_8BYTE_DATA arg0, arg1, arg2
    vld1.8      {\arg0[0]}, [\arg1], \arg2
    vld1.8      {\arg0[1]}, [\arg1], \arg2
    vld1.8      {\arg0[2]}, [\arg1], \arg2
    vld1.8      {\arg0[3]}, [\arg1], \arg2
    vld1.8      {\arg0[4]}, [\arg1], \arg2
    vld1.8      {\arg0[5]}, [\arg1], \arg2
    vld1.8      {\arg0[6]}, [\arg1], \arg2
    vld1.8      {\arg0[7]}, [\arg1], \arg2
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// enc_get_i16x16_luma_pred_v_neon
// Vertical 16x16 prediction: the 16 bytes located at r1 - r2 are copied into
// 16 consecutive 16-byte rows starting at r0.
//   r0 : destination, written contiguously (256 bytes)
//   r1 : address of the current block (presumably in the reference picture)
//   r2 : distance in bytes from the row above to the current row
// Clobbers r0, r3, q0 and the flags.
WELS_ASM_FUNC_BEGIN enc_get_i16x16_luma_pred_v_neon
    // q0 <- the line directly above the block
    sub         r3, r1, r2
    vldm        r3, {d0, d1}

    // 4 iterations x 4 rows = 16 rows
    mov         r3, #4
enc_i16x16_luma_pred_v_store_rows:
    vst1.8      {q0}, [r0]!
    vst1.8      {q0}, [r0]!
    vst1.8      {q0}, [r0]!
    vst1.8      {q0}, [r0]!
    subs        r3, r3, #1
    bne         enc_i16x16_luma_pred_v_store_rows
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// enc_get_i16x16_luma_pred_h_neon
// Horizontal 16x16 prediction: for each of the 16 rows, the byte immediately
// to the left of the block is replicated across a 16-byte output row.
//   r0 : destination, written contiguously (256 bytes)
//   r1 : address of the current block (presumably in the reference picture)
//   r2 : byte step between consecutive source rows
// Clobbers r0, r1, r3, q0-q3 and the flags.
WELS_ASM_FUNC_BEGIN enc_get_i16x16_luma_pred_h_neon
    sub         r1, r1, #1              // point at the column left of the block
    mov         r3, #4                  // 4 iterations x 4 rows = 16 rows

enc_i16x16_luma_pred_h_four_rows:
    // Broadcast the left neighbour of four consecutive rows
    vld1.8      {d0[], d1[]}, [r1], r2
    vld1.8      {d2[], d3[]}, [r1], r2
    vld1.8      {d4[], d5[]}, [r1], r2
    vld1.8      {d6[], d7[]}, [r1], r2

    // Emit the four predicted rows
    vst1.8      {q0}, [r0]!
    vst1.8      {q1}, [r0]!
    vst1.8      {q2}, [r0]!
    vst1.8      {q3}, [r0]!

    subs        r3, r3, #1
    bne         enc_i16x16_luma_pred_h_four_rows
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// enc_get_i16x16_luma_pred_dc_both_neon
// DC 16x16 prediction using both neighbours: the 16 bytes left of the block
// and the 16 bytes above it are summed, the sum is divided by 32 with
// rounding, and the resulting byte fills all 256 output bytes.
//   r0 : destination, written contiguously (256 bytes)
//   r1 : address of the current block (presumably in the reference picture)
//   r2 : byte step between consecutive source rows
// Clobbers r0, r3, q0, q1 and the flags.
WELS_ASM_FUNC_BEGIN enc_get_i16x16_luma_pred_dc_both_neon
    // q0 <- the 16 bytes of the column left of the block
    sub         r3, r1, #1
    GET_8BYTE_DATA d0, r3, r2
    GET_8BYTE_DATA d1, r3, r2

    // q1 <- the 16 bytes of the row above the block
    sub         r3, r1, r2
    vldm        r3, {d2, d3}

    // Horizontal reduction of all 32 bytes into the low lane of d0
    vpaddl.u8   q0, q0
    vpaddl.u8   q1, q1
    vadd.u16    q0, q0, q1
    vadd.u16    d0, d0, d1
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0

    // Rounded mean: (sum + 16) >> 5, then broadcast the byte to q0
    vrshr.u16   d0, d0, #5
    vdup.8      q0, d0[0]

    // 4 iterations x 4 rows = 16 rows
    mov         r3, #4
enc_i16x16_luma_pred_dc_both_store_rows:
    vst1.8      {q0}, [r0]!
    vst1.8      {q0}, [r0]!
    vst1.8      {q0}, [r0]!
    vst1.8      {q0}, [r0]!
    subs        r3, r3, #1
    bne         enc_i16x16_luma_pred_dc_both_store_rows
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// Gradient weights {8,7,6,5,4,3,2,1} * 5, one unsigned byte each.
// Forced to word alignment because the table is fetched with VLDR.
    .align 2
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14

// Column offsets {-7,-6,-5,-4,-3,-2,-1,0}, one signed byte each.
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd

WELS_ASM_FUNC_BEGIN enc_get_i16x16_luma_pred_plane_neon
    // 16x16 luma plane prediction.
    //   r0 = destination buffer, written as 16 consecutive 16-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r1, r3, q0-q3, q8, flags.
    // Only caller-saved NEON registers are used; d8-d15 must survive a call
    // under the ARM procedure call standard, so q5 is deliberately avoided.

    // d0 <- weight table {8..1} * 5
    adr         r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
    vldr        d0, [r3]

    // d1 <- top[-1] .. top[6]
    sub         r3, r1, r2
    sub         r1, r3, #1
    vld1.8      d1, [r1]

    // d2 <- top[8] .. top[15]
    add         r1, #9
    vld1.8      d2, [r1]

    // d6 <- top[15] broadcast (needed for the constant term)
    vdup.u8     d6, d2[7]

    // d4 <- left[-1] .. left[6]
    sub         r1, r3, #1
    GET_8BYTE_DATA d4, r1, r2

    // d3 <- left[8] .. left[15] (skip left[7])
    add         r1, r2
    GET_8BYTE_DATA d3, r1, r2

    // d7 <- left[15] broadcast (needed for the constant term)
    vdup.u8     d7, d3[7]

    // Reverse d2/d3 so the far samples pair up with the largest weights
    vrev64.8    q1, q1

    vsubl.u8    q2, d3, d4      // q2 = {left[15]-left[-1], left[14]-left[0], ..., left[8]-left[6]}
    vsubl.u8    q1, d2, d1      // q1 = {top[15]-top[-1],  top[14]-top[0],  ..., top[8]-top[6]}

    vmovl.u8    q0, d0
    vmul.s16    q1, q0, q1      // weighted horizontal differences
    vmul.s16    q2, q0, q2      // weighted vertical differences

    // Reduce each weighted vector to one sum: d0 = horizontal, d1 = vertical
    vpadd.s16   d0, d2, d3
    vpadd.s16   d1, d4, d5
    vpaddl.s16  q0, q0
    vpaddl.s32  q0, q0

    // b = (5*H + 32) >> 6, c = (5*V + 32) >> 6, each broadcast to 8 lanes
    vrshr.s64   q0, #6
    vdup.s16    q1, d0[0]       // q1 = b
    vdup.s16    q2, d1[0]       // q2 = c

    // d0 <- column offsets {-7..0}
    adr         r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
    vld1.32     {d0}, [r3]

    // q3 = a = 16 * (top[15] + left[15])
    vaddl.u8    q3, d6, d7
    vshl.u16    q3, #4

    // q3 = a + b*{-7..0} + c*(-7)   -> columns 0..7 of row 0
    vmovl.s8    q0, d0
    vmla.s16    q3, q0, q1
    vmla.s16    q3, q2, d0[0]

    // q8 = q3 + 8*b                 -> columns 8..15 of row 0
    vshl.s16    q8, q1, #3
    vadd.s16    q8, q3

    // Round, shift right by 5 and clip to 0..255
    vqrshrun.s16 d0, q3, #5
    vqrshrun.s16 d1, q8, #5

    // Row 0
    vst1.u32    {d0, d1}, [r0]!

    // Rows 1..15: each row adds c to the running values
    mov         r3, #15
loop_0_get_i16x16_luma_pred_plane:
    vadd.s16    q3, q2
    vadd.s16    q8, q2
    vqrshrun.s16 d0, q3, #5
    vqrshrun.s16 d1, q8, #5
    vst1.u32    {d0, d1}, [r0]!
    subs        r3, #1
    bne         loop_0_get_i16x16_luma_pred_plane

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_v_neon
    // 4x4 luma vertical prediction: the four samples above the block are
    // copied into each of the four output rows.
    //   r0 = destination buffer, written as 4 consecutive 4-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3.
    sub         r3, r1, r2                  // r3 -> row directly above the block
    ldr         r3, [r3]                    // r3 = the four top samples as one word

    str         r3, [r0], #4                // row 0
    str         r3, [r0], #4                // row 1
    str         r3, [r0], #4                // row 2
    str         r3, [r0]                    // row 3

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_h_neon
    // 4x4 luma horizontal prediction: each output row is filled with the
    // sample immediately to its left.
    //   r0 = destination buffer, written as 4 consecutive 4-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, d0-d3.
    sub         r3, r1, #1                  // r3 -> left-neighbour column

    // One broadcast register per row
    vld1.8      {d0[]}, [r3], r2
    vld1.8      {d1[]}, [r3], r2
    vld1.8      {d2[]}, [r3], r2
    vld1.8      {d3[]}, [r3]

    // Only the low 32 bits (4 samples) of each register are stored
    vst1.32     {d0[0]}, [r0]!
    vst1.32     {d1[0]}, [r0]!
    vst1.32     {d2[0]}, [r0]!
    vst1.32     {d3[0]}, [r0]

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_d_l_neon
    // 4x4 luma diagonal-down-left prediction from the eight samples t0..t7
    // located in the row above the block.
    //   r0 = destination buffer, written as 4 consecutive 4-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, q0-q2.
    sub         r3, r1, r2
    vld1.32     {d0}, [r3]                  // d0 = {t0..t7}

    // d1 = {t1..t7, t7}: neighbour-to-the-right with t7 repeated at the end
    vdup.8      d1, d0[7]
    vext.8      d1, d0, d1, #1

    // q1 = {t0+t1, t1+t2, ..., t6+t7, t7+t7}
    vaddl.u8    q1, d1, d0

    // q0[i] = q1[i] + q1[i-1] for i >= 1, i.e. t(i-1) + 2*t(i) + t(i+1);
    // lane 0 wraps around and is not used
    vext.8      q2, q1, q1, #14
    vadd.u16    q0, q1, q2

    // d0 = {x, ddl0, ddl1, ..., ddl6} after (v + 2) >> 2
    vqrshrn.u16 d0, q0, #2

    // Row 0: ddl0..ddl3
    vext.8      d1, d0, d0, #1
    vst1.32     {d1[0]}, [r0]!

    // Row 1: ddl1..ddl4
    vext.8      d1, d0, d0, #2
    vst1.32     {d1[0]}, [r0]!

    // Row 2: ddl2..ddl5
    vext.8      d1, d0, d0, #3
    vst1.32     {d1[0]}, [r0]!

    // Row 3: ddl3..ddl6 already sit in the upper word of d0
    vst1.32     {d0[1]}, [r0]

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_d_r_neon
    // 4x4 luma diagonal-down-right prediction.
    // Notation: T0..T3 = samples above, LT = above-left corner,
    //           L0..L3 = samples to the left (top to bottom).
    //   r0 = destination buffer, written as 4 consecutive 4-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, q0-q3.

    // Upper half of d0 <- T0..T3
    sub         r3, r1, r2
    vld1.32     {d0[1]}, [r3]

    // Walk down the left column starting at LT, filling d0 back to front
    sub         r3, #1
    vld1.8      {d0[3]}, [r3], r2       // LT
    vld1.8      {d0[2]}, [r3], r2       // L0
    vld1.8      {d0[1]}, [r3], r2       // L1
    vld1.8      {d0[0]}, [r3], r2       // L2
    vld1.8      {d1[7]}, [r3]           // L3, parked in the top byte of d1

    // d0 = {L2,L1,L0,LT,T0,T1,T2,T3}
    // d2 = {L3,L2,L1,L0,LT,T0,T1,T2}  (only byte 7 of d1 is consumed)
    vext.8      d2, d1, d0, #7

    // q2 = {L3+L2, L2+L1, L1+L0, L0+LT, LT+T0, T0+T1, T1+T2, T2+T3}
    vaddl.u8    q2, d2, d0

    // q3 = q2 rotated up by one lane. Both sources are q2 so that no
    // never-written register is read; lane 0 of the sum below is never
    // stored, so its value does not matter.
    vext.8      q3, q2, q2, #14
    vadd.u16    q1, q2, q3              // lanes 1..7 = three-tap sums a + 2b + c

    // (v + 2) >> 2 with narrowing; bytes 1..7 of d0 hold the filtered values
    vqrshrn.u16 d0, q1, #2

    // Each row is the upper word of d0; rotating d0 by one byte steps the
    // window one sample towards the bottom-left for the next row.
    vst1.32     d0[1], [r0]!
    vext.8      d0, d0, d0, #7
    vst1.32     d0[1], [r0]!
    vext.8      d0, d0, d0, #7
    vst1.32     d0[1], [r0]!
    vext.8      d0, d0, d0, #7
    vst1.32     d0[1], [r0]

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_v_l_neon
    // 4x4 luma vertical-left prediction from the eight samples t0..t7 in the
    // row above the block.
    //   r0 = destination buffer, written as 4 consecutive 4-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, q0-q2.
    sub         r3, r1, r2
    vld1.32     {d0}, [r3]                  // d0 = {t0..t7}

    // q1 = pairwise sums {t0+t1, t1+t2, ..., t6+t7, wrap}
    vext.8      d1, d0, d0, #1
    vaddl.u8    q1, d1, d0

    // q2 = three-tap sums {t0+2*t1+t2, t1+2*t2+t3, ...}; the top lanes wrap
    // around and are not used
    vext.8      q2, q1, q1, #2
    vadd.u16    q2, q1, q2

    // d0 = two-tap averages   (v + 1) >> 1
    vqrshrn.u16 d0, q1, #1
    // d1 = three-tap averages (v + 2) >> 2
    vqrshrn.u16 d1, q2, #2

    // Rows 0 and 1 start at sample 0; rows 2 and 3 start one sample later
    vst1.32     {d0[0]}, [r0]!
    vst1.32     {d1[0]}, [r0]!
    vext.8      d0, d0, d0, #1
    vext.8      d1, d1, d1, #1
    vst1.32     {d0[0]}, [r0]!
    vst1.32     {d1[0]}, [r0]

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_v_r_neon
    // 4x4 luma vertical-right prediction.
    // Notation: T0..T3 = samples above, LT = above-left corner,
    //           L0..L2 = samples to the left (top to bottom).
    //   r0 = destination buffer, written as 4 consecutive 4-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, q0-q2.

    // Upper half of d0 <- T0..T3
    sub         r3, r1, r2
    vld1.32     {d0[1]}, [r3]

    // Lower half of d0 <- LT, L0, L1, L2 stored back to front
    sub         r3, r3, #1
    vld1.8      {d0[3]}, [r3], r2
    vld1.8      {d0[2]}, [r3], r2
    vld1.8      {d0[1]}, [r3], r2
    vld1.8      {d0[0]}, [r3]               // d0 = {L2,L1,L0,LT,T0,T1,T2,T3}

    // q1 = {x, L2+L1, L1+L0, L0+LT, LT+T0, T0+T1, T1+T2, T2+T3}
    vext.8      d1, d0, d0, #7
    vaddl.u8    q1, d0, d1

    // q2 = sums of adjacent q1 lanes, i.e. three-tap a + 2b + c (low lanes unused)
    vext.8      q2, q1, q1, #14
    vadd.u16    q2, q2, q1

    vqrshrn.u16 d1, q2, #2                  // d1 = three-tap averages
    vqrshrn.u16 d0, q1, #1                  // d0 = two-tap averages

    // Row 0: two-tap averages, row 1: three-tap averages
    vst1.32     {d0[1]}, [r0]!
    vst1.32     {d1[1]}, [r0]!

    // Row 2: one three-tap value from the left edge, then row 0 shifted right
    vst1.8      {d1[3]}, [r0]!
    vst1.16     {d0[2]}, [r0]!
    vst1.8      {d0[6]}, [r0]!

    // Row 3: one three-tap value from the left edge, then row 1 shifted right
    vst1.8      {d1[2]}, [r0]!
    vst1.16     {d1[2]}, [r0]!
    vst1.8      {d1[6]}, [r0]
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_h_u_neon
    // 4x4 luma horizontal-up prediction from the left column L0..L3.
    //   r0 = destination buffer, written as 4 consecutive 4-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r1, r3, q0-q2.
    sub         r3, r1, #1                  // r3 -> L0

    // r1 -> L3 (three rows further down)
    mov         r1, #3
    mul         r1, r2
    add         r1, r1, r3

    vld1.8      {d0[]}, [r1]                // fill d0 with L3
    vld1.8      {d0[4]}, [r3], r2
    vld1.8      {d0[5]}, [r3], r2
    vld1.8      {d0[6]}, [r3], r2           // d0 = {L3,L3,L3,L3,L0,L1,L2,L3}

    // q2 = {2*L3, 2*L3, 2*L3, L3+L0, L0+L1, L1+L2, L2+L3, 2*L3}
    vext.8      d1, d0, d0, #1
    vaddl.u8    q2, d0, d1

    // d3 = {L0+2*L1+L2, L1+2*L2+L3, L2+3*L3, 4*L3}
    vext.8      d2, d5, d4, #2
    vadd.u16    d3, d2, d5

    // d2 is reused here on purpose: from this point on only the d3 half of
    // q1 carries meaningful data, and only bytes 4..7 of d1/d2 are used.
    vqrshrn.u16 d2, q2, #1                  // two-tap averages
    vqrshrn.u16 d1, q1, #2                  // three-tap averages

    // Interleave two-tap and three-tap results into output order
    vzip.8      d2, d1
    vst1.32     {d1[0]}, [r0]!              // row 0
    vext.8      d2, d1, d1, #2
    vst1.32     {d2[0]}, [r0]!              // row 1
    vst1.32     {d1[1]}, [r0]!              // row 2
    vst1.32     {d0[0]}, [r0]               // row 3 = L3 four times

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_h_d_neon
    // 4x4 luma horizontal-down prediction.
    // Notation: T0..T2 = samples above, LT = above-left corner,
    //           L0..L3 = samples to the left (top to bottom).
    //   r0 = destination buffer, written as 4 consecutive 4-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, q0-q3.

    // r3 -> LT; the first load takes LT,T0,T1,T2 and steps down to L0
    sub         r3, r1, r2
    sub         r3, r3, #1
    vld1.32     {d0[1]}, [r3], r2
    vld1.8      {d0[3]}, [r3], r2
    vld1.8      {d0[2]}, [r3], r2
    vld1.8      {d0[1]}, [r3], r2
    vld1.8      {d0[0]}, [r3]               // d0 = {L3,L2,L1,L0,LT,T0,T1,T2}

    // q1 = {x, L3+L2, L2+L1, L1+L0, L0+LT, LT+T0, T0+T1, T1+T2}
    vext.8      d1, d0, d0, #7
    vaddl.u8    q1, d0, d1

    // q2 = q1 rotated up by one lane; lanes 2..7 = {L3+L2, ..., T0+T1}
    vext.8      q2, q1, q1, #14
    // q3 lanes 2..7 = three-tap sums {L3+2*L2+L1, ..., T0+2*T1+T2}
    vadd.u16    q3, q2, q1

    vqrshrn.u16 d1, q3, #2                  // three-tap averages
    vqrshrn.u16 d0, q2, #1                  // two-tap averages

    // Shuffle (two-tap, three-tap) byte pairs into the order each row needs
    vmov        d3, d1
    vtrn.8      d0, d1
    vext.8      d2, d1, d1, #6
    vst2.16     {d2[3], d3[3]}, [r0]!       // row 0
    vst2.16     {d0[2], d1[2]}, [r0]!       // row 1
    vmov        d3, d0
    vst2.16     {d2[2], d3[2]}, [r0]!       // row 2
    vst2.16     {d0[1], d1[1]}, [r0]        // row 3

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i_chroma_pred_v_neon
    // 8x8 chroma vertical prediction: the eight samples above the block are
    // copied into each of the eight output rows.
    //   r0 = destination buffer, written as 8 consecutive 8-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, d0.
    sub         r3, r1, r2                  // r3 -> row directly above the block
    vldr        d0, [r3]                    // d0 = the eight top samples

    vst1.8      {d0}, [r0]!                 // rows 0..6 advance r0 by 8 each
    vst1.8      {d0}, [r0]!
    vst1.8      {d0}, [r0]!
    vst1.8      {d0}, [r0]!
    vst1.8      {d0}, [r0]!
    vst1.8      {d0}, [r0]!
    vst1.8      {d0}, [r0]!
    vst1.8      {d0}, [r0]                  // row 7

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i_chroma_pred_h_neon
    // 8x8 chroma horizontal prediction: each output row is filled with the
    // sample immediately to its left.
    //   r0 = destination buffer, written as 8 consecutive 8-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, d0-d7.
    sub         r3, r1, #1                  // r3 -> left-neighbour column

    // One broadcast D register per row
    vld1.8      {d0[]}, [r3], r2
    vld1.8      {d1[]}, [r3], r2
    vld1.8      {d2[]}, [r3], r2
    vld1.8      {d3[]}, [r3], r2
    vld1.8      {d4[]}, [r3], r2
    vld1.8      {d5[]}, [r3], r2
    vld1.8      {d6[]}, [r3], r2
    vld1.8      {d7[]}, [r3]

    // Write the eight rows back to back
    vst1.8      {d0}, [r0]!
    vst1.8      {d1}, [r0]!
    vst1.8      {d2}, [r0]!
    vst1.8      {d3}, [r0]!
    vst1.8      {d4}, [r0]!
    vst1.8      {d5}, [r0]!
    vst1.8      {d6}, [r0]!
    vst1.8      {d7}, [r0]

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN enc_get_i_chroma_pred_dc_both_neon
    // 8x8 chroma DC prediction with both neighbours present. Each 4x4
    // quadrant gets its own mean:
    //   top-left     : 4 top + 4 left samples
    //   top-right    : the 4 top samples above it
    //   bottom-left  : the 4 left samples beside it
    //   bottom-right : 4 top + 4 left samples
    //   r0 = destination buffer, written as 8 consecutive 8-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, q0-q3.

    // d0 <- the 8 left neighbours
    sub         r3, r1, #1
    GET_8BYTE_DATA d0, r3, r2

    // d1 <- the 8 top neighbours
    sub         r3, r1, r2
    vldr        d1, [r3]

    // Sum groups of four: d0 = {left 0-3, left 4-7}, d1 = {top 0-3, top 4-7}
    vpaddl.u8   q0, q0
    vpaddl.u16  q0, q0
    vadd.u32    d2, d0, d1                  // d2 = {left0-3 + top0-3, left4-7 + top4-7}

    vrshr.u32   q0, q0, #2                  // four-sample means
    vrshr.u32   d2, d2, #3                  // eight-sample means

    // Broadcast one mean per quadrant
    vdup.8      d4, d2[0]                   // top-left
    vdup.8      d5, d1[4]                   // top-right
    vdup.8      d6, d0[4]                   // bottom-left
    vdup.8      d7, d2[4]                   // bottom-right

    // Rows 0..3: 4 bytes of the left quadrant followed by 4 of the right one
    vst2.32     {d4[0], d5[0]}, [r0]!
    vst2.32     {d4[0], d5[0]}, [r0]!
    vst2.32     {d4[0], d5[0]}, [r0]!
    vst2.32     {d4[0], d5[0]}, [r0]!
    // Rows 4..7
    vst2.32     {d6[0], d7[0]}, [r0]!
    vst2.32     {d6[0], d7[0]}, [r0]!
    vst2.32     {d6[0], d7[0]}, [r0]!
    vst2.32     {d6[0], d7[0]}, [r0]

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// Gradient weights {1,2,3,4, 1,2,3,4} * 17, one unsigned byte each.
CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211
// Column offsets {-3,-2,-1,0,1,2,3,4}, one signed halfword each.
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003

WELS_ASM_FUNC_BEGIN enc_get_i_chroma_pred_plane_neon
    // 8x8 chroma plane prediction.
    // Notation: T0..T7 = samples above, LT = above-left corner,
    //           L0..L7 = samples to the left (top to bottom).
    //   r0 = destination buffer, written as 8 consecutive 8-byte rows
    //   r1 = address of the top-left sample of the block in the source plane
    //   r2 = stride of the source plane in bytes
    // Scratch: r3, q0-q3, flags.

    // Low word of d1 <- LT,T0,T1,T2; low word of d0 <- T4..T7
    sub         r3, r1, #1
    sub         r3, r3, r2                  // r3 -> LT
    vld1.32     {d1[0]}, [r3]
    add         r3, r3, #5
    vld1.32     {d0[0]}, [r3]

    // High word of d1 <- LT,L0,L1,L2 (walking down from LT)
    sub         r3, r3, #5
    vld1.8      {d1[4]}, [r3], r2
    vld1.8      {d1[5]}, [r3], r2
    vld1.8      {d1[6]}, [r3], r2
    vld1.8      {d1[7]}, [r3], r2           // d1 = {LT,T0,T1,T2, LT,L0,L1,L2}

    // Skip L3, then high word of d0 <- L4..L7
    add         r3, r3, r2
    vld1.8      {d0[4]}, [r3], r2
    vld1.8      {d0[5]}, [r3], r2
    vld1.8      {d0[6]}, [r3], r2
    vld1.8      {d0[7]}, [r3]               // d0 = {T4,T5,T6,T7, L4,L5,L6,L7}

    // Keep T7 and L7 for the constant term
    vdup.8      d3, d0[3]
    vdup.8      d4, d0[7]

    // q2 = a = 16 * (T7 + L7) in every lane
    vaddl.u8    q2, d3, d4
    vshl.u16    q2, q2, #4

    // d2 <- weight table
    adr         r3, CONST0_GET_I_CHROMA_PRED_PLANE
    vld1.32     {d2}, [r3]

    // Mirror the near samples so they pair with the far ones, then form the
    // weighted difference sums: d0 -> horizontal, d1 -> vertical
    vrev32.8    d1, d1
    vsubl.u8    q0, d0, d1
    vmovl.u8    q1, d2
    vmul.s16    q0, q0, q1
    vpaddl.s16  q0, q0
    vpaddl.s32  q0, q0
    vrshr.s64   q0, q0, #5                  // b in d0, c in d1: (17*sum + 16) >> 5

    // q3 <- column offsets {-3..4}
    adr         r3, CONST1_GET_I_CHROMA_PRED_PLANE
    vld1.32     {d6, d7}, [r3]

    // Broadcast c into q1 and b into q0
    vdup.16     q1, d1[0]
    vdup.16     q0, d0[0]

    // Row 0: (a + b*(x - 3) + c*(-3) + 16) >> 5, clipped to 0..255
    vmla.s16    q2, q0, q3
    vmla.s16    q2, q1, d6[0]
    vqrshrun.s16 d0, q2, #5
    vst1.32     {d0}, [r0]!

    // Rows 1..7: each row adds c to the running values
    mov         r3, #7
i_chroma_pred_plane_next_row:
    vadd.s16    q2, q2, q1
    vqrshrun.s16 d0, q2, #5
    vst1.32     {d0}, [r0]!
    subs        r3, r3, #1
    bne         i_chroma_pred_plane_next_row

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#endif
|
||||||
793
codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
Executable file
793
codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
Executable file
@@ -0,0 +1,793 @@
|
|||||||
|
/*!
|
||||||
|
* \copy
|
||||||
|
* Copyright (c) 2013, Cisco Systems
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
.text
|
||||||
|
#include "arm_arch_common_macro.S"
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef APPLE_IOS
|
||||||
|
//The data sequence will be used
|
||||||
|
.macro GET_8BYTE_DATA_L0
    // Gather eight bytes that are one stride apart into a D register.
    //   $0 = destination D register (lane 0 receives the first byte)
    //   $1 = address register; post-incremented by $2 after every load,
    //        so it ends up eight strides further on
    //   $2 = stride register
    vld1.8      {$0[0]}, [$1], $2
    vld1.8      {$0[1]}, [$1], $2
    vld1.8      {$0[2]}, [$1], $2
    vld1.8      {$0[3]}, [$1], $2
    vld1.8      {$0[4]}, [$1], $2
    vld1.8      {$0[5]}, [$1], $2
    vld1.8      {$0[6]}, [$1], $2
    vld1.8      {$0[7]}, [$1], $2
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro HDM_TRANSFORM_4X4_L0
    // Transform one 4x4 block of source samples and add its cost against
    // three pre-transformed predictions to three running accumulators.
    //   $0, $1 = D registers holding the 16 source bytes of the block
    //   $2     = transformed top-row terms    (vertical prediction)
    //   $3     = transformed left-column terms (horizontal prediction)
    //   $4     = DC term                       (DC prediction)
    //   $5     = accumulator for the vertical cost
    //   $6     = accumulator for the horizontal cost
    //   $7     = accumulator for the DC cost
    //   $8     = a D register that holds zero
    // Scratch: q0-q2 (d0-d5).

    // Vertical butterfly pass
    vaddl.u8    q0, $0, $1                  // {0,4,8,12,1,5,9,13}
    vsubl.u8    q1, $0, $1                  // {2,6,10,14,3,7,11,15}
    vswp        d1, d2
    vadd.s16    q2, q0, q1                  // {0,1,2,3,4,5,6,7}
    vsub.s16    q1, q0, q1                  // {12,13,14,15,8,9,10,11}

    // Horizontal butterfly pass
    vtrn.32     q2, q1
    vadd.s16    q0, q2, q1
    vsub.s16    q1, q2, q1

    vtrn.16     q0, q1
    vadd.s16    q2, q0, q1
    vsub.s16    q1, q0, q1

    vmov.s16    d0, d4
    vmov.s16    d1, d2

    vabs.s16    d3, d3

    // Vertical-prediction cost -> $5
    vtrn.32     d0, d1                      // {0,1,3,2}
    vaba.s16    $5, d0, $2
    vaba.s16    $5, d1, $8
    vaba.s16    $5, d5, $8
    vadd.u16    $5, d3

    // Horizontal-prediction cost -> $6
    vtrn.16     d4, d5                      // {0,4,12,8}
    vaba.s16    $6, d4, $3
    vabs.s16    d2, d2
    vabs.s16    d5, d5
    vadd.u16    d2, d3
    vadd.u16    d2, d5
    vadd.u16    $6, d2

    // DC-prediction cost -> $7
    vaba.s16    $7, d4, $4
    vadd.u16    $7, d2

.endm
|
||||||
|
|
||||||
|
#else
|
||||||
|
//The data sequence will be used
|
||||||
|
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
    // Gather eight bytes that are one stride apart into a D register.
    //   arg0 = destination D register (lane 0 receives the first byte)
    //   arg1 = address register; post-incremented by arg2 after every load,
    //          so it ends up eight strides further on
    //   arg2 = stride register
    vld1.8      {\arg0[0]}, [\arg1], \arg2
    vld1.8      {\arg0[1]}, [\arg1], \arg2
    vld1.8      {\arg0[2]}, [\arg1], \arg2
    vld1.8      {\arg0[3]}, [\arg1], \arg2
    vld1.8      {\arg0[4]}, [\arg1], \arg2
    vld1.8      {\arg0[5]}, [\arg1], \arg2
    vld1.8      {\arg0[6]}, [\arg1], \arg2
    vld1.8      {\arg0[7]}, [\arg1], \arg2
.endm
|
||||||
|
|
||||||
|
.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2,arg3, arg4, arg5, arg6, arg7, arg8
    // Transform one 4x4 block of source samples and add its cost against
    // three pre-transformed predictions to three running accumulators.
    //   arg0, arg1 = D registers holding the 16 source bytes of the block
    //   arg2       = transformed top-row terms    (vertical prediction)
    //   arg3       = transformed left-column terms (horizontal prediction)
    //   arg4       = DC term                       (DC prediction)
    //   arg5       = accumulator for the vertical cost
    //   arg6       = accumulator for the horizontal cost
    //   arg7       = accumulator for the DC cost
    //   arg8       = a D register that holds zero
    // Scratch: q0-q2 (d0-d5).

    // Vertical butterfly pass
    vaddl.u8    q0, \arg0, \arg1            // {0,4,8,12,1,5,9,13}
    vsubl.u8    q1, \arg0, \arg1            // {2,6,10,14,3,7,11,15}
    vswp        d1, d2
    vadd.s16    q2, q0, q1                  // {0,1,2,3,4,5,6,7}
    vsub.s16    q1, q0, q1                  // {12,13,14,15,8,9,10,11}

    // Horizontal butterfly pass
    vtrn.32     q2, q1
    vadd.s16    q0, q2, q1
    vsub.s16    q1, q2, q1

    vtrn.16     q0, q1
    vadd.s16    q2, q0, q1
    vsub.s16    q1, q0, q1

    vmov.s16    d0, d4
    vmov.s16    d1, d2

    vabs.s16    d3, d3

    // Vertical-prediction cost -> arg5
    vtrn.32     d0, d1                      // {0,1,3,2}
    vaba.s16    \arg5, d0, \arg2
    vaba.s16    \arg5, d1, \arg8
    vaba.s16    \arg5, d5, \arg8
    vadd.u16    \arg5, d3

    // Horizontal-prediction cost -> arg6
    vtrn.16     d4, d5                      // {0,4,12,8}
    vaba.s16    \arg6, d4, \arg3
    vabs.s16    d2, d2
    vabs.s16    d5, d5
    vadd.u16    d2, d3
    vadd.u16    d2, d5
    vadd.u16    \arg6, d2

    // DC-prediction cost -> arg7
    vaba.s16    \arg7, d4, \arg4
    vadd.u16    \arg7, d2
.endm
|
||||||
|
#endif
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
    // Evaluates the transformed-difference cost of a 16x16 block against the
    // vertical, horizontal and DC intra predictions in one pass and picks the
    // cheapest one.
    //   r0        = address of the top-left sample of the block in the plane
    //               that supplies the neighbouring samples
    //   r1        = stride of that plane
    //   r2        = address of the source block being encoded
    //   r3        = stride of the source block
    //   [sp, #0]  = address that receives the best mode (0 = V, 1 = H, 2 = DC)
    //   [sp, #4]  = lambda; 2 * lambda is added to the H and DC costs
    // On exit r0 holds the lowest cost (presumably the return value - confirm
    // against the C prototype).
    // q4-q7 are callee-saved under the ARM procedure call standard, so they
    // are preserved around the body.
    stmdb sp!, {r4-r7, lr}
    vpush {q4-q7}

    //Get the top line data to 'q15'(16 bytes)
    sub r7, r0, r1
    vld1.8 {q15}, [r7]

    //Get the left column data to 'q14' (16 bytes)
    sub r7, r0, #1
    GET_8BYTE_DATA_L0 d28, r7, r1
    GET_8BYTE_DATA_L0 d29, r7, r1

    //Sum the 32 neighbours; the mean goes to lane 0 of d27 (d26 is left unused)
    vaddl.u8 q0, d30, d31
    vaddl.u8 q1, d28, d29
    vadd.u16 q0, q1
    vadd.u16 d0, d1
    vpaddl.u16 d0, d0
    vpaddl.u32 d0, d0

    //mean = (sum + 16) >> 5, then scaled by 16 to match the transform domain
    vrshr.u16 d0, #5
    vshl.u16 d27, d0, #4

    //Transform the top row (scaled by 4) for the V mode, result in q12/q11
    vshll.u8 q0, d30, #2
    vshll.u8 q1, d31, #2
    vtrn.32 q0, q1
    vadd.s16 q2, q0, q1
    vsub.s16 q1, q0, q1
    vtrn.16 q2, q1
    vadd.s16 q12, q2, q1
    vsub.s16 q11, q2, q1
    vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12
                     //{8,9,11,10, 12,13,15,14} q11

    //Transform the left column (scaled by 4) for the H mode, result in q10/q9
    vshll.u8 q0, d28, #2
    vshll.u8 q1, d29, #2
    vtrn.32 q0, q1
    vadd.s16 q2, q0, q1
    vsub.s16 q1, q0, q1
    vtrn.16 q2, q1
    vadd.s16 q10, q2, q1
    vsub.s16 q9, q2, q1
    vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
                    //{8,9,11,10, 12,13,15,14} q9

    vmov.i32 d17, #0 //accumulator: DC cost
    vmov.i32 d16, #0 //accumulator: H cost
    vmov.i32 d15, #0 //accumulator: V cost
    vmov.i32 d14, #0 //constant zero register

    //Source rows 0..3 into q3-q6, regrouped so each D pair holds one 4x4 block
    vld1.32 {q3}, [r2], r3
    vld1.32 {q4}, [r2], r3
    vld1.32 {q5}, [r2], r3
    vld1.32 {q6}, [r2], r3
    vtrn.32 q3, q4
    vtrn.32 q5, q6

    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14

    //Source rows 4..7
    vld1.32 {q3}, [r2], r3
    vld1.32 {q4}, [r2], r3
    vld1.32 {q5}, [r2], r3
    vld1.32 {q6}, [r2], r3
    vtrn.32 q3, q4
    vtrn.32 q5, q6

    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14

    //Source rows 8..11
    vld1.32 {q3}, [r2], r3
    vld1.32 {q4}, [r2], r3
    vld1.32 {q5}, [r2], r3
    vld1.32 {q6}, [r2], r3
    vtrn.32 q3, q4
    vtrn.32 q5, q6

    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14

    //Source rows 12..15
    vld1.32 {q3}, [r2], r3
    vld1.32 {q4}, [r2], r3
    vld1.32 {q5}, [r2], r3
    vld1.32 {q6}, [r2], r3
    vtrn.32 q3, q4
    vtrn.32 q5, q6

    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14

    //Stack arguments sit above 20 bytes of core registers + 64 bytes of q4-q7
    ldr r5, [sp, #84] //the addr of Best_mode
    ldr r6, [sp, #88] //the value of i_lambda

    //V cost -> r0
    vrshr.u16 d15, #1
    vpaddl.u16 d15, d15
    vpaddl.u32 d15, d15
    vmov.u32 r0, d15[0]

    //H cost + 2*lambda -> r1
    vrshr.u16 d16, #1
    vpaddl.u16 d16, d16
    vpaddl.u32 d16, d16
    vmov.u32 r1, d16[0]
    add r1, r6, lsl #1

    //DC cost + 2*lambda -> r2
    vrshr.u16 d17, #1
    vpaddl.u16 d17, d17
    vpaddl.u32 d17, d17
    vmov.u32 r2, d17[0]
    add r2, r6, lsl #1

    //Pick the minimum; ties keep the lower mode index
    mov r4, #0
    cmp r1, r0
    movcc r0, r1
    movcc r4, #1
    cmp r2, r0
    movcc r0, r2
    movcc r4, #2

    str r4, [r5]

    vpop {q4-q7}
    ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon
    // Evaluates the sum of absolute differences of a 16x16 block against the
    // vertical, horizontal and DC intra predictions in one pass and picks the
    // cheapest one.
    //   r0        = address of the top-left sample of the block in the plane
    //               that supplies the neighbouring samples
    //   r1        = stride of that plane
    //   r2        = address of the source block being encoded
    //   r3        = stride of the source block
    //   [sp, #0]  = address that receives the best mode (0 = V, 1 = H, 2 = DC)
    //   [sp, #4]  = lambda; 2 * lambda is added to the H and DC costs
    // On exit r0 holds the lowest cost.
    stmdb       sp!, {r4-r7, lr}

    // q15 <- the 16 samples above the block
    sub         r4, r0, r1
    vld1.8      {q15}, [r4]

    // q14 <- the 16 samples to the left of the block
    sub         r4, r0, #1
    GET_8BYTE_DATA_L0 d28, r4, r1
    GET_8BYTE_DATA_L0 d29, r4, r1

    // Sum all 32 neighbours into the low lane of d0
    vaddl.u8    q0, d30, d31
    vaddl.u8    q1, d28, d29
    vadd.u16    q0, q0, q1
    vadd.u16    d0, d0, d1
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0

    // q13 <- mean = (sum + 16) >> 5 replicated into every byte
    vrshr.u16   d0, d0, #5
    vdup.8      q13, d0[0]

    // Rewind r4 to the first left neighbour for the per-row loop
    sub         r4, r0, #1

    vmov.i32    q12, #0                     // accumulator: V cost (top row)
    vmov.i32    q11, #0                     // accumulator: H cost (left column)
    vmov.i32    q10, #0                     // accumulator: DC cost (mean)

    mov         lr, #16                     // one iteration per row
sad_intra_16x16_x3_opt_next_row:
    // d0 <- left neighbour of this row, replicated 8 times
    vld1.8      {d0[]}, [r4], r1

    // q1 <- the 16 source samples of this row
    vld1.8      {q1}, [r2], r3

    subs        lr, lr, #1                  // flags are consumed by the bne below

    // |top - source|
    vabal.u8    q12, d30, d2
    vabal.u8    q12, d31, d3

    // |left - source|
    vabal.u8    q11, d0, d2
    vabal.u8    q11, d0, d3

    // |mean - source|
    vabal.u8    q10, d26, d2
    vabal.u8    q10, d26, d3

    bne         sad_intra_16x16_x3_opt_next_row

    // Stack arguments sit above the 20 bytes pushed on entry
    ldr         r5, [sp, #20]               // address for the best mode
    ldr         r6, [sp, #24]               // lambda

    // V cost -> r0
    vadd.u16    d24, d24, d25
    vpaddl.u16  d24, d24
    vpaddl.u32  d24, d24
    vmov.32     r0, d24[0]

    // H cost + 2*lambda -> r1
    vadd.u16    d22, d22, d23
    vpaddl.u16  d22, d22
    vpaddl.u32  d22, d22
    vmov.32     r1, d22[0]
    add         r1, r1, r6, lsl #1

    // DC cost + 2*lambda -> r2
    vadd.u16    d20, d20, d21
    vpaddl.u16  d20, d20
    vpaddl.u32  d20, d20
    vmov.32     r2, d20[0]
    add         r2, r2, r6, lsl #1

    // Pick the minimum; ties keep the lower mode index
    mov         r4, #0
    cmp         r1, r0
    movcc       r0, r1
    movcc       r4, #1
    cmp         r2, r0
    movcc       r0, r2
    movcc       r4, #2

    str         r4, [r5]

    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon
|
||||||
|
stmdb sp!, {r4-r7, lr}
|
||||||
|
|
||||||
|
//Get the data from stack
|
||||||
|
ldr r4, [sp, #32] //p_dec_cr
|
||||||
|
ldr r5, [sp, #36] //p_enc_cr
|
||||||
|
|
||||||
|
//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
|
||||||
|
sub r6, r0, #1
|
||||||
|
GET_8BYTE_DATA_L0 d28, r6, r1
|
||||||
|
sub r6, r4, #1
|
||||||
|
GET_8BYTE_DATA_L0 d30, r6, r1
|
||||||
|
|
||||||
|
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
|
||||||
|
sub r6, r0, r1
|
||||||
|
vld1.8 {d29}, [r6]
|
||||||
|
sub r6, r4, r1
|
||||||
|
vld1.8 {d31}, [r6]
|
||||||
|
|
||||||
|
//Calculate the sum of left column and top row
|
||||||
|
vmov.i32 q0, q14
|
||||||
|
vpaddl.u8 q0, q0
|
||||||
|
vpaddl.u16 q0, q0
|
||||||
|
vadd.u32 d2, d0, d1 //'m1' save to d2
|
||||||
|
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
|
||||||
|
vrshr.u32 d2, d2, #3 //calculate 'm4'
|
||||||
|
|
||||||
|
//duplicate the 'mx' to a vector line
|
||||||
|
vdup.8 d27, d2[0]
|
||||||
|
vdup.8 d26, d1[4]
|
||||||
|
vtrn.32 d27, d26
|
||||||
|
|
||||||
|
vdup.8 d26, d0[4]
|
||||||
|
vdup.8 d25, d2[4]
|
||||||
|
vtrn.32 d26, d25 //Save to "d27, d26"
|
||||||
|
|
||||||
|
vmov.i32 q0, q15
|
||||||
|
vpaddl.u8 q0, q0
|
||||||
|
vpaddl.u16 q0, q0
|
||||||
|
vadd.u32 d2, d0, d1 //'m1' save to d2
|
||||||
|
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
|
||||||
|
vrshr.u32 d2, d2, #3 //calculate 'm4'
|
||||||
|
|
||||||
|
//duplicate the 'mx' to a vector line
|
||||||
|
vdup.8 d25, d2[0]
|
||||||
|
vdup.8 d24, d1[4]
|
||||||
|
vtrn.32 d25, d24
|
||||||
|
|
||||||
|
vdup.8 d24, d0[4]
|
||||||
|
vdup.8 d23, d2[4]
|
||||||
|
vtrn.32 d24, d23 //Save to "d25, d24"
|
||||||
|
|
||||||
|
vmov.i32 q11, #0//Save the SATD of DC_BOTH
|
||||||
|
vmov.i32 q10, #0//Save the SATD of H
|
||||||
|
vmov.i32 q9 , #0//Save the SATD of V
|
||||||
|
sub r6, r0, #1
|
||||||
|
sub r7, r4, #1
|
||||||
|
mov lr, #4
|
||||||
|
sad_intra_8x8_x3_opt_loop0:
|
||||||
|
|
||||||
|
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
|
||||||
|
vld1.8 {d0}, [r2], r3
|
||||||
|
vld1.8 {d1}, [r5], r3
|
||||||
|
|
||||||
|
//Get the left colume data to 'd0' (16 bytes)
|
||||||
|
vld1.8 {d2[]}, [r6], r1
|
||||||
|
vld1.8 {d3[]}, [r7], r1
|
||||||
|
|
||||||
|
subs lr, #1
|
||||||
|
|
||||||
|
|
||||||
|
//Do the SAD for top colume
|
||||||
|
vabal.u8 q11, d29, d0
|
||||||
|
vabal.u8 q11, d31, d1
|
||||||
|
|
||||||
|
//Do the SAD for left colume
|
||||||
|
vabal.u8 q10, d2, d0
|
||||||
|
vabal.u8 q10, d3, d1
|
||||||
|
|
||||||
|
//Do the SAD for mean value
|
||||||
|
vabal.u8 q9, d27, d0
|
||||||
|
vabal.u8 q9, d25, d1
|
||||||
|
|
||||||
|
|
||||||
|
bne sad_intra_8x8_x3_opt_loop0
|
||||||
|
|
||||||
|
mov lr, #4
|
||||||
|
sad_intra_8x8_x3_opt_loop1:
|
||||||
|
|
||||||
|
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
|
||||||
|
vld1.8 {d0}, [r2], r3
|
||||||
|
vld1.8 {d1}, [r5], r3
|
||||||
|
|
||||||
|
//Get the left colume data to 'd0' (16 bytes)
|
||||||
|
vld1.8 {d2[]}, [r6], r1
|
||||||
|
vld1.8 {d3[]}, [r7], r1
|
||||||
|
|
||||||
|
subs lr, #1
|
||||||
|
|
||||||
|
|
||||||
|
//Do the SAD for top colume
|
||||||
|
vabal.u8 q11, d29, d0
|
||||||
|
vabal.u8 q11, d31, d1
|
||||||
|
|
||||||
|
//Do the SAD for left colume
|
||||||
|
vabal.u8 q10, d2, d0
|
||||||
|
vabal.u8 q10, d3, d1
|
||||||
|
|
||||||
|
//Do the SAD for mean value
|
||||||
|
vabal.u8 q9, d26, d0
|
||||||
|
vabal.u8 q9, d24, d1
|
||||||
|
|
||||||
|
|
||||||
|
bne sad_intra_8x8_x3_opt_loop1
|
||||||
|
//Get the data from stack
|
||||||
|
ldr r5, [sp, #20] //the addr of Best_mode
|
||||||
|
ldr r6, [sp, #24] //the value of i_lambda
|
||||||
|
|
||||||
|
vadd.u16 d22, d23
|
||||||
|
vpaddl.u16 d22, d22
|
||||||
|
vpaddl.u32 d22, d22
|
||||||
|
vmov.u32 r0, d22[0]
|
||||||
|
add r0, r6, lsl #1
|
||||||
|
|
||||||
|
vadd.u16 d20, d21
|
||||||
|
vpaddl.u16 d20, d20
|
||||||
|
vpaddl.u32 d20, d20
|
||||||
|
vmov.u32 r1, d20[0]
|
||||||
|
add r1, r6, lsl #1
|
||||||
|
|
||||||
|
vadd.u16 d18, d19
|
||||||
|
vpaddl.u16 d18, d18
|
||||||
|
vpaddl.u32 d18, d18
|
||||||
|
vmov.u32 r2, d18[0]
|
||||||
|
|
||||||
|
mov r4, #2
|
||||||
|
cmp r1, r0
|
||||||
|
movcc r0, r1
|
||||||
|
movcc r4, #1
|
||||||
|
cmp r2, r0
|
||||||
|
movcc r0, r2
|
||||||
|
movcc r4, #0
|
||||||
|
|
||||||
|
str r4, [r5]
|
||||||
|
|
||||||
|
ldmia sp!, {r4-r7, lr}
|
||||||
|
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
    // Scores three 8x8 chroma intra candidates (labelled V, H and DC_BOTH by
    // the accumulator comments below) over the Cb and Cr planes, stores the
    // index of the cheapest one and returns its cost.
    //   r0 = p_dec (cb)   r1 = stride used to reach the top/left neighbours
    //   r2 = p_enc (cb)   r3 = stride used to walk the p_enc rows (cb and cr)
    //   stack: address of best_mode, i_lambda, <unused here>, p_dec_cr, p_enc_cr
    //   returns r0 = lowest of the three costs
    // NOTE(review): HDM_TRANSFORM_4X4_L0 and GET_8BYTE_DATA_L0 are defined
    // outside this block; their internal scratch registers are not visible here.
    stmdb       sp!, {r4-r7, lr}
    vpush       {q4-q7}                 // d8-d15 are callee-saved (AAPCS) and are used below

    // 20 bytes of core registers + 64 bytes of NEON registers now lie between
    // sp and the caller's stack arguments, hence the 84-based offsets.
    ldr         r4, [sp, #96]           // p_dec_cr
    ldr         r5, [sp, #100]          // p_enc_cr

    // Top neighbour row -> d29 (cb), d31 (cr)
    sub         r6, r0, r1
    vld1.8      {d29}, [r6]
    sub         r6, r4, r1
    vld1.8      {d31}, [r6]

    // Left neighbour column -> d28 (cb), d30 (cr)
    sub         r6, r0, #1
    GET_8BYTE_DATA_L0 d28, r6, r1
    sub         r6, r4, #1
    GET_8BYTE_DATA_L0 d30, r6, r1

    // Butterfly the top rows (scaled by 4); results land in q13/q12
    vshll.u8    q0, d29, #2
    vshll.u8    q1, d31, #2
    vtrn.32     q0, q1
    vadd.s16    q2, q0, q1
    vsub.s16    q1, q0, q1
    vtrn.16     q2, q1
    vadd.s16    q13, q2, q1
    vsub.s16    q12, q2, q1
    vtrn.32     q13, q12                // {0,1,3,2, 4,5,7,6}      q13
                                        // {8,9,11,10, 12,13,15,14} q12

    // Butterfly the left columns (scaled by 4); results land in q11/q10
    vshll.u8    q0, d28, #2
    vshll.u8    q1, d30, #2
    vtrn.32     q0, q1
    vadd.s16    q2, q0, q1
    vsub.s16    q1, q0, q1
    vtrn.16     q2, q1
    vadd.s16    q11, q2, q1
    vsub.s16    q10, q2, q1
    vtrn.32     q11, q10                // {0,1,3,2, 4,5,7,6}      q11
                                        // {8,9,11,10, 12,13,15,14} q10

    // Sums of the left column and top row: q14 = d28:d29 (cb), q15 = d30:d31 (cr)
    vpaddl.u8   q0, q14
    vpaddl.u16  q0, q0
    vadd.u32    d2, d0, d1

    vpaddl.u8   q2, q15
    vpaddl.u16  q2, q2
    vadd.u32    d3, d4, d5

    vtrn.32     q0, q2
    vrshr.u32   q1, q1, #3              // rounded mean over 8 neighbours
    vrshr.u32   q2, q2, #2              // rounded mean over 4 neighbours
    vshll.u32   q9, d4, #4              // {2cb, 2cr} q9
    vshll.u32   q8, d5, #4              // {1cb, 1cr} q8
    vshll.u32   q7, d2, #4              // {0cb, 3cb} q7
    vshll.u32   q6, d3, #4              // {0cr, 3cr} q6

    vmov.i32    d28, #0                 // accumulator: SATD of DC_BOTH
    vmov.i32    d10, #0                 // accumulator: SATD of H
    vmov.i32    d11, #0                 // accumulator: SATD of V
    vmov.i32    d30, #0                 // all-zero D register

    // p_enc cb, rows 0-3 -> d6..d9, regrouped into two 4x4 blocks
    vld1.32     {d6}, [r2], r3
    vld1.32     {d7}, [r2], r3
    vld1.32     {d8}, [r2], r3
    vld1.32     {d9}, [r2], r3
    vtrn.32     d6, d7
    vtrn.32     d8, d9
    HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
    HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30

    // p_enc cr, rows 0-3
    vld1.32     {d6}, [r5], r3
    vld1.32     {d7}, [r5], r3
    vld1.32     {d8}, [r5], r3
    vld1.32     {d9}, [r5], r3
    vtrn.32     d6, d7
    vtrn.32     d8, d9
    HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30

    // p_enc cb, rows 4-7
    vld1.32     {d6}, [r2], r3
    vld1.32     {d7}, [r2], r3
    vld1.32     {d8}, [r2], r3
    vld1.32     {d9}, [r2], r3
    vtrn.32     d6, d7
    vtrn.32     d8, d9
    HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
    HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30

    // p_enc cr, rows 4-7
    vld1.32     {d6}, [r5], r3
    vld1.32     {d7}, [r5], r3
    vld1.32     {d8}, [r5], r3
    vld1.32     {d9}, [r5], r3
    vtrn.32     d6, d7
    vtrn.32     d8, d9
    HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30

    // Remaining stack arguments (offsets include the 64-byte vpush area)
    ldr         r5, [sp, #84]           // address of best_mode
    ldr         r6, [sp, #88]           // i_lambda

    // V cost = reduced d11 + 2 * lambda -> lr
    vrshr.u16   d11, d11, #1
    vpaddl.u16  d11, d11
    vpaddl.u32  d11, d11
    vmov.u32    lr, d11[0]
    add         lr, lr, r6, lsl #1

    // H cost = reduced d10 + 2 * lambda -> r3
    vrshr.u16   d10, d10, #1
    vpaddl.u16  d10, d10
    vpaddl.u32  d10, d10
    vmov.u32    r3, d10[0]
    add         r3, r3, r6, lsl #1

    // DC cost = reduced d28 (no lambda term) -> r2
    vrshr.u16   d28, d28, #1
    vpaddl.u16  d28, d28
    vpaddl.u32  d28, d28
    vmov.u32    r2, d28[0]

    // Pick the minimum; mode 2 is the default, a strictly lower (unsigned)
    // cost switches to mode 1 and then mode 0.
    mov         r6, #2
    cmp         r3, lr
    movcc       lr, r3
    movcc       r6, #1
    cmp         r2, lr
    movcc       lr, r2
    movcc       r6, #0

    str         r6, [r5]
    mov         r0, lr

    vpop        {q4-q7}
    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
    // Scores three 4x4 intra candidates (labelled V, H and DC_BOTH by the
    // accumulator comments below), writes the winning 16-byte prediction to
    // p_dst, stores the winning mode index and returns its cost.
    //   r0 = p_dec        r1 = stride used to reach the top/left neighbours
    //   r2 = p_enc        r3 = stride used to walk the p_enc rows
    //   stack: p_dst, address of best_mode, lambda2, lambda1, lambda0
    //   returns r0 = lowest of the three costs
    // NOTE(review): HDM_TRANSFORM_4X4_L0 is defined outside this block; its
    // internal scratch registers are not visible here.
    stmdb       sp!, {r4-r7, lr}

    // Top neighbour row -> d31[0..3] (4 bytes)
    sub         r7, r0, r1
    vld1.32     {d31[0]}, [r7]

    // Left neighbour column -> d31[4..7] (4 bytes, one per row)
    sub         r7, r0, #1
    vld1.8      {d31[4]}, [r7], r1
    vld1.8      {d31[5]}, [r7], r1
    vld1.8      {d31[6]}, [r7], r1
    vld1.8      {d31[7]}, [r7], r1

    // Sum the 8 neighbours, take the rounded mean, keep (mean << 4) in d30
    vpaddl.u8   d0, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0
    vrshr.u16   d0, d0, #3
    vshl.u16    d30, d0, #4

    // Butterfly the top row and the left column (scaled by 4) in one pass:
    // top-row terms end up in d29, left-column terms in d28
    vshll.u8    q0, d31, #2
    vtrn.32     d0, d1
    vadd.s16    d2, d0, d1
    vsub.s16    d1, d0, d1
    vtrn.16     d2, d1
    vadd.s16    d29, d2, d1
    vsub.s16    d28, d2, d1
    vtrn.32     d29, d28                // {0,1,3,2 top}  d29
                                        // {0,1,3,2 left} d28

    vmov.i32    d27, #0                 // accumulator: SATD of DC_BOTH
    vmov.i32    d26, #0                 // accumulator: SATD of H
    vmov.i32    d25, #0                 // accumulator: SATD of V
    vmov.i32    d24, #0                 // all-zero D register

    // p_enc 4x4 block -> d23 (rows 0-1), d22 (rows 2-3)
    vld1.32     {d23[0]}, [r2], r3
    vld1.32     {d23[1]}, [r2], r3
    vld1.32     {d22[0]}, [r2], r3
    vld1.32     {d22[1]}, [r2], r3

    HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24

    // Per-mode lambda terms from the stack (20 bytes were pushed above)
    ldr         r5, [sp, #28]           // lambda2
    ldr         r6, [sp, #32]           // lambda1
    ldr         r7, [sp, #36]           // lambda0

    // V cost -> r0
    vrshr.u16   d25, d25, #1
    vpaddl.u16  d25, d25
    vpaddl.u32  d25, d25
    vmov.u32    r0, d25[0]
    add         r0, r0, r7

    // H cost -> r1
    vrshr.u16   d26, d26, #1
    vpaddl.u16  d26, d26
    vpaddl.u32  d26, d26
    vmov.u32    r1, d26[0]
    add         r1, r1, r6

    // DC cost -> r2
    vrshr.u16   d27, d27, #1
    vpaddl.u16  d27, d27
    vpaddl.u32  d27, d27
    vmov.u32    r2, d27[0]
    add         r2, r2, r5

    ldr         r5, [sp, #20]           // p_dst
    ldr         r6, [sp, #24]           // address of best_mode

    // r4 = unsigned minimum of the three costs
    mov         r4, r0
    cmp         r1, r4
    movcc       r4, r1
    cmp         r2, r4
    movcc       r4, r2

    // The order of the equality tests below decides ties: DC, then H, then V.
    cmp         r4, r2
    bne         satd_intra_4x4_x3_opt_jump0
    mov         r0, #2
    str         r0, [r6]
    vshr.u32    d0, d30, #4             // undo the << 4: lane 0 = mean value
    vdup.8      q1, d0[0]
    vst1.8      {q1}, [r5]              // 16 bytes of the mean
    // Plain branch: this is a jump to the epilogue, not a call, so lr and the
    // return predictor must not be touched.
    b           satd_intra_4x4_x3_opt_end

satd_intra_4x4_x3_opt_jump0:
    cmp         r4, r1
    bne         satd_intra_4x4_x3_opt_jump1
    mov         r0, #1
    str         r0, [r6]
    vdup.8      d0, d31[4]              // each row is filled with its left neighbour
    vdup.8      d1, d31[5]
    vdup.8      d2, d31[6]
    vdup.8      d3, d31[7]
    vst4.32     {d0[0],d1[0],d2[0],d3[0]}, [r5]
    b           satd_intra_4x4_x3_opt_end

satd_intra_4x4_x3_opt_jump1:
    mov         r0, #0
    str         r0, [r6]
    vst1.32     {d31[0]}, [r5]!         // top row replicated into all four rows
    vst1.32     {d31[0]}, [r5]!
    vst1.32     {d31[0]}, [r5]!
    vst1.32     {d31[0]}, [r5]!

satd_intra_4x4_x3_opt_end:
    mov         r0, r4

    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#endif
|
||||||
1963
codec/encoder/core/arm/mc_neon.S
Executable file
1963
codec/encoder/core/arm/mc_neon.S
Executable file
File diff suppressed because it is too large
Load Diff
63
codec/encoder/core/arm/memory_neon.S
Executable file
63
codec/encoder/core/arm/memory_neon.S
Executable file
@@ -0,0 +1,63 @@
|
|||||||
|
/*!
|
||||||
|
* \copy
|
||||||
|
* Copyright (c) 2013, Cisco Systems
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
.text
|
||||||
|
#include "arm_arch_common_macro.S"
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN WelsSetMemZero_neon
    // Zero-fills memory with NEON stores.
    //   r0 = destination pointer (post-incremented by every store)
    //   r1 = size in bytes, compared as a signed value
    // Behaviour by size:
    //   size <  32 : exactly 24 bytes are written
    //   size == 32 : exactly 32 bytes are written
    //   size >  32 : 64-byte chunks are written until the remaining count is <= 0
    // NOTE(review): callers are presumably expected to pass 24, 32 or a
    // multiple of 64; any other size is rounded as described above.
    veor        q0, q0, q0
    cmp         r1, #32
    beq         mem_zero_32_neon_start
    blt         mem_zero_24_neon_start

mem_zero_loop:
    subs        r1, r1, #64
    vst1.64     {q0}, [r0]!
    vst1.64     {q0}, [r0]!
    vst1.64     {q0}, [r0]!
    vst1.64     {q0}, [r0]!
    // Exit on "<= 0" rather than "== 0": with bne a size that is not a
    // multiple of 64 steps past zero and the loop never terminates. The NEON
    // stores do not alter the flags set by subs.
    bgt         mem_zero_loop
WELS_ASM_FUNC_END

mem_zero_32_neon_start:
    vst1.64     {q0}, [r0]!
    vst1.64     {q0}, [r0]!
WELS_ASM_FUNC_END

mem_zero_24_neon_start:
    vst1.64     {q0}, [r0]!
    vst1.64     {d0}, [r0]!
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#endif
|
||||||
880
codec/encoder/core/arm/pixel_neon.S
Executable file
880
codec/encoder/core/arm/pixel_neon.S
Executable file
@@ -0,0 +1,880 @@
|
|||||||
|
/*!
|
||||||
|
* \copy
|
||||||
|
* Copyright (c) 2013, Cisco Systems
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
.text
|
||||||
|
#include "arm_arch_common_macro.S"
|
||||||
|
|
||||||
|
.macro SATD_16x4
    // Consumes 4 rows of 16 pixels from [r0] (step r1, 128-bit aligned) and
    // [r2] (step r3), both pointers post-incremented. Leaves 16-bit partial
    // sums in q0 and q2 for the caller to accumulate.
    // Overwrites q0-q11; q4-q7 are callee-saved under AAPCS, so the enclosing
    // function is responsible for preserving them.
    // Loads are interleaved with arithmetic; the instruction order is deliberate.

    // Row differences (left half / right half of each row):
    //   row0 -> q4 / q6, row1 -> q5 / q7, row2 -> q8 / q10, row3 -> q9 / q11
    vld1.64     {d0, d1}, [r0,:128], r1
    vld1.64     {d2, d3}, [r2], r3

    vsubl.u8    q4, d0, d2
    vld1.64     {d4, d5}, [r0,:128], r1

    vsubl.u8    q6, d1, d3
    vld1.64     {d6, d7}, [r2], r3

    vsubl.u8    q5, d4, d6
    vld1.64     {d0, d1}, [r0,:128], r1

    vsubl.u8    q7, d5, d7
    vld1.64     {d2, d3}, [r2], r3

    vsubl.u8    q8, d0, d2
    vld1.64     {d4, d5}, [r0,:128], r1

    vsubl.u8    q10, d1, d3
    vadd.s16    q0, q4, q5

    vld1.64     {d6, d7}, [r2], r3
    vsub.s16    q1, q4, q5

    vsubl.u8    q9, d4, d6
    vsubl.u8    q11, d5, d7

    // Butterflies across the four rows
    vadd.s16    q2, q8, q9
    vsub.s16    q3, q8, q9

    vadd.s16    q4, q6, q7
    vsub.s16    q5, q6, q7

    vadd.s16    q6, q10, q11
    vsub.s16    q7, q10, q11

    vadd.s16    q8, q0, q2
    vsub.s16    q10, q0, q2

    vadd.s16    q9, q4, q6
    vsub.s16    q11, q4, q6

    vsub.s16    q0, q1, q3
    vadd.s16    q2, q1, q3

    vsub.s16    q1, q5, q7
    vadd.s16    q3, q5, q7

    // Butterflies within each row: pair neighbouring lanes, then sum / |diff|
    vtrn.16     q8, q10
    vtrn.16     q9, q11

    vadd.s16    q4, q8, q10
    vabd.s16    q6, q8, q10

    vadd.s16    q5, q9, q11
    vabd.s16    q7, q9, q11

    vabs.s16    q4, q4
    vabs.s16    q5, q5

    vtrn.16     q0, q2
    vtrn.16     q1, q3

    vadd.s16    q8, q0, q2
    vabd.s16    q10, q0, q2

    vadd.s16    q9, q1, q3
    vabd.s16    q11, q1, q3

    vabs.s16    q8, q8
    vabs.s16    q9, q9

    vtrn.32     q4, q6
    vtrn.32     q5, q7

    vtrn.32     q8, q10
    vtrn.32     q9, q11

    // Lane-wise maxima of the paired magnitudes, then fold into q0 and q2
    vmax.s16    q0, q4, q6
    vmax.s16    q1, q5, q7
    vmax.s16    q2, q8, q10
    vmax.s16    q3, q9, q11

    vadd.u16    q0, q0, q1
    vadd.u16    q2, q2, q3
.endm
|
||||||
|
|
||||||
|
.macro SATD_8x4
    // Consumes 4 rows of 8 pixels from [r0] (step r1, 64-bit aligned) and
    // [r2] (step r3), both pointers post-incremented. Leaves 16-bit partial
    // results in q0 and q1 for the caller to accumulate.
    // Overwrites q0-q11; q4-q7 are callee-saved under AAPCS, so the enclosing
    // function is responsible for preserving them.

    // Row differences: row0 -> q4, row1 -> q5, row2 -> q6, row3 -> q7
    vld1.64     {d0}, [r0,:64], r1
    vld1.64     {d1}, [r2], r3

    vld1.64     {d2}, [r0,:64], r1
    vsubl.u8    q4, d0, d1

    vld1.64     {d3}, [r2], r3
    vsubl.u8    q5, d2, d3

    vld1.64     {d4}, [r0,:64], r1
    vld1.64     {d5}, [r2], r3

    vadd.s16    q8, q4, q5
    vsubl.u8    q6, d4, d5

    vld1.64     {d6}, [r0,:64], r1
    vld1.64     {d7}, [r2], r3

    vsubl.u8    q7, d6, d7
    vsub.s16    q9, q4, q5

    // Butterflies across the four rows
    vadd.s16    q10, q6, q7
    vsub.s16    q11, q6, q7

    vadd.s16    q0, q8, q10
    vsub.s16    q1, q8, q10

    vsub.s16    q2, q9, q11
    vadd.s16    q3, q9, q11

    // Butterflies within each row: pair neighbouring lanes, then sum / |diff|
    vtrn.16     q0, q1
    vtrn.16     q2, q3

    vadd.s16    q4, q0, q1
    vabd.s16    q5, q0, q1

    vabs.s16    q4, q4
    vadd.s16    q6, q2, q3

    vabd.s16    q7, q2, q3
    vabs.s16    q6, q6

    vtrn.32     q4, q5
    vtrn.32     q6, q7

    // Lane-wise maxima of the paired magnitudes
    vmax.s16    q0, q4, q5
    vmax.s16    q1, q6, q7
.endm
|
||||||
|
|
||||||
|
.macro SAD_16x4
    // One stage of a software-pipelined 16-wide SAD.
    // On entry q4 / q5 must already hold a row loaded from [r0] / [r2] that
    // has not been accumulated yet; on exit q4 / q5 hold the next such row.
    // The stage loads four rows from each pointer ([r0] step r1, 128-bit
    // aligned; [r2] step r3) and accumulates four rows into the u16
    // accumulators q10-q13.
    // Overwrites q0-q7; q4-q7 are callee-saved under AAPCS, so the enclosing
    // function is responsible for preserving them.
    vld1.64     {d12, d13}, [r0, :128], r1
    vabal.u8    q10, d8, d10            // pending row, low half

    vld1.64     {d14, d15}, [r2], r3
    vabal.u8    q11, d9, d11            // pending row, high half

    vld1.64     {d0, d1}, [r0, :128], r1
    vabal.u8    q12, d12, d14

    vld1.64     {d2, d3}, [r2], r3
    vabal.u8    q13, d13, d15

    vld1.64     {d4, d5}, [r0, :128], r1
    vabal.u8    q10, d0, d2

    vld1.64     {d6, d7}, [r2], r3
    vabal.u8    q11, d1, d3

    vld1.64     {d8, d9}, [r0, :128], r1        // becomes the next pending row
    vabal.u8    q12, d4, d6

    vld1.64     {d10, d11}, [r2], r3
    vabal.u8    q13, d5, d7
.endm
|
||||||
|
|
||||||
|
.macro SAD_8x4
    // Accumulates the absolute differences of 4 rows of 8 pixels into the u16
    // accumulators q10-q13 (one accumulator per row).
    // Rows come from [r0] (step r1, 64-bit aligned) and [r2] (step r3); both
    // pointers are post-incremented. Overwrites d0-d7.
    vld1.64     {d0}, [r0, :64], r1
    vld1.64     {d1}, [r2], r3

    vabal.u8    q10, d0, d1
    vld1.64     {d2}, [r0, :64], r1

    vld1.64     {d3}, [r2], r3
    vabal.u8    q11, d2, d3

    vld1.64     {d4}, [r0, :64], r1
    vld1.64     {d5}, [r2], r3

    vabal.u8    q12, d4, d5
    vld1.64     {d6}, [r0, :64], r1

    vld1.64     {d7}, [r2], r3
    vabal.u8    q13, d6, d7
.endm
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
    // Sum of absolute differences over a 16x16 block.
    //   r0 = first block  (rows loaded with a 128-bit alignment hint), r1 = its stride
    //   r2 = second block (no alignment hint),                         r3 = its stride
    //   returns r0 = SAD
    // q4-q7 are used here and inside SAD_16x4; they are callee-saved under
    // AAPCS, so they are preserved around the body.
    vpush       {q4-q7}

    // Rows 0-1 initialise the accumulators q10-q13; row 2 is loaded into
    // q4 / q5 and left pending for the first SAD_16x4 stage.
    vld1.64     {q0}, [r0, :128], r1
    vld1.64     {q1}, [r2], r3

    vabdl.u8    q10, d0, d2
    vld1.64     {q2}, [r0, :128], r1

    vabdl.u8    q11, d1, d3
    vld1.64     {q3}, [r2], r3

    vld1.64     {q4}, [r0, :128], r1
    vabdl.u8    q12, d4, d6
    vld1.64     {q5}, [r2], r3
    vabdl.u8    q13, d5, d7

    // Each stage accumulates four rows and leaves one row pending in q4 / q5
    SAD_16x4
    SAD_16x4
    SAD_16x4

    // Drain the pipeline: the pending row plus the final row
    vld1.64     {q6}, [r0, :128], r1
    vabal.u8    q10, d8, d10

    vld1.64     {q7}, [r2], r3
    vabal.u8    q11, d9, d11

    vabal.u8    q12, d12, d14
    vabal.u8    q13, d13, d15

    // Horizontal reduction of the four accumulators into r0
    vadd.u16    q14, q10, q11
    vadd.u16    q15, q12, q13

    vadd.u16    q15, q14, q15
    vadd.u16    d0, d30, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0
    vmov.u32    r0, d0[0]

    vpop        {q4-q7}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
    // Sum of absolute differences over a 16-wide, 8-row block.
    //   r0 = first block  (rows loaded with a 128-bit alignment hint), r1 = its stride
    //   r2 = second block (no alignment hint),                         r3 = its stride
    //   returns r0 = SAD
    // q4-q7 are used here and inside SAD_16x4; they are callee-saved under
    // AAPCS, so they are preserved around the body.
    vpush       {q4-q7}

    // Rows 0-1 initialise the accumulators q10-q13; row 2 is loaded into
    // q4 / q5 and left pending for the SAD_16x4 stage.
    vld1.64     {q0}, [r0, :128], r1
    vld1.64     {q1}, [r2], r3

    vabdl.u8    q10, d0, d2
    vld1.64     {q2}, [r0, :128], r1

    vabdl.u8    q11, d1, d3
    vld1.64     {q3}, [r2], r3

    vld1.64     {q4}, [r0, :128], r1
    vabdl.u8    q12, d4, d6
    vld1.64     {q5}, [r2], r3
    vabdl.u8    q13, d5, d7

    // Accumulates four rows and leaves one row pending in q4 / q5
    SAD_16x4

    // Drain the pipeline: the pending row plus the final row
    vld1.64     {q6}, [r0, :128], r1
    vabal.u8    q10, d8, d10

    vld1.64     {q7}, [r2], r3
    vabal.u8    q11, d9, d11

    vabal.u8    q12, d12, d14
    vabal.u8    q13, d13, d15

    // Horizontal reduction of the four accumulators into r0
    vadd.u16    q14, q10, q11
    vadd.u16    q15, q12, q13

    vadd.u16    q15, q14, q15
    vadd.u16    d0, d30, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0
    vmov.u32    r0, d0[0]

    vpop        {q4-q7}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
    // Sum of absolute differences over an 8-wide, 16-row block.
    //   r0 = first block  (rows loaded with a 64-bit alignment hint), r1 = its stride
    //   r2 = second block (no alignment hint),                        r3 = its stride
    //   returns r0 = SAD

    // Rows 0-3 initialise the four u16 accumulators q10-q13
    vld1.64     {d0}, [r0, :64], r1
    vld1.64     {d1}, [r2], r3

    vabdl.u8    q10, d0, d1
    vld1.64     {d2}, [r0, :64], r1

    vld1.64     {d3}, [r2], r3
    vabdl.u8    q11, d2, d3

    vld1.64     {d4}, [r0, :64], r1
    vld1.64     {d5}, [r2], r3

    vabdl.u8    q12, d4, d5
    vld1.64     {d6}, [r0, :64], r1

    vld1.64     {d7}, [r2], r3
    vabdl.u8    q13, d6, d7

    // Rows 4-15, four rows per macro expansion
    SAD_8x4
    SAD_8x4
    SAD_8x4

    // Horizontal reduction of the four accumulators into r0
    vadd.u16    q14, q10, q11
    vadd.u16    q15, q12, q13
    vadd.u16    q15, q15, q14
    vadd.u16    d0, d30, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0
    vmov.u32    r0, d0[0]
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
    // Sum of absolute differences over an 8x8 block.
    //   r0 = first block  (rows loaded with a 64-bit alignment hint), r1 = its stride
    //   r2 = second block (no alignment hint),                        r3 = its stride
    //   returns r0 = SAD

    // Rows 0-3 initialise the four u16 accumulators q10-q13
    vld1.64     {d0}, [r0, :64], r1
    vld1.64     {d1}, [r2], r3

    vabdl.u8    q10, d0, d1
    vld1.64     {d2}, [r0, :64], r1

    vld1.64     {d3}, [r2], r3
    vabdl.u8    q11, d2, d3

    vld1.64     {d4}, [r0, :64], r1
    vld1.64     {d5}, [r2], r3

    vabdl.u8    q12, d4, d5
    vld1.64     {d6}, [r0, :64], r1

    vld1.64     {d7}, [r2], r3
    vabdl.u8    q13, d6, d7

    // Rows 4-7
    SAD_8x4

    // Horizontal reduction of the four accumulators into r0
    vadd.u16    q14, q10, q11
    vadd.u16    q15, q12, q13
    vadd.u16    q15, q15, q14
    vadd.u16    d0, d30, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0
    vmov.u32    r0, d0[0]
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
    // Sum of absolute differences over a 4x4 block using the integer SIMD
    // instructions usad8 / usada8 (four byte lanes per 32-bit word).
    //   r0 = first block,  r1 = its stride
    //   r2 = second block, r3 = its stride
    //   returns r0 = SAD
    // Each row is fetched as one 32-bit word; lr holds the running total.
    stmdb       sp!, {r4-r5, lr}

    // Row 0 starts the total
    ldr         r4, [r0], r1
    ldr         r5, [r2], r3
    usad8       lr, r4, r5

    // Row 1
    ldr         r4, [r0], r1
    ldr         r5, [r2], r3
    usada8      lr, r4, r5, lr

    // Row 2
    ldr         r4, [r0], r1
    ldr         r5, [r2], r3
    usada8      lr, r4, r5, lr

    // Row 3: no pointer update needed, and the final total goes straight to r0
    ldr         r4, [r0]
    ldr         r5, [r2]
    usada8      r0, r4, r5, lr

    ldmia       sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
    // Computes four 16x16 SADs of pix1 against pix2 shifted by one row up,
    // one row down, one pixel left and one pixel right.
    //   r0 = pix1, r1 = pix1 stride
    //   r2 = pix2, r3 = pix2 stride
    //   stack: pointer to four 32-bit results, written in the order
    //          {up, down, left, right}; each sum is shifted left by 4
    //          (multiplied by 16) before it is stored.
    // Only caller-saved NEON registers are used (q0-q3, q8-q11, q13, q15), so
    // nothing has to be preserved on the stack. The temporaries live in
    // q8 / q10 rather than q4 / q6, because d8-d15 are callee-saved under AAPCS.
    stmdb       sp!, {r4-r5, lr}

    // Start addresses of the shifted pix2 windows
    sub         r4, r2, #1              // pix2 - 1
    add         r5, r2, #1              // pix2 + 1
    sub         r2, r2, r3              // pix2 - stride

    // First row
    vld1.8      {q0}, [r0], r1          // pix1

    vld1.8      {q1}, [r2], r3          // pix2 - stride
    vld1.8      {q10}, [r2], r3         // pix2 (becomes "pix2 - stride" for the next row)
    vld1.8      {q2}, [r2], r3          // pix2 + stride

    vld1.8      {q3}, [r4], r3          // pix2 - 1
    vld1.8      {q8}, [r5], r3          // pix2 + 1

    // Initialise the four u16 accumulators
    vabdl.u8    q15, d0, d2             // up
    vabal.u8    q15, d1, d3

    vabdl.u8    q13, d0, d4             // down
    vabal.u8    q13, d1, d5

    vabdl.u8    q11, d0, d6             // left
    vabal.u8    q11, d1, d7

    vabdl.u8    q9, d0, d16             // right
    vabal.u8    q9, d1, d17

    mov         lr, #15                 // 15 rows remain
pixel_sad_4_16x16_loop_0:

    // Slide the three-row window down: only one new pix2 row is loaded
    vld1.8      {q0}, [r0], r1          // pix1
    vmov.8      q1, q10                 // pix2 - stride
    vmov.8      q10, q2
    vabal.u8    q15, d0, d2
    vld1.8      {q2}, [r2], r3          // pix2 + stride
    vabal.u8    q15, d1, d3
    vld1.8      {q3}, [r4], r3          // pix2 - 1
    vabal.u8    q13, d0, d4
    vld1.8      {q8}, [r5], r3          // pix2 + 1
    vabal.u8    q13, d1, d5
    subs        lr, lr, #1              // NEON instructions below leave the flags intact

    vabal.u8    q11, d0, d6
    vabal.u8    q11, d1, d7

    vabal.u8    q9, d0, d16
    vabal.u8    q9, d1, d17

    bne         pixel_sad_4_16x16_loop_0

    // Result pointer: first stack argument, above the 12 bytes pushed on entry
    ldr         r0, [sp, #12]

    // Horizontal reduction: one total per D register
    vadd.u16    d0, d30, d31
    vadd.u16    d1, d26, d27
    vadd.u16    d2, d22, d23
    vadd.u16    d3, d18, d19

    vpaddl.u16  q0, q0
    vpaddl.u16  q1, q1

    vpaddl.u32  q0, q0
    vpaddl.u32  q1, q1

    vshl.u32    q0, q0, #4
    vshl.u32    q1, q1, #4
    vst4.32     {d0[0],d1[0],d2[0],d3[0]}, [r0]

    ldmia       sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon
    // Computes four 16-wide, 8-row SADs of pix1 against pix2 shifted by one
    // row up, one row down, one pixel left and one pixel right.
    //   r0 = pix1, r1 = pix1 stride
    //   r2 = pix2, r3 = pix2 stride
    //   stack: pointer to four 32-bit results, written in the order
    //          {up, down, left, right}; each sum is shifted left by 4
    //          (multiplied by 16) before it is stored.
    // Only caller-saved NEON registers are used (q0-q3, q8-q11, q13, q15), so
    // nothing has to be preserved on the stack. The temporaries live in
    // q8 / q10 rather than q4 / q6, because d8-d15 are callee-saved under AAPCS.
    stmdb       sp!, {r4-r5, lr}

    // Start addresses of the shifted pix2 windows
    sub         r4, r2, #1              // pix2 - 1
    add         r5, r2, #1              // pix2 + 1
    sub         r2, r2, r3              // pix2 - stride

    // First row
    vld1.8      {q0}, [r0], r1          // pix1

    vld1.8      {q1}, [r2], r3          // pix2 - stride
    vld1.8      {q10}, [r2], r3         // pix2 (becomes "pix2 - stride" for the next row)
    vld1.8      {q2}, [r2], r3          // pix2 + stride

    vld1.8      {q3}, [r4], r3          // pix2 - 1
    vld1.8      {q8}, [r5], r3          // pix2 + 1

    // Initialise the four u16 accumulators
    vabdl.u8    q15, d0, d2             // up
    vabal.u8    q15, d1, d3

    vabdl.u8    q13, d0, d4             // down
    vabal.u8    q13, d1, d5

    vabdl.u8    q11, d0, d6             // left
    vabal.u8    q11, d1, d7

    vabdl.u8    q9, d0, d16             // right
    vabal.u8    q9, d1, d17

    mov         lr, #7                  // 7 rows remain
pixel_sad_4_16x8_loop_0:

    // Slide the three-row window down: only one new pix2 row is loaded
    vld1.8      {q0}, [r0], r1          // pix1
    vmov.8      q1, q10                 // pix2 - stride
    vmov.8      q10, q2
    vabal.u8    q15, d0, d2
    vld1.8      {q2}, [r2], r3          // pix2 + stride
    vabal.u8    q15, d1, d3
    vld1.8      {q3}, [r4], r3          // pix2 - 1
    vabal.u8    q13, d0, d4
    vld1.8      {q8}, [r5], r3          // pix2 + 1
    vabal.u8    q13, d1, d5
    subs        lr, lr, #1              // NEON instructions below leave the flags intact

    vabal.u8    q11, d0, d6
    vabal.u8    q11, d1, d7

    vabal.u8    q9, d0, d16
    vabal.u8    q9, d1, d17

    bne         pixel_sad_4_16x8_loop_0

    // Result pointer: first stack argument, above the 12 bytes pushed on entry
    ldr         r0, [sp, #12]

    // Horizontal reduction: one total per D register
    vadd.u16    d0, d30, d31
    vadd.u16    d1, d26, d27
    vadd.u16    d2, d22, d23
    vadd.u16    d3, d18, d19

    vpaddl.u16  q0, q0
    vpaddl.u16  q1, q1

    vpaddl.u32  q0, q0
    vpaddl.u32  q1, q1

    vshl.u32    q0, q0, #4
    vshl.u32    q1, q1, #4
    vst4.32     {d0[0],d1[0],d2[0],d3[0]}, [r0]

    ldmia       sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon
    // Computes four 8-wide, 16-row SADs of pix1 against pix2 shifted by one
    // row up, one row down, one pixel left and one pixel right.
    //   r0 = pix1, r1 = pix1 stride
    //   r2 = pix2, r3 = pix2 stride
    //   stack: pointer to four 32-bit results, written in the order
    //          {up, down, left, right}; each sum is shifted left by 4
    //          (multiplied by 16) before it is stored.
    stmdb       sp!, {r4-r5, lr}

    // Start addresses of the shifted pix2 windows
    sub         r4, r2, #1              // pix2 - 1
    add         r5, r2, #1              // pix2 + 1
    sub         r2, r2, r3              // pix2 - stride

    // First row (8 bytes per load)
    vld1.8      {d0}, [r0], r1          // pix1

    vld1.8      {d1}, [r2], r3          // pix2 - stride
    vld1.8      {d6}, [r2], r3          // pix2 (becomes "pix2 - stride" for the next row)
    vld1.8      {d2}, [r2], r3          // pix2 + stride

    vld1.8      {d3}, [r4], r3          // pix2 - 1
    vld1.8      {d4}, [r5], r3          // pix2 + 1

    // Initialise the four u16 accumulators
    vabdl.u8    q15, d0, d1             // up
    vabdl.u8    q14, d0, d2             // down
    vabdl.u8    q13, d0, d3             // left
    vabdl.u8    q12, d0, d4             // right

    mov         lr, #15                 // 15 rows remain
pixel_sad_4_8x16_loop_0:

    // Slide the three-row window down: only one new pix2 row is loaded
    vld1.8      {d0}, [r0], r1          // pix1
    vmov.8      d1, d6                  // pix2 - stride
    vmov.8      d6, d2
    vld1.8      {d2}, [r2], r3          // pix2 + stride
    vld1.8      {d3}, [r4], r3          // pix2 - 1
    vabal.u8    q15, d0, d1

    vld1.8      {d4}, [r5], r3          // pix2 + 1
    vabal.u8    q14, d0, d2
    vabal.u8    q13, d0, d3
    vabal.u8    q12, d0, d4
    subs        lr, lr, #1

    bne         pixel_sad_4_8x16_loop_0

    // Result pointer: first stack argument, above the 12 bytes pushed on entry
    ldr         r0, [sp, #12]

    // Horizontal reduction: one total per D register
    vadd.u16    d0, d30, d31
    vadd.u16    d1, d28, d29
    vadd.u16    d2, d26, d27
    vadd.u16    d3, d24, d25

    vpaddl.u16  q0, q0
    vpaddl.u16  q1, q1

    vpaddl.u32  q0, q0
    vpaddl.u32  q1, q1

    vshl.u32    q0, q0, #4
    vshl.u32    q1, q1, #4
    vst4.32     {d0[0],d1[0],d2[0],d3[0]}, [r0]

    ldmia       sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon
    // Computes four 8x8 SADs of pix1 against pix2 shifted by one row up, one
    // row down, one pixel left and one pixel right.
    //   r0 = pix1, r1 = pix1 stride
    //   r2 = pix2, r3 = pix2 stride
    //   stack: pointer to four 32-bit results, written in the order
    //          {up, down, left, right}; each sum is shifted left by 4
    //          (multiplied by 16) before it is stored.
    stmdb       sp!, {r4-r5, lr}

    // Start addresses of the shifted pix2 windows
    sub         r4, r2, #1              // pix2 - 1
    add         r5, r2, #1              // pix2 + 1
    sub         r2, r2, r3              // pix2 - stride

    // First row (8 bytes per load)
    vld1.8      {d0}, [r0], r1          // pix1

    vld1.8      {d1}, [r2], r3          // pix2 - stride
    vld1.8      {d6}, [r2], r3          // pix2 (becomes "pix2 - stride" for the next row)
    vld1.8      {d2}, [r2], r3          // pix2 + stride

    vld1.8      {d3}, [r4], r3          // pix2 - 1
    vld1.8      {d4}, [r5], r3          // pix2 + 1

    // Initialise the four u16 accumulators
    vabdl.u8    q15, d0, d1             // up
    vabdl.u8    q14, d0, d2             // down
    vabdl.u8    q13, d0, d3             // left
    vabdl.u8    q12, d0, d4             // right

    mov         lr, #7                  // 7 rows remain
pixel_sad_4_8x8_loop_0:

    // Slide the three-row window down: only one new pix2 row is loaded
    vld1.8      {d0}, [r0], r1          // pix1
    vmov.8      d1, d6                  // pix2 - stride
    vmov.8      d6, d2
    vld1.8      {d2}, [r2], r3          // pix2 + stride
    vld1.8      {d3}, [r4], r3          // pix2 - 1
    vabal.u8    q15, d0, d1

    vld1.8      {d4}, [r5], r3          // pix2 + 1
    vabal.u8    q14, d0, d2
    vabal.u8    q13, d0, d3
    vabal.u8    q12, d0, d4
    subs        lr, lr, #1
    bne         pixel_sad_4_8x8_loop_0

    // Result pointer: first stack argument, above the 12 bytes pushed on entry
    ldr         r0, [sp, #12]

    // Horizontal reduction: one total per D register
    vadd.u16    d0, d30, d31
    vadd.u16    d1, d28, d29
    vadd.u16    d2, d26, d27
    vadd.u16    d3, d24, d25

    vpaddl.u16  q0, q0
    vpaddl.u16  q1, q1

    vpaddl.u32  q0, q0
    vpaddl.u32  q1, q1

    vshl.u32    q0, q0, #4
    vshl.u32    q1, q1, #4
    vst4.32     {d0[0],d1[0],d2[0],d3[0]}, [r0]

    ldmia       sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// pixel_sad_4_4x4_neon
// Computes four 4x4 sums of absolute differences between the pix1 block and
// the pix2 block displaced by one pixel up, down, left and right.
//
// In:   r0 = pix1                 r1 = pix1 row stride in bytes
//       r2 = pix2                 r3 = pix2 row stride in bytes
//       first stack word = pointer to four 32-bit result slots
// Out:  result[0] = SAD against pix2 - stride (row above)
//       result[1] = SAD against pix2 + stride (row below)
//       result[2] = SAD against pix2 - 1
//       result[3] = SAD against pix2 + 1
//       Each value is shifted left by 4 before it is stored; the reason for
//       that scaling is not visible in this block.
// Uses: r0, d0-d7, q8-q15. Nothing is pushed, so the stack argument is read
//       at [sp]. Only caller-saved NEON registers are written: d8-d15 are
//       callee-saved under the ARM procedure call standard and are left
//       untouched, which is why the last pair of rows goes into d16.
WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon

    // pix1: d0 = rows 0-1, d1 = rows 2-3
    vld1.32     {d0[0]}, [r0], r1
    vld1.32     {d0[1]}, [r0], r1
    vld1.32     {d1[0]}, [r0], r1
    vld1.32     {d1[1]}, [r0]

    // pix2 rows -1..4: d2 = rows -1,0   d3 = rows 1,2   d4 = rows 3,4
    sub         r0, r2, r3
    vld1.32     {d2[0]}, [r0], r3
    vld1.32     {d2[1]}, [r0], r3
    vld1.32     {d3[0]}, [r0], r3
    vld1.32     {d3[1]}, [r0], r3
    vld1.32     {d4[0]}, [r0], r3
    vld1.32     {d4[1]}, [r0]

    // pix2 shifted by -1: d5 = rows 0-1, d6 = rows 2-3
    sub         r0, r2, #1
    vld1.32     {d5[0]}, [r0], r3
    vld1.32     {d5[1]}, [r0], r3
    vld1.32     {d6[0]}, [r0], r3
    vld1.32     {d6[1]}, [r0]

    // pix2 shifted by +1: d7 = rows 0-1, d16 = rows 2-3
    add         r0, r2, #1
    vld1.32     {d7[0]}, [r0], r3
    vld1.32     {d7[1]}, [r0], r3
    vld1.32     {d16[0]}, [r0], r3
    vld1.32     {d16[1]}, [r0]

    // Absolute differences, widened to 16 bits
    vabdl.u8    q15, d0, d2                 // above, rows 0-1
    vabdl.u8    q14, d1, d3                 // above, rows 2-3

    vabdl.u8    q13, d0, d3                 // below, rows 0-1
    vabdl.u8    q12, d1, d4                 // below, rows 2-3

    vabdl.u8    q11, d0, d5                 // minus one, rows 0-1
    vabdl.u8    q10, d1, d6                 // minus one, rows 2-3

    vabdl.u8    q9, d0, d7                  // plus one, rows 0-1
    vabdl.u8    q8, d1, d16                 // plus one, rows 2-3; d16 is consumed here, where q8 is overwritten

    // Fetch the result pointer passed on the stack into r0
    ldr         r0, [sp]
    vadd.u16    q0, q14, q15
    vadd.u16    q1, q12, q13
    vadd.u16    q2, q10, q11
    vadd.u16    q3, q8 , q9

    // Fold each direction into one d register; the order matters because
    // d1, d2 and d3 are sources of one step before they become destinations.
    vadd.u16    d0, d1
    vadd.u16    d1, d2, d3
    vadd.u16    d2, d4, d5
    vadd.u16    d3, d6, d7

    vpaddl.u16  q0, q0
    vpaddl.u16  q1, q1

    vpaddl.u32  q0, q0
    vpaddl.u32  q1, q1

    vshl.u32    q0, #4
    vshl.u32    q1, #4
    vst4.32     {d0[0],d1[0],d2[0],d3[0]}, [r0]     // lane 0 of d0..d3 -> four consecutive words

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// pixel_satd_16x16_neon
// Adds up the results of four SATD_16x4 expansions and returns the total
// in r0. SATD_16x4 is a macro defined outside this block; judging only from
// its use here, each expansion leaves partial sums in q0 and q2 -- TODO
// confirm against the macro body, including which registers it consumes.
// The running total is kept in the eight 16-bit lanes of q15.
WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon

    SATD_16x4
    vadd.u16    q15, q2, q0                 // first strip starts the running total

    SATD_16x4
    vadd.u16    q15, q15, q2
    vadd.u16    q15, q15, q0

    SATD_16x4
    vadd.u16    q15, q15, q2
    vadd.u16    q15, q15, q0

    SATD_16x4
    vadd.u16    q15, q15, q2
    vadd.u16    q15, q15, q0

    // Horizontal reduction: 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64
    vadd.u16    d0, d30, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0

    vmov.32     r0, d0[0]                   // low 32 bits are the return value
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// pixel_satd_16x8_neon
// Adds up the results of two SATD_16x4 expansions and returns the total in
// r0. SATD_16x4 is a macro defined outside this block; judging only from its
// use here, each expansion leaves partial sums in q0 and q2 -- TODO confirm
// against the macro body. The running total is kept in the 16-bit lanes of
// q15.
WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon

    SATD_16x4
    vadd.u16    q15, q2, q0                 // first strip starts the running total

    SATD_16x4
    vadd.u16    q15, q15, q2
    vadd.u16    q15, q15, q0

    // Horizontal reduction: 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64
    vadd.u16    d0, d30, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0

    vmov.32     r0, d0[0]                   // low 32 bits are the return value
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// pixel_satd_8x16_neon
// Adds up the results of four SATD_8x4 expansions and returns the total in
// r0. SATD_8x4 is a macro defined outside this block; judging only from its
// use here, each expansion leaves partial sums in q0 and q1 -- TODO confirm
// against the macro body, including which registers it consumes.
// The running total is kept in the eight 16-bit lanes of q15.
WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon

    SATD_8x4
    vadd.u16    q15, q1, q0                 // first strip starts the running total

    SATD_8x4
    vadd.u16    q15, q15, q1
    vadd.u16    q15, q15, q0

    SATD_8x4
    vadd.u16    q15, q15, q1
    vadd.u16    q15, q15, q0

    SATD_8x4
    vadd.u16    q15, q15, q1
    vadd.u16    q15, q15, q0

    // Horizontal reduction: 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64
    vadd.u16    d0, d30, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0

    vmov.32     r0, d0[0]                   // low 32 bits are the return value
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// pixel_satd_8x8_neon
// Adds up the results of two SATD_8x4 expansions and returns the total in
// r0. SATD_8x4 is a macro defined outside this block; judging only from its
// use here, each expansion leaves partial sums in q0 and q1 -- TODO confirm
// against the macro body. The running total is kept in the 16-bit lanes of
// q15.
WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon

    SATD_8x4
    vadd.u16    q15, q1, q0                 // first strip starts the running total

    SATD_8x4
    vadd.u16    q15, q15, q1
    vadd.u16    q15, q15, q0

    // Horizontal reduction: 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64
    vadd.u16    d0, d30, d31
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0

    vmov.32     r0, d0[0]                   // low 32 bits are the return value
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// pixel_satd_4x4_neon
// Takes the 4x4 difference pix1 - pix2, runs it through two add/subtract
// butterfly passes (vertical, then horizontal), sums the absolute values of
// the result and returns half of that sum, rounded, in r0.
//
// In:   r0 = pix1                 r1 = pix1 row stride in bytes
//       r2 = pix2                 r3 = pix2 row stride in bytes
// Out:  r0 = result
// Uses: r0, r2, q0, q1, q12-q15.
// The lane-index lists in braces are carried over from the original
// annotations of this routine.
WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon

    // Load 16 bytes from each block, one 4-byte row per lane.
    // The two streams use disjoint registers and are interleaved.
    vld1.32     {d0[0]}, [r0], r1           // pix1 row 0
    vld1.32     {d2[0]}, [r2], r3           // pix2 row 0
    vld1.32     {d0[1]}, [r0], r1           // pix1 row 1
    vld1.32     {d2[1]}, [r2], r3           // pix2 row 1
    vld1.32     {d1[0]}, [r0], r1           // pix1 row 2
    vld1.32     {d3[0]}, [r2], r3           // pix2 row 2
    vld1.32     {d1[1]}, [r0]               // pix1 row 3
    vld1.32     {d3[1]}, [r2]               // pix2 row 3

    // Widening difference, 8 x s16 per register
    vsubl.u8    q15, d0, d2                 // {0,1,2,3,4,5,6,7}
    vsubl.u8    q14, d1, d3                 // {8,9,10,11,12,13,14,15}

    // Vertical butterflies
    vadd.s16    q13, q15, q14               // {0,4,8,12,1,5,9,13}
    vsub.s16    q12, q15, q14               // {2,6,10,14,3,7,11,15}
    vswp        d27, d24                    // exchange high half of q13 with low half of q12
    vadd.s16    q15, q13, q12               // {0,1,2,3,4,5,6,7}
    vsub.s16    q14, q13, q12               // {12,13,14,15,8,9,10,11}

    // Horizontal butterflies, regrouping lanes with transposes
    vtrn.32     q15, q14
    vadd.s16    q13, q15, q14
    vsub.s16    q12, q15, q14

    vtrn.16     q13, q12
    vadd.s16    q15, q13, q12

    // Absolute values: |a + b| directly, |a - b| through vabd
    vabs.s16    q15, q15
    vabd.s16    q14, q13, q12

    vadd.u16    q0, q15, q14

    // Reduce: rounding halving add of the two halves, then pairwise widening
    vrhadd.u16  d0, d1
    vpaddl.u16  d0, d0
    vpaddl.u32  d0, d0

    vmov.u32    r0, d0[0]                   // low 32 bits are the return value

WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
1312
codec/encoder/core/arm/reconstruct_neon.S
Executable file
1312
codec/encoder/core/arm/reconstruct_neon.S
Executable file
File diff suppressed because it is too large
Load Diff
@@ -61,6 +61,16 @@ uint8_t uiFilterIdc;
|
|||||||
uint8_t uiReserved;
|
uint8_t uiReserved;
|
||||||
} SDeblockingFilter;
|
} SDeblockingFilter;
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif//__cplusplus
|
||||||
|
#if defined(HAVE_NEON)
|
||||||
|
void WelsNonZeroCount_neon(int8_t * pNonZeroCount);
|
||||||
|
void DeblockingBSCalcEnc_neon(int8_t *pNzc, SMVUnitXY *pMv, int32_t iBoundryFlag, int32_t iMbStride, uint8_t (*pBS)[4][4]);
|
||||||
|
#endif
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif//__cplusplus
|
||||||
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu);
|
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu);
|
||||||
|
|
||||||
void WelsNonZeroCount_c (int8_t* pNonZeroCount);
|
void WelsNonZeroCount_c (int8_t* pNonZeroCount);
|
||||||
|
|||||||
@@ -70,6 +70,16 @@ void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPredict
|
|||||||
int16_t* pDctDc);
|
int16_t* pDctDc);
|
||||||
#endif//X86_ASM
|
#endif//X86_ASM
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
void WelsDequantFour4x4_neon(int16_t* pDct, const uint16_t* kpMF);
|
||||||
|
void WelsDequant4x4_neon(int16_t* pDct, const uint16_t* kpMF);
|
||||||
|
void WelsDequantIHadamard4x4_neon(int16_t* pRes, const uint16_t kuiMF);
|
||||||
|
|
||||||
|
void WelsIDctT4Rec_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
|
||||||
|
void WelsIDctFourT4Rec_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
|
||||||
|
void WelsIDctRecI16x16Dc_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__cplusplus)
|
#if defined(__cplusplus)
|
||||||
}
|
}
|
||||||
#endif//__cplusplus
|
#endif//__cplusplus
|
||||||
|
|||||||
@@ -122,6 +122,28 @@ void WelsCopy16x16_sse2 (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t
|
|||||||
void WelsCopy16x16NotAligned_sse2 (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t iStrideS);
|
void WelsCopy16x16NotAligned_sse2 (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t iStrideS);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
void WelsCopy8x8_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
|
||||||
|
void WelsCopy16x16_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
|
||||||
|
void WelsCopy16x16NotAligned_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
|
||||||
|
void WelsCopy16x8NotAligned_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
|
||||||
|
void WelsCopy8x16_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
|
||||||
|
|
||||||
|
void WelsHadamardT4Dc_neon(int16_t* pLumaDc, int16_t* pDct);
|
||||||
|
int32_t WelsHadamardQuant2x2_neon(int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct, int16_t* pBlock);
|
||||||
|
int32_t WelsHadamardQuant2x2Skip_neon(int16_t* pRes, int16_t iFF, int16_t iMF);
|
||||||
|
int32_t WelsHadamardQuant2x2SkipKernel_neon(int16_t *pRes, int16_t iThreshold);// avoid divide operator
|
||||||
|
|
||||||
|
void WelsDctT4_neon(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
|
||||||
|
void WelsDctFourT4_neon(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
|
||||||
|
|
||||||
|
int32_t WelsGetNoneZeroCount_neon(int16_t* pLevel);
|
||||||
|
|
||||||
|
void WelsQuant4x4_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
||||||
|
void WelsQuant4x4Dc_neon(int16_t* pDct, int16_t iFF, int16_t iMF);
|
||||||
|
void WelsQuantFour4x4_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
||||||
|
void WelsQuantFour4x4Max_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__cplusplus)
|
#if defined(__cplusplus)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -124,6 +124,8 @@ void WelsSetMemZeroAligned64_sse2 (void* pDst, int32_t iSize);
|
|||||||
void WelsSetMemZeroSize64_mmx (void* pDst, int32_t iSize);
|
void WelsSetMemZeroSize64_mmx (void* pDst, int32_t iSize);
|
||||||
void WelsSetMemZeroSize8_mmx (void* pDst, int32_t iSize);
|
void WelsSetMemZeroSize8_mmx (void* pDst, int32_t iSize);
|
||||||
void WelsPrefetchZero_mmx (int8_t const* kpDst);
|
void WelsPrefetchZero_mmx (int8_t const* kpDst);
|
||||||
|
#elif defined(HAVE_NEON)
|
||||||
|
void WelsSetMemZero_neon(void* pDst, int32_t iSize);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__cplusplus)
|
#if defined(__cplusplus)
|
||||||
|
|||||||
@@ -605,6 +605,23 @@ void DeblockingMbAvcbase (SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilte
|
|||||||
DeblockingIntraMb (&pFunc->pfDeblocking, pCurMb, pFilter);
|
DeblockingIntraMb (&pFunc->pfDeblocking, pCurMb, pFilter);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
#if (defined(HAVE_NEON) && defined(SINGLE_REF_FRAME))
|
||||||
|
DeblockingBSCalcEnc_neon(pCurMb->pNonZeroCount, pCurMb->sMv, pCurMb->uiNeighborAvail, iMbStride, uiBS);
|
||||||
|
if (iLeftFlag){
|
||||||
|
if (IS_INTRA((pCurMb-1)->uiMbType)) {
|
||||||
|
*(uint32_t*)uiBS[0][0] = 0x04040404;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
*(uint32_t*)uiBS[0][0] = 0;
|
||||||
|
}
|
||||||
|
if (iTopFlag) {
|
||||||
|
if (IS_INTRA((pCurMb-iMbStride)->uiMbType)) {
|
||||||
|
*(uint32_t*)uiBS[1][0] = 0x04040404;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
*(uint32_t*)uiBS[1][0] = 0;
|
||||||
|
}
|
||||||
|
#else
|
||||||
if (iLeftFlag) {
|
if (iLeftFlag) {
|
||||||
* (uint32_t*)uiBS[0][0] = IS_INTRA ((pCurMb - 1)->uiMbType) ? 0x04040404 : DeblockingBSMarginalMBAvcbase (pCurMb,
|
* (uint32_t*)uiBS[0][0] = IS_INTRA ((pCurMb - 1)->uiMbType) ? 0x04040404 : DeblockingBSMarginalMBAvcbase (pCurMb,
|
||||||
pCurMb - 1, 0);
|
pCurMb - 1, 0);
|
||||||
@@ -630,7 +647,7 @@ void DeblockingMbAvcbase (SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilte
|
|||||||
* (uint32_t*)uiBS[0][1] = * (uint32_t*)uiBS[0][2] = * (uint32_t*)uiBS[0][3] =
|
* (uint32_t*)uiBS[0][1] = * (uint32_t*)uiBS[0][2] = * (uint32_t*)uiBS[0][3] =
|
||||||
* (uint32_t*)uiBS[1][1] = * (uint32_t*)uiBS[1][2] = * (uint32_t*)uiBS[1][3] = 0;
|
* (uint32_t*)uiBS[1][1] = * (uint32_t*)uiBS[1][2] = * (uint32_t*)uiBS[1][3] = 0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
DeblockingInterMb (&pFunc->pfDeblocking, pCurMb, pFilter, uiBS);
|
DeblockingInterMb (&pFunc->pfDeblocking, pCurMb, pFilter, uiBS);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -768,10 +785,13 @@ void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
|
|||||||
}
|
}
|
||||||
/*!
 * \brief   Select the routine stored through pfSetNZCZero.
 * \param   pfSetNZCZero  receives the selected function pointer
 * \param   iCpu          CPU feature flags; only WELS_CPU_NEON is tested here,
 *                        and only when HAVE_NEON is defined
 *
 * The C routine is the default. The NEON routine replaces it when the build
 * has HAVE_NEON and the flag is set.
 */
void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu) {
  PSetNoneZeroCountZeroFunc pfSelected = WelsNonZeroCount_c;
#if defined(HAVE_NEON)
  if ((iCpu & WELS_CPU_NEON) != 0)
    pfSelected = WelsNonZeroCount_neon;
#endif
  *pfSetNZCZero = pfSelected;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
|
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
|
||||||
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_c;
|
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_c;
|
||||||
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_c;
|
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_c;
|
||||||
@@ -796,6 +816,20 @@ void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
|
|||||||
pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_ssse3;
|
pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_ssse3;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_NEON)
|
||||||
|
if (iCpu & WELS_CPU_NEON ) {
|
||||||
|
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
|
||||||
|
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;
|
||||||
|
pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_neon;
|
||||||
|
pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_neon;
|
||||||
|
|
||||||
|
pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_neon;
|
||||||
|
pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_neon;
|
||||||
|
pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_neon;
|
||||||
|
pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_neon;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -270,5 +270,17 @@ void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFl
|
|||||||
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
|
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
|
||||||
}
|
}
|
||||||
#endif//X86_ASM
|
#endif//X86_ASM
|
||||||
|
|
||||||
|
#if defined(HAVE_NEON)
|
||||||
|
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||||
|
pFuncList->pfDequantization4x4 = WelsDequant4x4_neon;
|
||||||
|
pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_neon;
|
||||||
|
pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_neon;
|
||||||
|
|
||||||
|
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_neon;
|
||||||
|
pFuncList->pfIDctT4 = WelsIDctT4Rec_neon;
|
||||||
|
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_neon;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -508,6 +508,13 @@ int32_t WelsGetNoneZeroCount_c (int16_t* pLevel) {
|
|||||||
return (16 - iCnt);
|
return (16 - iCnt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
int32_t WelsHadamardQuant2x2Skip_neon(int16_t* pRes, int16_t iFF, int16_t iMF) {
|
||||||
|
int16_t iThreshold = ((1<<16)-1)/iMF - iFF;
|
||||||
|
return WelsHadamardQuant2x2SkipKernel_neon(pRes, iThreshold);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||||
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c;
|
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c;
|
||||||
pFuncList->pfCopy16x16Aligned =
|
pFuncList->pfCopy16x16Aligned =
|
||||||
@@ -571,5 +578,28 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
|||||||
//#endif//MACOS
|
//#endif//MACOS
|
||||||
|
|
||||||
#endif//X86_ASM
|
#endif//X86_ASM
|
||||||
|
|
||||||
|
#if defined(HAVE_NEON)
|
||||||
|
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||||
|
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_neon;
|
||||||
|
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_neon;
|
||||||
|
pFuncList->pfDctT4 = WelsDctT4_neon;
|
||||||
|
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_neon;
|
||||||
|
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_neon;
|
||||||
|
|
||||||
|
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_neon;
|
||||||
|
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_neon;
|
||||||
|
|
||||||
|
pFuncList->pfQuantization4x4 = WelsQuant4x4_neon;
|
||||||
|
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_neon;
|
||||||
|
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_neon;
|
||||||
|
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_neon;
|
||||||
|
|
||||||
|
pFuncList->pfCopy16x16Aligned = WelsCopy16x16_neon;
|
||||||
|
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_neon;
|
||||||
|
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_neon;
|
||||||
|
pFuncList->pfDctFourT4 = WelsDctFourT4_neon;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -171,6 +171,14 @@ int32_t InitFunctionPointers (SWelsFuncPtrList* pFuncList, SWelsSvcCodingParam*
|
|||||||
}
|
}
|
||||||
#endif//X86_ASM
|
#endif//X86_ASM
|
||||||
|
|
||||||
|
#if defined(HAVE_NEON)
|
||||||
|
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||||
|
pFuncList->pfSetMemZeroSize8 = WelsSetMemZero_neon;
|
||||||
|
pFuncList->pfSetMemZeroSize64Aligned16 = WelsSetMemZero_neon;
|
||||||
|
pFuncList->pfSetMemZeroSize64 = WelsSetMemZero_neon;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
InitExpandPictureFunc (pFuncList, uiCpuFlag);
|
InitExpandPictureFunc (pFuncList, uiCpuFlag);
|
||||||
|
|
||||||
/* Intra_Prediction_fn*/
|
/* Intra_Prediction_fn*/
|
||||||
|
|||||||
@@ -1944,6 +1944,13 @@ int32_t WelsInitEncoderExt (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pCodingPar
|
|||||||
else if (uiCpuFeatureFlags & WELS_CPU_CACHELINE_16)
|
else if (uiCpuFeatureFlags & WELS_CPU_CACHELINE_16)
|
||||||
iCacheLineSize = 16;
|
iCacheLineSize = 16;
|
||||||
OutputCpuFeaturesLog (uiCpuFeatureFlags, uiCpuCores, iCacheLineSize);
|
OutputCpuFeaturesLog (uiCpuFeatureFlags, uiCpuCores, iCacheLineSize);
|
||||||
|
#elif defined(HAVE_NEON)
|
||||||
|
#if defined(ANDROID_NDK)
|
||||||
|
uiCpuFeatureFlags = WelsCPUFeatureDetectAndroid();
|
||||||
|
#endif
|
||||||
|
#if defined(APPLE_IOS)
|
||||||
|
uiCpuFeatureFlags = WelsCPUFeatureDetectIOS();
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
iCacheLineSize = 16; // 16 bytes aligned in default
|
iCacheLineSize = 16; // 16 bytes aligned in default
|
||||||
#endif//X86_ASM
|
#endif//X86_ASM
|
||||||
|
|||||||
@@ -129,6 +129,13 @@ void InitExpandPictureFunc (void* pL, const uint32_t kuiCPUFlag) {
|
|||||||
pFuncList->pfExpandChromaPicture[1] = ExpandPictureChromaAlign_sse2;
|
pFuncList->pfExpandChromaPicture[1] = ExpandPictureChromaAlign_sse2;
|
||||||
}
|
}
|
||||||
#endif//X86_ASM
|
#endif//X86_ASM
|
||||||
|
#if defined(HAVE_NEON)
  // NEON overrides are compiled only into builds that define HAVE_NEON and
  // are applied only when the CPU flag is set at run time.
  if (kuiCPUFlag & WELS_CPU_NEON) {
    pFuncList->pfExpandLumaPicture      = ExpandPictureLuma_neon;
    pFuncList->pfExpandChromaPicture[0] = ExpandPictureChroma_c;    // slot 0 stays on the C routine
    pFuncList->pfExpandChromaPicture[1] = ExpandPictureChroma_neon;
  }
#endif//HAVE_NEON
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,10 @@
|
|||||||
objects = {
|
objects = {
|
||||||
|
|
||||||
/* Begin PBXBuildFile section */
|
/* Begin PBXBuildFile section */
|
||||||
|
4C34067818C5A4AD00DFA14A /* adaptive_quantization.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */; };
|
||||||
|
4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067518C5A4AD00DFA14A /* down_sample_neon.S */; };
|
||||||
|
4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */; };
|
||||||
|
4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */; };
|
||||||
4CE4443518B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };
|
4CE4443518B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };
|
||||||
4CE4444318B724B60017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4444218B724B60017DF25 /* XCTest.framework */; };
|
4CE4444318B724B60017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4444218B724B60017DF25 /* XCTest.framework */; };
|
||||||
4CE4444418B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };
|
4CE4444418B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };
|
||||||
@@ -56,6 +60,10 @@
|
|||||||
/* End PBXCopyFilesBuildPhase section */
|
/* End PBXCopyFilesBuildPhase section */
|
||||||
|
|
||||||
/* Begin PBXFileReference section */
|
/* Begin PBXFileReference section */
|
||||||
|
4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = adaptive_quantization.S; sourceTree = "<group>"; };
|
||||||
|
4C34067518C5A4AD00DFA14A /* down_sample_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = down_sample_neon.S; sourceTree = "<group>"; };
|
||||||
|
4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_sad_neon.S; sourceTree = "<group>"; };
|
||||||
|
4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = vaa_calc_neon.S; sourceTree = "<group>"; };
|
||||||
4CE4443118B724B60017DF25 /* libprocessing.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libprocessing.a; sourceTree = BUILT_PRODUCTS_DIR; };
|
4CE4443118B724B60017DF25 /* libprocessing.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libprocessing.a; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
4CE4443418B724B60017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
4CE4443418B724B60017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
|
||||||
4CE4444118B724B60017DF25 /* processingTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = processingTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
4CE4444118B724B60017DF25 /* processingTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = processingTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||||
@@ -122,6 +130,17 @@
|
|||||||
/* End PBXFrameworksBuildPhase section */
|
/* End PBXFrameworksBuildPhase section */
|
||||||
|
|
||||||
/* Begin PBXGroup section */
|
/* Begin PBXGroup section */
|
||||||
|
4C34067318C5A4AD00DFA14A /* arm */ = {
|
||||||
|
isa = PBXGroup;
|
||||||
|
children = (
|
||||||
|
4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */,
|
||||||
|
4C34067518C5A4AD00DFA14A /* down_sample_neon.S */,
|
||||||
|
4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */,
|
||||||
|
4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */,
|
||||||
|
);
|
||||||
|
path = arm;
|
||||||
|
sourceTree = "<group>";
|
||||||
|
};
|
||||||
4CE4442818B724B60017DF25 = {
|
4CE4442818B724B60017DF25 = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
@@ -182,6 +201,7 @@
|
|||||||
4CE4475B18BC62960017DF25 /* src */ = {
|
4CE4475B18BC62960017DF25 /* src */ = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
|
4C34067318C5A4AD00DFA14A /* arm */,
|
||||||
4CE4475C18BC62960017DF25 /* adaptivequantization */,
|
4CE4475C18BC62960017DF25 /* adaptivequantization */,
|
||||||
4CE4476318BC62960017DF25 /* backgrounddetection */,
|
4CE4476318BC62960017DF25 /* backgrounddetection */,
|
||||||
4CE4476618BC62960017DF25 /* common */,
|
4CE4476618BC62960017DF25 /* common */,
|
||||||
@@ -372,6 +392,8 @@
|
|||||||
isa = PBXSourcesBuildPhase;
|
isa = PBXSourcesBuildPhase;
|
||||||
buildActionMask = 2147483647;
|
buildActionMask = 2147483647;
|
||||||
files = (
|
files = (
|
||||||
|
4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */,
|
||||||
|
4C34067818C5A4AD00DFA14A /* adaptive_quantization.S in Sources */,
|
||||||
4CE4479718BC62960017DF25 /* downsample.cpp in Sources */,
|
4CE4479718BC62960017DF25 /* downsample.cpp in Sources */,
|
||||||
4CE4478B18BC62960017DF25 /* AdaptiveQuantization.cpp in Sources */,
|
4CE4478B18BC62960017DF25 /* AdaptiveQuantization.cpp in Sources */,
|
||||||
4CE4479918BC62960017DF25 /* imagerotate.cpp in Sources */,
|
4CE4479918BC62960017DF25 /* imagerotate.cpp in Sources */,
|
||||||
@@ -380,6 +402,7 @@
|
|||||||
4CE4479A18BC62960017DF25 /* imagerotatefuncs.cpp in Sources */,
|
4CE4479A18BC62960017DF25 /* imagerotatefuncs.cpp in Sources */,
|
||||||
4CE4479518BC62960017DF25 /* denoise.cpp in Sources */,
|
4CE4479518BC62960017DF25 /* denoise.cpp in Sources */,
|
||||||
4CE4479218BC62960017DF25 /* WelsFrameWork.cpp in Sources */,
|
4CE4479218BC62960017DF25 /* WelsFrameWork.cpp in Sources */,
|
||||||
|
4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */,
|
||||||
4CE4479B18BC62960017DF25 /* SceneChangeDetection.cpp in Sources */,
|
4CE4479B18BC62960017DF25 /* SceneChangeDetection.cpp in Sources */,
|
||||||
4CE4479D18BC62960017DF25 /* vaacalcfuncs.cpp in Sources */,
|
4CE4479D18BC62960017DF25 /* vaacalcfuncs.cpp in Sources */,
|
||||||
4CE4479818BC62960017DF25 /* downsamplefuncs.cpp in Sources */,
|
4CE4479818BC62960017DF25 /* downsamplefuncs.cpp in Sources */,
|
||||||
@@ -387,6 +410,7 @@
|
|||||||
4CE4479418BC62960017DF25 /* ComplexityAnalysis.cpp in Sources */,
|
4CE4479418BC62960017DF25 /* ComplexityAnalysis.cpp in Sources */,
|
||||||
4CE4479E18BC62960017DF25 /* vaacalculation.cpp in Sources */,
|
4CE4479E18BC62960017DF25 /* vaacalculation.cpp in Sources */,
|
||||||
4CE4479118BC62960017DF25 /* thread.cpp in Sources */,
|
4CE4479118BC62960017DF25 /* thread.cpp in Sources */,
|
||||||
|
4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */,
|
||||||
4CE4478F18BC62960017DF25 /* BackgroundDetection.cpp in Sources */,
|
4CE4478F18BC62960017DF25 /* BackgroundDetection.cpp in Sources */,
|
||||||
4CE4479618BC62960017DF25 /* denoise_filter.cpp in Sources */,
|
4CE4479618BC62960017DF25 /* denoise_filter.cpp in Sources */,
|
||||||
);
|
);
|
||||||
@@ -502,6 +526,11 @@
|
|||||||
DSTROOT = /tmp/processing.dst;
|
DSTROOT = /tmp/processing.dst;
|
||||||
GCC_C_LANGUAGE_STANDARD = "compiler-default";
|
GCC_C_LANGUAGE_STANDARD = "compiler-default";
|
||||||
GCC_OPTIMIZATION_LEVEL = 3;
|
GCC_OPTIMIZATION_LEVEL = 3;
|
||||||
|
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*]" = (
|
||||||
|
APPLE_IOS,
|
||||||
|
HAVE_NEON,
|
||||||
|
);
|
||||||
|
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphonesimulator*]" = APPLE_IOS;
|
||||||
HEADER_SEARCH_PATHS = (
|
HEADER_SEARCH_PATHS = (
|
||||||
/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include,
|
/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include,
|
||||||
"\"$(SRCROOT)/../../../common\"",
|
"\"$(SRCROOT)/../../../common\"",
|
||||||
@@ -526,6 +555,11 @@
|
|||||||
CODE_SIGN_IDENTITY = "iPhone Developer";
|
CODE_SIGN_IDENTITY = "iPhone Developer";
|
||||||
DSTROOT = /tmp/processing.dst;
|
DSTROOT = /tmp/processing.dst;
|
||||||
GCC_C_LANGUAGE_STANDARD = "compiler-default";
|
GCC_C_LANGUAGE_STANDARD = "compiler-default";
|
||||||
|
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*]" = (
|
||||||
|
APPLE_IOS,
|
||||||
|
HAVE_NEON,
|
||||||
|
);
|
||||||
|
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphonesimulator*]" = APPLE_IOS;
|
||||||
HEADER_SEARCH_PATHS = (
|
HEADER_SEARCH_PATHS = (
|
||||||
/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include,
|
/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include,
|
||||||
"\"$(SRCROOT)/../../../common\"",
|
"\"$(SRCROOT)/../../../common\"",
|
||||||
|
|||||||
BIN
codec/processing/src/arm/.DS_Store
vendored
Normal file
BIN
codec/processing/src/arm/.DS_Store
vendored
Normal file
Binary file not shown.
120
codec/processing/src/arm/adaptive_quantization.S
Executable file
120
codec/processing/src/arm/adaptive_quantization.S
Executable file
@@ -0,0 +1,120 @@
|
|||||||
|
/*!
|
||||||
|
* \copy
|
||||||
|
* Copyright (c) 2013, Cisco Systems
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
.text
|
||||||
|
#include "arm_arch_common_macro.S"
|
||||||
|
|
||||||
|
// SQR_ADD_16BYTES <d_lo>, <d_hi>, <q_acc>
// Squares the sixteen unsigned bytes held in the two D registers <d_lo> and
// <d_hi> and adds the squares, pairwise widened to 32 bits, into the four
// u32 lanes of <q_acc>.
// Clobbers: q3, q8.
#ifdef APPLE_IOS
// Apple assembler flavour: arguments are positional ($0, $1, $2).
.macro SQR_ADD_16BYTES
    vmull.u8    q3, $0, $0
    vmull.u8    q8, $1, $1
    vpadal.u16  $2, q3
    vpadal.u16  $2, q8
.endm
#else
// GNU assembler flavour: arguments are named.
.macro SQR_ADD_16BYTES d_lo, d_hi, q_acc
    vmull.u8    q3, \d_lo, \d_lo
    vmull.u8    q8, \d_hi, \d_hi
    vpadal.u16  \q_acc, q3
    vpadal.u16  \q_acc, q8
.endm
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// pixel_var_16x16_neon
// Walks 16 rows of 16 bytes from two pixel blocks and stores two 16-bit
// values of the form (sum of squares >> 8) - (sum >> 8)^2:
//   result[0] is computed over the absolute differences |src - ref|
//   result[1] is computed over the src pixels themselves
// In:    r0 = ref pointer, r1 = ref stride, r2 = src pointer, r3 = src stride
//        first stack argument = pointer receiving the two 16-bit results
// Clobb: q0-q3, q8-q15, flags (r4 is saved and restored)
WELS_ASM_FUNC_BEGIN pixel_var_16x16_neon
    stmfd       sp!, {r4}

    // Row 0 is handled outside the loop so that it initialises the accumulators.
    vld1.8      {q15}, [r0], r1             // 16 ref bytes
    vld1.8      {q14}, [r2], r3             // 16 src bytes

    // Statistics of the src pixels.
    vaddl.u8    q10, d28, d29               // q10 = sum_cur (8 x u16)
    vmull.u8    q9, d29, d29
    vmull.u8    q8, d28, d28
    vaddl.u16   q9, d18, d19
    vpadal.u16  q9, q8                      // q9  = sqr_cur (4 x u32)

    // Statistics of the absolute differences.
    vabd.u8     q13, q14, q15
    vmull.u8    q12, d27, d27
    vmull.u8    q11, d26, d26
    vaddl.u16   q12, d24, d25
    vpadal.u16  q12, q11                    // q12 = sqr (4 x u32)
    vaddl.u8    q13, d26, d27               // q13 = sum (8 x u16), replaces the diffs

    mov         r4, #15                     // 15 rows left
pixel_var_16x16_row_loop:
    vld1.8      {q0}, [r0], r1              // 16 ref bytes
    vld1.8      {q1}, [r2], r3              // 16 src bytes

    vpadal.u8   q10, q1                     // sum_cur += src
    SQR_ADD_16BYTES d2, d3, q9              // sqr_cur += src^2

    vabd.u8     q2, q0, q1
    vpadal.u8   q13, q2                     // sum += |src - ref|
    SQR_ADD_16BYTES d4, d5, q12             // sqr += |src - ref|^2

    subs        r4, r4, #1
    bne         pixel_var_16x16_row_loop

    ldr         r4, [sp, #4]                // result pointer (first stack argument)

    // Horizontal reduction: d0 = {sum, sum_cur}, d1 = {sqr, sqr_cur} as u32.
    vadd.u32    d2, d24, d25
    vadd.u32    d3, d18, d19
    vadd.u16    d0, d26, d27
    vadd.u16    d1, d20, d21
    vpaddl.u16  q0, q0
    vpadd.u32   d0, d0, d1
    vpadd.u32   d1, d2, d3

    vshr.u32    q0, q0, #8                  // divide all four totals by 256
    vmul.u32    d0, d0, d0                  // (sum >> 8)^2
    vsub.u32    d0, d1, d0
    vmovl.u32   q0, d0                      // spread the two results over d0 / d1
    vst2.16     {d0[0], d1[0]}, [r4]        // store the low 16 bits of each

    ldmfd       sp!, {r4}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#endif
|
||||||
342
codec/processing/src/arm/down_sample_neon.S
Executable file
342
codec/processing/src/arm/down_sample_neon.S
Executable file
@@ -0,0 +1,342 @@
|
|||||||
|
/*!
|
||||||
|
* \copy
|
||||||
|
* Copyright (c) 2013, Cisco Systems
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
.text
|
||||||
|
#include "arm_arch_common_macro.S"
|
||||||
|
|
||||||
|
|
||||||
|
// comp_ds_bilinear_neon
// 2:1 bilinear downsample in both directions. Each pass consumes 32 bytes
// from two adjacent src rows and writes 16 dst bytes.
// In:    r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//        stack: src_width, src_height
// Clobb: q0-q3, q15, flags (r4-r8 and lr are saved and restored)
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_neon
    stmfd       sp!, {r4-r8, lr}

    ldr         r4, [sp, #24]               // src_width
    ldr         r5, [sp, #28]               // src_height

    mov         r8, r0                      // r8 = start of the current dst row
    mov         r6, r2                      // r6 = start of the current src row pair
    mov         lr, #0                      // lr = src bytes consumed in this row
    lsr         r5, r5, #1                  // r5 = number of dst rows

    // Keep a copy of the 16 bytes at dst + dst_stride * rows; they are written
    // back at the end, at the address r0 holds once every row has been processed.
    mla         r7, r1, r5, r0
    vld1.32     {q15}, [r7]

    add         r7, r2, r3                  // r7 = second src row of the pair

comp_ds_bilinear_block_loop:
    vld1.8      {q0, q1}, [r2]!
    vld1.8      {q2, q3}, [r7]!

    // First 16 src columns: horizontal rounded average, then vertical.
    vpaddl.u8   q0, q0
    vpaddl.u8   q2, q2
    vrshr.u16   q0, q0, #1
    vrshr.u16   q2, q2, #1
    vrhadd.u16  q0, q0, q2

    // Next 16 src columns.
    vpaddl.u8   q1, q1
    vpaddl.u8   q3, q3
    vrshr.u16   q1, q1, #1
    vrshr.u16   q3, q3, #1
    vrhadd.u16  q1, q1, q3

    vmovn.u16   d0, q0
    vmovn.u16   d1, q1
    vst1.32     {q0}, [r0]!

    add         lr, lr, #32
    cmp         lr, r4
    // Row finished (lr >= src_width, carry set): step to the next row pair.
    // Otherwise carry is clear, Z is clear as well, and the branch is taken.
    movcs       lr, #0
    addcs       r8, r8, r1
    movcs       r0, r8
    addcs       r6, r6, r3, lsl #1
    movcs       r2, r6
    addcs       r7, r2, r3
    subscs      r5, r5, #1
    bne         comp_ds_bilinear_block_loop

    // Put back the 16 bytes saved before the loop.
    vst1.32     {q15}, [r0]

    ldmfd       sp!, {r4-r8, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// comp_ds_bilinear_w_x8_neon
// 2:1 bilinear downsample in both directions, 8 src columns (4 dst bytes)
// per inner iteration.
// In:    r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//        stack: src_width, src_height
// Clobb: q0, flags (r4-r7 and lr are saved and restored)
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
    stmfd       sp!, {r4-r7, lr}

    ldr         r4, [sp, #20]               // src_width
    ldr         r5, [sp, #24]               // src_height

    sub         r1, r1, r4, lsr #1          // r1 = dst stride - bytes written per row
    sub         lr, r3, r4                  // lr = src stride - bytes read per row
    lsr         r5, r5, #1                  // r5 = number of dst rows

comp_ds_bilinear_w_x8_row_loop:
    add         r7, r2, r3                  // r7 = second src row of the pair
    lsr         r6, r4, #3                  // r6 = 8-column groups in one row

comp_ds_bilinear_w_x8_col_loop:
    vld1.8      {d0}, [r2]!                 // upper row, 8 bytes
    vld1.8      {d1}, [r7]!                 // lower row, 8 bytes
    vpaddl.u8   q0, q0                      // horizontal pair sums
    vrshr.u16   q0, q0, #1                  // rounded horizontal average
    vrhadd.u16  d0, d0, d1                  // rounded vertical average

    vmovn.u16   d0, q0
    subs        r6, r6, #1
    vst1.32     {d0[0]}, [r0]!              // 4 result bytes
    bne         comp_ds_bilinear_w_x8_col_loop

    add         r2, r7, lr                  // row after the lower one
    add         r0, r0, r1
    subs        r5, r5, #1
    bne         comp_ds_bilinear_w_x8_row_loop

    ldmfd       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// comp_ds_bilinear_w_x16_neon
// 2:1 bilinear downsample in both directions, 16 src columns (8 dst bytes)
// per inner iteration.
// In:    r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//        stack: src_width, src_height
// Clobb: q0, q1, flags (r4-r7 and lr are saved and restored)
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
    stmfd       sp!, {r4-r7, lr}

    ldr         r4, [sp, #20]               // src_width
    ldr         r5, [sp, #24]               // src_height

    sub         r1, r1, r4, lsr #1          // r1 = dst stride - bytes written per row
    sub         lr, r3, r4                  // lr = src stride - bytes read per row
    lsr         r5, r5, #1                  // r5 = number of dst rows

comp_ds_bilinear_w_x16_row_loop:
    add         r7, r2, r3                  // r7 = second src row of the pair
    lsr         r6, r4, #4                  // r6 = 16-column groups in one row

comp_ds_bilinear_w_x16_col_loop:
    vld1.8      {q0}, [r2]!                 // upper row, 16 bytes
    vld1.8      {q1}, [r7]!                 // lower row, 16 bytes
    vpaddl.u8   q0, q0
    vrshr.u16   q0, q0, #1                  // rounded horizontal average, upper row
    vpaddl.u8   q1, q1
    vrshr.u16   q1, q1, #1                  // rounded horizontal average, lower row
    vrhadd.u16  q0, q0, q1                  // rounded vertical average

    vmovn.u16   d0, q0
    subs        r6, r6, #1
    vst1.32     {d0}, [r0]!                 // 8 result bytes
    bne         comp_ds_bilinear_w_x16_col_loop

    add         r2, r7, lr                  // row after the lower one
    add         r0, r0, r1
    subs        r5, r5, #1
    bne         comp_ds_bilinear_w_x16_row_loop

    ldmfd       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// comp_ds_bilinear_w_x32_neon
// 2:1 bilinear downsample in both directions, 32 src columns (16 dst bytes)
// per inner iteration.
// In:    r0 = dst, r1 = dst stride, r2 = src, r3 = src stride
//        stack: src_width, src_height
// Clobb: q0-q3, flags (r4-r7 and lr are saved and restored)
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x32_neon
    stmfd       sp!, {r4-r7, lr}

    ldr         r4, [sp, #20]               // src_width
    ldr         r5, [sp, #24]               // src_height

    sub         r1, r1, r4, lsr #1          // r1 = dst stride - bytes written per row
    sub         lr, r3, r4                  // lr = src stride - bytes read per row
    lsr         r5, r5, #1                  // r5 = number of dst rows

comp_ds_bilinear_w_x32_row_loop:
    add         r7, r2, r3                  // r7 = second src row of the pair
    lsr         r6, r4, #5                  // r6 = 32-column groups in one row

comp_ds_bilinear_w_x32_col_loop:
    vld1.8      {q0, q1}, [r2]!             // upper row, 32 bytes
    vld1.8      {q2, q3}, [r7]!             // lower row, 32 bytes

    // First 16 src columns.
    vpaddl.u8   q0, q0
    vpaddl.u8   q2, q2
    vrshr.u16   q0, q0, #1
    vrshr.u16   q2, q2, #1
    vrhadd.u16  q0, q0, q2

    // Next 16 src columns.
    vpaddl.u8   q1, q1
    vpaddl.u8   q3, q3
    vrshr.u16   q1, q1, #1
    vrshr.u16   q3, q3, #1
    vrhadd.u16  q1, q1, q3

    vmovn.u16   d0, q0
    vmovn.u16   d1, q1
    subs        r6, r6, #1
    vst1.32     {q0}, [r0]!                 // 16 result bytes
    bne         comp_ds_bilinear_w_x32_col_loop

    add         r2, r7, lr                  // row after the lower one
    add         r0, r0, r1
    subs        r5, r5, #1
    bne         comp_ds_bilinear_w_x32_row_loop

    ldmfd       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
|
||||||
|
// general_ds_bilinear_accurate_neon
// Bilinear downsampler driven by fixed-point steps with 15 fractional bits.
// Every output pixel except those in the last column and the last row is a
// weighted sum of a 2x2 src neighbourhood; the last column of each row and
// the whole last row are copied from a single src pixel.
// In:    r0 = dst, r1 = dst stride, r2 = output columns, r3 = output rows
//        stack: src, src_stride, scaleX, scaleY
// Clobb: d0-d3, d5-d7, d16, d17, q12-q14, flags
//        (r4-r12 and lr are saved and restored)
// Only caller-saved NEON registers are used: the temporaries that build the
// initial v weights live in d16/d17, not in the callee-saved d8/d9.
WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
    stmdb       sp!, {r4-r12, lr}

    //Get the data from stack
    ldr         r4, [sp, #40]               //the addr of src
    ldr         r5, [sp, #44]               //the value of src_stride
    ldr         r6, [sp, #48]               //the value of scaleX
    ldr         r7, [sp, #52]               //the value of scaleY

    mov         r10, #32768
    sub         r10, #1                     // r10 = 32767, the 15-bit fraction mask
    and         r8, r6, r10                 // r8 uinc (scaleX & 32767)
    mov         r11, #-1
    mul         r11, r8                     // r11 -uinc

    vdup.s16    d2, r8
    vdup.s16    d0, r11
    vzip.s16    d0, d2                      // uinc -uinc uinc -uinc (high lane first)

    and         r9, r7, r10                 // r9 vinc (scaleY & 32767)
    mov         r11, #-1
    mul         r11, r9                     // r11 -vinc

    vdup.s16    d2, r9
    vdup.s16    d3, r11
    vext.8      d5, d3, d2, #4              // vinc vinc -vinc -vinc

    mov         r11, #0x40000000
    mov         r12, #0x4000
    sub         r12, #1
    add         r11, r12
    vdup.s32    d1, r11                     //init u 16384 16383 16384 16383

    mov         r11, #16384
    vdup.s16    d16, r11                    // scratch: four lanes of 16384
    sub         r11, #1
    vdup.s16    d17, r11                    // scratch: four lanes of 16383
    vext.8      d7, d17, d16, #4            //init v 16384 16384 16383 16383

    veor        q14, q14
    sub         r1, r2                      // stride - width
    mov         r8, #16384                  // yInverse
    sub         r3, #1                      // the last row is handled separately

_HEIGHT:
    ldr         r4, [sp, #40]               //the addr of src
    mov         r11, r8
    lsr         r11, #15
    mul         r11, r5
    add         r11, r4                     // get current row address
    mov         r12, r11
    add         r12, r5                     // r12 = the row below it

    mov         r9, #16384                  // xInverse
    sub         r10, r2, #1                 // the last column is handled separately
    vmov.s16    d6, d1                      // reset the u weights for this row

_WIDTH:
    mov         lr, r9
    lsr         lr, #15
    add         r4, r11,lr
    vld2.8      {d28[0],d29[0]}, [r4]       //q14: 0000000b0000000a;
    add         r4, r12,lr
    vld2.8      {d28[4],d29[4]}, [r4]       //q14: 000d000b000c000a;
    vzip.32     d28, d29                    //q14: 000d000c000b000a;

    vmull.u16   q13, d6, d7                 //q13: init u * init v
    vmull.u32   q12, d26,d28
    vmlal.u32   q12, d27,d29
    vqadd.u64   d24, d24,d25
    vrshr.u64   d24, #30                    // drop the 30 fractional bits, rounding

    vst1.8      {d24[0]}, [r0]!
    add         r9, r6
    vadd.u16    d6, d0                      // inc u
    vshl.u16    d6, #1                      // shift left then right by one:
    vshr.u16    d6, #1                      // keeps the low 15 bits of each lane
    subs        r10, #1
    bne         _WIDTH

WIDTH_END:
    // Last column of the row: plain copy of one src pixel.
    lsr         r9, #15
    add         r4,r11,r9
    vld1.8      {d24[0]}, [r4]
    vst1.8      {d24[0]}, [r0]
    add         r0, #1
    add         r8, r7
    add         r0, r1                      // skip the padding up to the next dst row
    vadd.s16    d7, d5                      // inc v
    vshl.u16    d7, #1                      // keep the low 15 bits of each lane
    vshr.u16    d7, #1
    subs        r3, #1
    bne         _HEIGHT

LAST_ROW:
    // Last row: plain copy, one src pixel per output column.
    ldr         r4, [sp, #40]               //the addr of src
    lsr         r8, #15
    mul         r8, r5
    add         r4, r8                      // get current row address
    mov         r9, #16384

_LAST_ROW_WIDTH:
    mov         r11, r9
    lsr         r11, #15

    add         r3, r4,r11
    vld1.8      {d0[0]}, [r3]
    vst1.8      {d0[0]}, [r0]
    add         r0, #1
    add         r9, r6
    subs        r2, #1
    bne         _LAST_ROW_WIDTH

    ldmia       sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#endif
|
||||||
68
codec/processing/src/arm/pixel_sad_neon.S
Executable file
68
codec/processing/src/arm/pixel_sad_neon.S
Executable file
@@ -0,0 +1,68 @@
|
|||||||
|
/*!
|
||||||
|
* \copy
|
||||||
|
* Copyright (c) 2013, Cisco Systems
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||||
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||||
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_NEON
|
||||||
|
.text
|
||||||
|
#include "arm_arch_common_macro.S"
|
||||||
|
|
||||||
|
|
||||||
|
// pixel_sad_8x8_neon
// Sum of absolute differences over eight rows of eight bytes each.
// In:    r0 = first block, r1 = its stride, r2 = second block, r3 = its stride
// Out:   r0 = SAD
// Clobb: q0, q1, flags (lr is saved and restored)
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
    stmfd       sp!, {lr}

    vmov.i16    q1, #0                      // q1 = eight u16 column accumulators
    mov         lr, #8                      // rows to process

pixel_sad_8x8_row_loop:
    vld1.8      {d0}, [r0], r1              // one 8-byte row from each block
    vld1.8      {d1}, [r2], r3
    vabal.u8    q1, d0, d1                  // q1 += |d0 - d1|, widened to u16
    subs        lr, lr, #1
    bne         pixel_sad_8x8_row_loop

    // Fold the eight column totals into a single value.
    vadd.u16    d2, d2, d3
    vpaddl.u16  d2, d2
    vpaddl.u32  d2, d2
    vmov.u32    r0, d2[0]

    ldmfd       sp!, {lr}
WELS_ASM_FUNC_END
|
||||||
|
|
||||||
|
#endif
|
||||||
1143
codec/processing/src/arm/vaa_calc_neon.S
Executable file
1143
codec/processing/src/arm/vaa_calc_neon.S
Executable file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user