Add partial ARM assembly code to the encoder.

Author: Licai Guo
Date:   2014-03-04 16:49:34 +08:00
Parent: 7150adc91b
Commit: bb244d736b
28 changed files with 7916 additions and 3 deletions


@@ -7,6 +7,7 @@
objects = {
/* Begin PBXBuildFile section */
4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture.S */; };
4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; };
4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
@@ -46,6 +47,7 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
4C34067C18C5C94C00DFA14A /* expand_picture.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture.S; sourceTree = "<group>"; };
4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -148,6 +150,7 @@
4CE4472F18BC61650017DF25 /* common */ = {
isa = PBXGroup;
children = (
4C34067C18C5C94C00DFA14A /* expand_picture.S */,
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
4CE447BC18C085320017DF25 /* deblocking_neon.S */,
4CE4473118BC61650017DF25 /* cpu.cpp */,
@@ -257,6 +260,7 @@
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */,
4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,


@@ -7,6 +7,12 @@
objects = {
/* Begin PBXBuildFile section */
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066818C57D0400DFA14A /* mc_neon.S */; };
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; };
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; };
4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
4CE4432318B6FFA00017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4432218B6FFA00017DF25 /* XCTest.framework */; };
4CE4432418B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
@@ -71,6 +77,12 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
4C34066818C57D0400DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -189,6 +201,19 @@
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
4C34066418C57D0400DFA14A /* arm */ = {
isa = PBXGroup;
children = (
4C34066618C57D0400DFA14A /* intra_pred_neon.S */,
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */,
4C34066818C57D0400DFA14A /* mc_neon.S */,
4C34066918C57D0400DFA14A /* memory_neon.S */,
4C34066A18C57D0400DFA14A /* pixel_neon.S */,
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */,
);
path = arm;
sourceTree = "<group>";
};
4CE4430818B6FFA00017DF25 = {
isa = PBXGroup;
children = (
@@ -249,6 +274,7 @@
4CE446A118BC605B0017DF25 /* core */ = {
isa = PBXGroup;
children = (
4C34066418C57D0400DFA14A /* arm */,
4CE446A918BC605C0017DF25 /* inc */,
4CE446DC18BC605C0017DF25 /* src */,
);
@@ -466,14 +492,18 @@
4CE4472A18BC605C0017DF25 /* utils.cpp in Sources */,
4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,
4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,
4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */,
4CE4472D18BC605C0017DF25 /* welsCodecTrace.cpp in Sources */,
4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */,
4CE4471F18BC605C0017DF25 /* ref_list_mgr_svc.cpp in Sources */,
4CE4472218BC605C0017DF25 /* slice_multi_threading.cpp in Sources */,
4CE4471518BC605C0017DF25 /* expand_pic.cpp in Sources */,
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */,
4CE4470F18BC605C0017DF25 /* deblocking.cpp in Sources */,
4CE4472518BC605C0017DF25 /* svc_encode_mb.cpp in Sources */,
4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */,
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */,
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
4CE4471718BC605C0017DF25 /* mc.cpp in Sources */,
@@ -484,12 +514,14 @@
4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */,
4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */,
4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */,
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */,
4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */,
4CE4472618BC605C0017DF25 /* svc_encode_slice.cpp in Sources */,
4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */,
4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */,
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */,
4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */,
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};


@@ -809,4 +809,232 @@ WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
vst1.64 {d0-d2}, [r0]
WELS_ASM_FUNC_END
#ifdef APPLE_IOS
.macro BS_NZC_CHECK
vld1.8 {d0,d1}, [$0]
/* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_nzc_check_jump0
sub r6, $0, $2, lsl #4
sub r6, $2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
vadd.u8 $3, q0, q1
/* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_nzc_check_jump1
sub r6, $0, #21
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 $4, q0, q1
.endm
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
mov r6, #4
vabd.s16 q5, $0, $1
vabd.s16 q6, $1, $2
vdup.s16 $0, r6
vabd.s16 q7, $2, $3
vabd.s16 q8, $3, $4
vcge.s16 q5, $0
vcge.s16 q6, $0
vcge.s16 q7, $0
vcge.s16 q8, $0
vpadd.i16 d10, d10, d11
vpadd.i16 d11, d12, d13
vpadd.i16 d12, d14, d15
vpadd.i16 d13, d16, d17
vaddhn.i16 $5, q5, q5
vaddhn.i16 $6, q6, q6
.endm
.macro BS_MV_CHECK
vldm $0, {q0,q1,q2,q3}
/* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_mv_check_jump0
sub r6, $0, $2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
/* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_mv_check_jump1
sub r6, $0, #52
add r7, r6, #16
vld1.32 d8[0], [r6]
add r6, r7, #16
vld1.32 d8[1], [r7]
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
.endm
#else
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
vld1.8 {d0,d1}, [\arg0]
/* Arrange the input data --- TOP */
ands r6, \arg1, #2
beq bs_nzc_check_jump0
sub r6, \arg0, \arg2, lsl #4
sub r6, \arg2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
vadd.u8 \arg3, q0, q1
/* Arrange the input data --- LEFT */
ands r6, \arg1, #1
beq bs_nzc_check_jump1
sub r6, \arg0, #21
add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
bs_nzc_check_jump1:
vzip.8 d0, d1
vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 \arg4, q0, q1
.endm
.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6 //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
mov r6, #4
vabd.s16 q5, \arg0, \arg1
vabd.s16 q6, \arg1, \arg2
vdup.s16 \arg0, r6
vabd.s16 q7, \arg2, \arg3
vabd.s16 q8, \arg3, \arg4
vcge.s16 q5, \arg0
vcge.s16 q6, \arg0
vcge.s16 q7, \arg0
vcge.s16 q8, \arg0
vpadd.i16 d10, d10, d11
vpadd.i16 d11, d12, d13
vpadd.i16 d12, d14, d15
vpadd.i16 d13, d16, d17
vaddhn.i16 \arg5, q5, q5
vaddhn.i16 \arg6, q6, q6
.endm
.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
vldm \arg0, {q0,q1,q2,q3}
/* Arrange the input data --- TOP */
ands r6, \arg1, #2
beq bs_mv_check_jump0
sub r6, \arg0, \arg2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, \arg3, \arg4
/* Arrange the input data --- LEFT */
ands r6, \arg1, #1
beq bs_mv_check_jump1
sub r6, \arg0, #52
add r7, r6, #16
vld1.32 d8[0], [r6]
add r6, r7, #16
vld1.32 d8[1], [r7]
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
vzip.32 q0, q1
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6
.endm
#endif
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
stmdb sp!, {r5-r7}
ldr r5, [sp, #12] //Save BS to r5
/* Checking the nzc status */
BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
/* For checking bS[I] = 2 */
mov r6, #2
vcgt.s8 q14, q14, #0
vdup.u8 q0, r6
vcgt.s8 q15, q15, #0
vand.u8 q14, q14, q0 //q14 holds the nzc check result --- top direction
vand.u8 q15, q15, q0 //q15 holds the nzc check result --- left direction
/* Checking the mv status*/
BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
/* For checking bS[I] = 1 */
mov r6, #1
vdup.u8 q0, r6
vand.u8 q12, q12, q0 //q12 holds the mv check result --- top direction
vand.u8 q13, q13, q0 //q13 holds the mv check result --- left direction
/* Check bS[I] is '1' or '2' */
vmax.u8 q1, q12, q14
vmax.u8 q0, q13, q15
//vstm r5, {q0, q1}
vst1.32 {q0, q1}, [r5]
ldmia sp!, {r5-r7}
WELS_ASM_FUNC_END
#endif
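
For reference, the boundary-strength rule that DeblockingBSCalcEnc_neon vectorizes reduces to a simple scalar decision: bS is 2 on an edge where either side has non-zero coefficients, otherwise 1 where the motion vectors differ by at least 4 quarter-pel units in any component (the vcge.s16 #4 compare above), otherwise 0. The C below is an illustrative reduction under those assumptions, not the project's actual C fallback; the function and parameter names are hypothetical.

#include <stdlib.h>
#include <stdint.h>

/* Hedged sketch: per-edge bS decision mirrored by the NEON code above */
static uint8_t BsForEdge (uint8_t uiNnzA, uint8_t uiNnzB,
                          const int16_t pMvA[2], const int16_t pMvB[2]) {
  if (uiNnzA || uiNnzB)                 /* non-zero coefficients -> bS = 2 */
    return 2;
  if (abs (pMvA[0] - pMvB[0]) >= 4 ||
      abs (pMvA[1] - pMvB[1]) >= 4)     /* large mv difference -> bS = 1 */
    return 1;
  return 0;                             /* otherwise no filtering */
}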

codec/common/expand_picture.S (new executable file, 137 lines)

@@ -0,0 +1,137 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
stmdb sp!, {r4-r8}
//Save the dst
mov r7, r0
mov r8, r3
add r4, r7, r2
sub r4, #1
//For the left and right expand
_expand_picture_luma_loop2:
sub r5, r7, #32
add r6, r4, #1
vld1.8 {d0[], d1[]}, [r7], r1
vld1.8 {d2[], d3[]}, [r4], r1
vst1.8 {q0}, [r5]!
vst1.8 {q0}, [r5]
vst1.8 {q1}, [r6]!
vst1.8 {q1}, [r6]
subs r8, #1
bne _expand_picture_luma_loop2
//for the top and bottom expand
add r2, #64
sub r0, #32
mla r4, r1, r3, r0
sub r4, r1
_expand_picture_luma_loop0:
mov r5, #32
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
mov r8, #32
_expand_picture_luma_loop1:
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_luma_loop1
subs r2, #16
bne _expand_picture_luma_loop0
//vldreq.32 d0, [r0]
ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
stmdb sp!, {r4-r8}
//Save the dst
mov r7, r0
mov r8, r3
add r4, r7, r2
sub r4, #1
//For the left and right expand
_expand_picture_chroma_loop2:
sub r5, r7, #16
add r6, r4, #1
vld1.8 {d0[], d1[]}, [r7], r1
vld1.8 {d2[], d3[]}, [r4], r1
vst1.8 {q0}, [r5]
vst1.8 {q1}, [r6]
subs r8, #1
bne _expand_picture_chroma_loop2
//for the top and bottom expand
add r2, #32
sub r0, #16
mla r4, r1, r3, r0
sub r4, r1
_expand_picture_chroma_loop0:
mov r5, #16
mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
mov r8, #16
_expand_picture_chroma_loop1:
vst1.8 {q0}, [r5], r1
vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_chroma_loop1
subs r2, #16
bne _expand_picture_chroma_loop0
//vldreq.32 d0, [r0]
ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
#endif
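
The two routines above pad a reconstructed picture so motion search can read outside the frame: edge pixels are replicated 32 columns/rows outward for luma and 16 for chroma. A scalar sketch of the same operation, assuming kiStride leaves room for the padding, might look like this (illustrative only; the function name is hypothetical):

#include <string.h>
#include <stdint.h>

/* Hedged scalar equivalent of ExpandPictureLuma_neon / ExpandPictureChroma_neon;
 * kiPadding is 32 for luma and 16 for chroma in the assembly above. */
static void ExpandPictureC (uint8_t* pDst, int32_t kiStride,
                            int32_t kiPicW, int32_t kiPicH, int32_t kiPadding) {
  for (int32_t y = 0; y < kiPicH; ++y) {        /* left/right replication */
    uint8_t* pRow = pDst + y * kiStride;
    memset (pRow - kiPadding, pRow[0], kiPadding);
    memset (pRow + kiPicW, pRow[kiPicW - 1], kiPadding);
  }
  const int32_t kiWidth = kiPicW + 2 * kiPadding;
  uint8_t* pTop = pDst - kiPadding;             /* widened first row */
  uint8_t* pBottom = pDst + (kiPicH - 1) * kiStride - kiPadding;
  for (int32_t y = 1; y <= kiPadding; ++y) {    /* top/bottom row copies */
    memcpy (pTop - y * kiStride, pTop, kiWidth);
    memcpy (pBottom + y * kiStride, pBottom, kiWidth);
  }
}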


@@ -61,6 +61,10 @@ void ExpandPictureChromaUnalign_sse2 (uint8_t* pDst,
const int32_t kiPicH);
#endif//X86_ASM
#if defined(HAVE_NEON)
void ExpandPictureLuma_neon (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
void ExpandPictureChroma_neon (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus
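
Declarations like these are typically wired up behind a run-time CPU check rather than called directly. A hedged sketch of that hookup; the flag name WELS_CPU_NEON, the struct layout, and the scalar fallback names are assumptions for illustration:

#include <stdint.h>

typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride,
                                    const int32_t kiPicW, const int32_t kiPicH);

typedef struct {
  PExpandPictureFunc pfExpandLumaPicture;
  PExpandPictureFunc pfExpandChromaPicture;
} SExpandPicFuncs;

/* hypothetical scalar fallbacks, assumed to exist elsewhere in the codebase */
extern void ExpandPictureLumaC (uint8_t*, const int32_t, const int32_t, const int32_t);
extern void ExpandPictureChromaC (uint8_t*, const int32_t, const int32_t, const int32_t);

static void InitExpandPictureFuncs (SExpandPicFuncs* pFuncs, uint32_t uiCpuFlag) {
  pFuncs->pfExpandLumaPicture   = ExpandPictureLumaC;
  pFuncs->pfExpandChromaPicture = ExpandPictureChromaC;
#if defined(HAVE_NEON)
  if (uiCpuFlag & WELS_CPU_NEON) {      /* NEON detected at run time */
    pFuncs->pfExpandLumaPicture   = ExpandPictureLuma_neon;
    pFuncs->pfExpandChromaPicture = ExpandPictureChroma_neon;
  }
#endif
}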

codec/encoder/core/arm/.DS_Store (new binary file, not shown)


@@ -0,0 +1,648 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
//Global macro
.macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endm
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
WELS_ASM_FUNC_BEGIN enc_get_i16x16_luma_pred_v_neon
//Get the top line data to 'q0'
sub r3, r1, r2
vldm r3, {d0, d1}
//mov r2, #16
mov r3, #4
//Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i16x16_luma_pred_h_neon
//stmdb sp!, {r4, lr}
sub r1, r1, #1
mov r3, #4
loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r1], r2
vld1.8 {d2[],d3[]}, [r1], r2
vld1.8 {d4[],d5[]}, [r1], r2
vld1.8 {d6[],d7[]}, [r1], r2
//Fill each row of the MB with the corresponding left-side byte
vst1.8 {d0,d1}, [r0]!
//add r0, #16
vst1.8 {d2,d3}, [r0]!
//add r0, #16
vst1.8 {d4,d5}, [r0]!
//add r0, #16
vst1.8 {d6,d7}, [r0]!
//add r0, #16
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i16x16_luma_pred_dc_both_neon
//stmdb sp!, { r2-r5, lr}
//Get the left vertical line data
sub r3, r1, #1
GET_8BYTE_DATA d0, r3, r2
GET_8BYTE_DATA d1, r3, r2
//Get the top horizontal line data
sub r3, r1, r2
vldm r3, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Fill the whole MB with the mean value
mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
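Numerically, the DC predictor computed above is just the rounded mean of the 32 neighbouring pixels, replicated over the macroblock; the vrshr #5 performs the (sum + 16) >> 5 rounding. A scalar sketch under those assumptions (illustrative names):

#include <string.h>
#include <stdint.h>

/* Hedged scalar sketch of enc_get_i16x16_luma_pred_dc_both_neon:
 * rounded mean of the 16 top and 16 left neighbours, replicated
 * over the 256-byte prediction buffer. */
static void I16x16LumaPredDcC (uint8_t* pPred, const uint8_t* pRef,
                               int32_t kiStride) {
  int32_t iSum = 0;
  for (int32_t i = 0; i < 16; ++i) {
    iSum += pRef[-kiStride + i];      /* top row     */
    iSum += pRef[-1 + i * kiStride];  /* left column */
  }
  memset (pPred, (iSum + 16) >> 5, 256);
}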
//The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
WELS_ASM_FUNC_BEGIN enc_get_i16x16_luma_pred_plane_neon
//stmdb sp!, { r4, lr}
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r3]
//Pack the top[-1] ~ top[6] to d1
sub r3, r1, r2
sub r1, r3, #1
vld1.8 d1, [r1]
//Pack the top[8] ~ top[15] to d2
add r1, #9
vld1.8 d2, [r1]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4
sub r1, r3, #1
GET_8BYTE_DATA d4, r1, r2
//Get and pack left[8] ~ left[15] to d3
add r1, r2
GET_8BYTE_DATA d3, r1, r2
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//reverse the byte order of d2, d3
vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r3]
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q5, q1, #3
vadd.s16 q5, q3
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
//Set the line of MB
vst1.u32 {d0,d1}, [r0]!
//Do the same processing for setting other lines
mov r3, #15
loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2
vadd.s16 q5, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
vst1.u32 {d0,d1}, [r0]!
subs r3, #1
bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
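The routine above implements the standard H.264 16x16 plane predictor; the (8,7,...,1)*5 constant table folds the spec's "(5*H + 32) >> 6" scaling into a single dot product. A scalar sketch of the same computation, with illustrative names and an assumed clipping helper:

#include <stdint.h>

static uint8_t WelsClip255 (int32_t iX) {
  return (uint8_t) (iX < 0 ? 0 : (iX > 255 ? 255 : iX));
}

/* Hedged scalar sketch of enc_get_i16x16_luma_pred_plane_neon */
static void I16x16LumaPredPlaneC (uint8_t* pPred, const uint8_t* pRef,
                                  int32_t kiStride) {
  const uint8_t* pTop = pRef - kiStride;
  int32_t iH = 0, iV = 0;
  for (int32_t i = 1; i <= 8; ++i) {   /* weighted border gradients */
    iH += i * (pTop[7 + i] - pTop[7 - i]);
    iV += i * (pRef[-1 + (7 + i) * kiStride] - pRef[-1 + (7 - i) * kiStride]);
  }
  int32_t iA = 16 * (pTop[15] + pRef[-1 + 15 * kiStride]);
  int32_t iB = (5 * iH + 32) >> 6;
  int32_t iC = (5 * iV + 32) >> 6;
  for (int32_t y = 0; y < 16; ++y)
    for (int32_t x = 0; x < 16; ++x)
      pPred[y * 16 + x] = WelsClip255 ((iA + iB * (x - 7) + iC * (y - 7) + 16) >> 5);
}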
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_v_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
ldr r3, [r3]
//Set the luma MB using top line
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_h_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3]
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0]!
vst1.32 {d1[0]}, [r0]!
vst1.32 {d2[0]}, [r0]!
vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_d_l_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data(8 bytes)
sub r3, r1, r2
vld1.32 {d0}, [r3]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0]!
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0]!
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0]!
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_d_r_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
//Load the left column (5 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3], r2
vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_v_l_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (8 bytes)
sub r3, r1, r2
vld1.32 {d0}, [r3]
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]!
vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_v_r_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
//Load the left column (4 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3]
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0]!
vst1.32 d1[1], [r0]!
//add r2, r0, r1
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r0]!
vst1.16 d1[2], [r0]!
vst1.8 d1[6], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_h_u_neon
//stmdb sp!, { r4, lr}
//Load the left column data
sub r3, r1, #1
mov r1, #3
mul r1, r2
add r1, r3
vld1.8 {d0[]}, [r1]
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0]!
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0]!
vst1.32 d1[1], [r0]!
vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i4x4_luma_pred_h_d_neon
//stmdb sp!, { r2-r5, lr}
//Load the data
sub r3, r1, r2
sub r3, #1
vld1.32 {d0[1]}, [r3], r2
vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6
vst2.16 {d2[3], d3[3]}, [r0]!
vst2.16 {d0[2], d1[2]}, [r0]!
vmov d3, d0
vst2.16 {d2[2], d3[2]}, [r0]!
vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i_chroma_pred_v_neon
//stmdb sp!, { r2-r5, lr}
//Get the top row (8 bytes)
sub r3, r1, r2
vldr d0, [r3]
//Set the chroma MB using top row data
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i_chroma_pred_h_neon
//stmdb sp!, { r2-r5, lr}
//Get the left column (8 bytes)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
vld1.8 {d1[]}, [r3], r2
vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3], r2
vld1.8 {d4[]}, [r3], r2
vld1.8 {d5[]}, [r3], r2
vld1.8 {d6[]}, [r3], r2
vld1.8 {d7[]}, [r3]
//Set the chroma MB using left column data
vst1.8 {d0}, [r0]!
vst1.8 {d1}, [r0]!
vst1.8 {d2}, [r0]!
vst1.8 {d3}, [r0]!
vst1.8 {d4}, [r0]!
vst1.8 {d5}, [r0]!
vst1.8 {d6}, [r0]!
vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN enc_get_i_chroma_pred_dc_both_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data (8 bytes)
sub r3, r1, #1
GET_8BYTE_DATA d0, r3, r2
//Load the top row data (8 bytes)
sub r3, r1, r2
vldr d1, [r3]
//Calculate the sum of left column and top row
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vdup.8 d4, d2[0]
vdup.8 d5, d1[4]
vdup.8 d6, d0[4]
vdup.8 d7, d2[4]
//Set the chroma MB
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]
WELS_ASM_FUNC_END
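The m1..m4 values above are the four quadrant DC predictors of the H.264 8x8 chroma DC mode: the quadrants touching both borders average top and left, while the top-right and bottom-left quadrants use only their own border. A scalar sketch, with illustrative names and pPred assumed to be the packed 8x8 buffer the assembly writes:

#include <string.h>
#include <stdint.h>

/* Hedged scalar sketch of enc_get_i_chroma_pred_dc_both_neon */
static void IChromaPredDcC (uint8_t* pPred, const uint8_t* pRef,
                            int32_t kiStride) {
  int32_t iT0 = 0, iT1 = 0, iL0 = 0, iL1 = 0;
  for (int32_t i = 0; i < 4; ++i) {
    iT0 += pRef[-kiStride + i];       iT1 += pRef[-kiStride + 4 + i];
    iL0 += pRef[-1 + i * kiStride];   iL1 += pRef[-1 + (4 + i) * kiStride];
  }
  uint8_t uiTl = (uint8_t) ((iT0 + iL0 + 4) >> 3); /* top-left: top + left   */
  uint8_t uiTr = (uint8_t) ((iT1 + 2) >> 2);       /* top-right: top only    */
  uint8_t uiBl = (uint8_t) ((iL1 + 2) >> 2);       /* bottom-left: left only */
  uint8_t uiBr = (uint8_t) ((iT1 + iL1 + 4) >> 3); /* bottom-right: both     */
  for (int32_t y = 0; y < 8; ++y) {
    memset (pPred + y * 8,     y < 4 ? uiTl : uiBl, 4);
    memset (pPred + y * 8 + 4, y < 4 ? uiTr : uiBr, 4);
  }
}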
//Table {{1,2,3,4,1,2,3,4}*17}
CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x28231e19
//Table {-3,-2,-1,0,1,2,3,4}
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
WELS_ASM_FUNC_BEGIN enc_get_i_chroma_pred_plane_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row data
sub r3, r1, #1
sub r3, r2
vld1.32 {d1[0]}, [r3]
add r3, #5
vld1.32 {d0[0]}, [r3]
//Load the left column data
sub r3, #5
vld1.8 {d1[4]}, [r3], r2
vld1.8 {d1[5]}, [r3], r2
vld1.8 {d1[6]}, [r3], r2
vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r3, r2
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2
vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r3]
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2
vmul.s16 q0, q1
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r3]
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB
vst1.u32 {d0}, [r0]!
//Do the same processing for each line.
mov r3, #7
loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0]!
subs r3, #1
bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
#endif


@@ -0,0 +1,793 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
//Gather a strided column of 8 bytes into one D register
.macro GET_8BYTE_DATA_L0
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endm
.macro HDM_TRANSFORM_4X4_L0
//Do the vertical transform
vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
vswp d1, d2
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
vtrn.32 q2, q1
vadd.s16 q0, q2, q1
vsub.s16 q1, q2, q1
vtrn.16 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vmov.s16 d0, d4
vmov.s16 d1, d2
vabs.s16 d3, d3
//16x16_v
vtrn.32 d0, d1 //{0,1,3,2}
vaba.s16 $5, d0, $2 //16x16_v
vaba.s16 $5, d1, $8
vaba.s16 $5, d5, $8
vadd.u16 $5, d3
//16x16_h
vtrn.16 d4, d5 //{0,4,12,8}
vaba.s16 $6, d4, $3 //16x16_h
vabs.s16 d2, d2
vabs.s16 d5, d5
vadd.u16 d2, d3
vadd.u16 d2, d5
vadd.u16 $6, d2
//16x16_dc_both
vaba.s16 $7, d4, $4 //16x16_dc_both
vadd.u16 $7, d2
.endm
#else
//Gather a strided column of 8 bytes into one D register
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2,arg3, arg4, arg5, arg6, arg7, arg8
//Do the vertical transform
vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
vswp d1, d2
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
vtrn.32 q2, q1
vadd.s16 q0, q2, q1
vsub.s16 q1, q2, q1
vtrn.16 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vmov.s16 d0, d4
vmov.s16 d1, d2
vabs.s16 d3, d3
//16x16_v
vtrn.32 d0, d1 //{0,1,3,2}
vaba.s16 \arg5, d0, \arg2 //16x16_v
vaba.s16 \arg5, d1, \arg8
vaba.s16 \arg5, d5, \arg8
vadd.u16 \arg5, d3
//16x16_h
vtrn.16 d4, d5 //{0,4,12,8}
vaba.s16 \arg6, d4, \arg3 //16x16_h
vabs.s16 d2, d2
vabs.s16 d5, d5
vadd.u16 d2, d3
vadd.u16 d2, d5
vadd.u16 \arg6, d2
//16x16_dc_both
vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
vadd.u16 \arg7, d2
.endm
#endif
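HDM_TRANSFORM_4X4_L0 is one 4x4 Hadamard-domain SATD step: it transforms a source 4x4 block once and accumulates absolute differences against the pre-transformed V, H and DC predictors, so the three mode costs share a single transform. The underlying 4x4 Hadamard SATD in scalar form, as an illustrative sketch (the routines below also halve the result with a final vrshr #1):

#include <stdlib.h>
#include <stdint.h>

/* Hedged scalar sketch of a 4x4 Hadamard SATD over a residual block */
static int32_t Satd4x4C (const int16_t pDiff[4][4]) {
  int32_t t[4][4];
  for (int32_t i = 0; i < 4; ++i) {    /* vertical butterfly pass */
    int32_t s03 = pDiff[0][i] + pDiff[3][i], d03 = pDiff[0][i] - pDiff[3][i];
    int32_t s12 = pDiff[1][i] + pDiff[2][i], d12 = pDiff[1][i] - pDiff[2][i];
    t[0][i] = s03 + s12; t[1][i] = d03 + d12;
    t[2][i] = s03 - s12; t[3][i] = d03 - d12;
  }
  int32_t iSatd = 0;
  for (int32_t i = 0; i < 4; ++i) {    /* horizontal pass + abs sum */
    int32_t s03 = t[i][0] + t[i][3], d03 = t[i][0] - t[i][3];
    int32_t s12 = t[i][1] + t[i][2], d12 = t[i][1] - t[i][2];
    iSatd += abs (s03 + s12) + abs (d03 + d12)
           + abs (s03 - s12) + abs (d03 - d12);
  }
  return iSatd;
}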
WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'q15'(16 bytes)
sub r7, r0, r1
vld1.8 {q15}, [r7]
//Get the left column data to 'q14' (16 bytes)
sub r7, r0, #1
GET_8BYTE_DATA_L0 d28, r7, r1
GET_8BYTE_DATA_L0 d29, r7, r1
//Calculate the mean value and save to 'q13->d27' (preserving d26) (2 bytes)
//Calculate the 16x16_dc_both mode SATD
vaddl.u8 q0, d30, d31
vaddl.u8 q1, d28, d29
vadd.u16 q0, q1
vadd.u16 d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, #5
vshl.u16 d27, d0, #4
//Calculate the 16x16_v mode SATD and save to "q11, q12"
vshll.u8 q0, d30, #2
vshll.u8 q1, d31, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vtrn.16 q2, q1
vadd.s16 q12, q2, q1
vsub.s16 q11, q2, q1
vtrn.32 q12, q11 //{0,1,3,2, 4,5,7,6} q12
//{8,9,11,10, 12,13,15,14} q11
//Calculate the 16x16_h mode SATD and save to "q9, q10"
vshll.u8 q0, d28, #2
vshll.u8 q1, d29, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vtrn.16 q2, q1
vadd.s16 q10, q2, q1
vsub.s16 q9, q2, q1
vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
//{8,9,11,10, 12,13,15,14} q9
vmov.i32 d17, #0//Save the SATD of DC_BOTH
vmov.i32 d16, #0//Save the SATD of H
vmov.i32 d15, #0//Save the SATD of V
vmov.i32 d14, #0//For zero D register
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
vtrn.32 q5, q6
HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
//vadd.u16 d24, d25
vrshr.u16 d15, #1
vpaddl.u16 d15, d15
vpaddl.u32 d15, d15
vmov.u32 r0, d15[0]
//vadd.u16 d22, d23
vrshr.u16 d16, #1
vpaddl.u16 d16, d16
vpaddl.u32 d16, d16
vmov.u32 r1, d16[0]
add r1, r6, lsl #1
//vadd.u16 d20, d21
vrshr.u16 d17, #1
vpaddl.u16 d17, d17
vpaddl.u32 d17, d17
vmov.u32 r2, d17[0]
add r2, r6, lsl #1
mov r4, #0
cmp r1, r0
movcc r0, r1
movcc r4, #1
cmp r2, r0
movcc r0, r2
movcc r4, #2
str r4, [r5]
ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'q15'(16 bytes)
sub r4, r0, r1
vld1.8 {q15}, [r4]
//Get the left column data to 'q14' (16 bytes)
sub r4, r0, #1
GET_8BYTE_DATA_L0 d28, r4, r1
GET_8BYTE_DATA_L0 d29, r4, r1
//Calculate the mean value and save to 'q13' (8 bytes)
//Calculate the 16x16_dc_both mode SATD
vaddl.u8 q0, d30, d31
vaddl.u8 q1, d28, d29
vadd.u16 q0, q1
vadd.u16 d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q13, d0[0]
sub r4, r0, #1
vmov.i32 q12, #0//Save the SATD of DC_BOTH
vmov.i32 q11, #0//Save the SATD of H
vmov.i32 q10, #0//Save the SATD of V
mov lr, #16
sad_intra_16x16_x3_opt_loop0:
//Get the left-column byte, replicated across 'd0'
vld1.8 {d0[]}, [r4], r1
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {q1}, [r2], r3
subs lr, #1
//Do the SAD for the top row
vabal.u8 q12, d30, d2
vabal.u8 q12, d31, d3
//Do the SAD for the left column
vabal.u8 q11, d0, d2
vabal.u8 q11, d0, d3
//Do the SAD for mean value
vabal.u8 q10, d26, d2
vabal.u8 q10, d26, d3
bne sad_intra_16x16_x3_opt_loop0
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
vadd.u16 d24, d25
vpaddl.u16 d24, d24
vpaddl.u32 d24, d24
vmov.u32 r0, d24[0]
vadd.u16 d22, d23
vpaddl.u16 d22, d22
vpaddl.u32 d22, d22
vmov.u32 r1, d22[0]
add r1, r6, lsl #1
vadd.u16 d20, d21
vpaddl.u16 d20, d20
vpaddl.u32 d20, d20
vmov.u32 r2, d20[0]
add r2, r6, lsl #1
mov r4, #0
cmp r1, r0
movcc r0, r1
movcc r4, #1
cmp r2, r0
movcc r0, r2
movcc r4, #2
str r4, [r5]
ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon
stmdb sp!, {r4-r7, lr}
//Get the data from stack
ldr r4, [sp, #32] //p_dec_cr
ldr r5, [sp, #36] //p_enc_cr
//Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
sub r6, r0, #1
GET_8BYTE_DATA_L0 d28, r6, r1
sub r6, r4, #1
GET_8BYTE_DATA_L0 d30, r6, r1
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
sub r6, r0, r1
vld1.8 {d29}, [r6]
sub r6, r4, r1
vld1.8 {d31}, [r6]
//Calculate the sum of left column and top row
vmov.i32 q0, q14
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vdup.8 d27, d2[0]
vdup.8 d26, d1[4]
vtrn.32 d27, d26
vdup.8 d26, d0[4]
vdup.8 d25, d2[4]
vtrn.32 d26, d25 //Save to "d27, d26"
vmov.i32 q0, q15
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
vrshr.u32 d2, d2, #3 //calculate 'm4'
//duplicate the 'mx' to a vector line
vdup.8 d25, d2[0]
vdup.8 d24, d1[4]
vtrn.32 d25, d24
vdup.8 d24, d0[4]
vdup.8 d23, d2[4]
vtrn.32 d24, d23 //Save to "d25, d24"
vmov.i32 q11, #0//Save the SATD of DC_BOTH
vmov.i32 q10, #0//Save the SATD of H
vmov.i32 q9 , #0//Save the SATD of V
sub r6, r0, #1
sub r7, r4, #1
mov lr, #4
sad_intra_8x8_x3_opt_loop0:
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {d0}, [r2], r3
vld1.8 {d1}, [r5], r3
//Get the left-column bytes, replicated across 'd2'/'d3'
vld1.8 {d2[]}, [r6], r1
vld1.8 {d3[]}, [r7], r1
subs lr, #1
//Do the SAD for the top row
vabal.u8 q11, d29, d0
vabal.u8 q11, d31, d1
//Do the SAD for the left column
vabal.u8 q10, d2, d0
vabal.u8 q10, d3, d1
//Do the SAD for mean value
vabal.u8 q9, d27, d0
vabal.u8 q9, d25, d1
bne sad_intra_8x8_x3_opt_loop0
mov lr, #4
sad_intra_8x8_x3_opt_loop1:
//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {d0}, [r2], r3
vld1.8 {d1}, [r5], r3
//Get the left-column bytes, replicated across 'd2'/'d3'
vld1.8 {d2[]}, [r6], r1
vld1.8 {d3[]}, [r7], r1
subs lr, #1
//Do the SAD for the top row
vabal.u8 q11, d29, d0
vabal.u8 q11, d31, d1
//Do the SAD for the left column
vabal.u8 q10, d2, d0
vabal.u8 q10, d3, d1
//Do the SAD for mean value
vabal.u8 q9, d26, d0
vabal.u8 q9, d24, d1
bne sad_intra_8x8_x3_opt_loop1
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
vadd.u16 d22, d23
vpaddl.u16 d22, d22
vpaddl.u32 d22, d22
vmov.u32 r0, d22[0]
add r0, r6, lsl #1
vadd.u16 d20, d21
vpaddl.u16 d20, d20
vpaddl.u32 d20, d20
vmov.u32 r1, d20[0]
add r1, r6, lsl #1
vadd.u16 d18, d19
vpaddl.u16 d18, d18
vpaddl.u32 d18, d18
vmov.u32 r2, d18[0]
mov r4, #2
cmp r1, r0
movcc r0, r1
movcc r4, #1
cmp r2, r0
movcc r0, r2
movcc r4, #0
str r4, [r5]
ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
stmdb sp!, {r4-r7, lr}
//Get the data from stack
ldr r4, [sp, #32] //p_dec_cr
ldr r5, [sp, #36] //p_enc_cr
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
sub r6, r0, r1
vld1.8 {d29}, [r6]
sub r6, r4, r1
vld1.8 {d31}, [r6]
//Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
sub r6, r0, #1
GET_8BYTE_DATA_L0 d28, r6, r1
sub r6, r4, #1
GET_8BYTE_DATA_L0 d30, r6, r1
//Calculate the 16x16_v mode SATD and save to "q12, q13"
vshll.u8 q0, d29, #2
vshll.u8 q1, d31, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vtrn.16 q2, q1
vadd.s16 q13, q2, q1
vsub.s16 q12, q2, q1
vtrn.32 q13, q12 //{0,1,3,2, 4,5,7,6} q13
//{8,9,11,10, 12,13,15,14} q12
//Calculate the 16x16_h mode SATD and save to "q10, q11"
vshll.u8 q0, d28, #2
vshll.u8 q1, d30, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
vtrn.16 q2, q1
vadd.s16 q11, q2, q1
vsub.s16 q10, q2, q1
vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
//{8,9,11,10, 12,13,15,14} q10
//Calculate the sum of left column and top row
//vmov.i32 q0, q14
vpaddl.u8 q0, q14
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1
vpaddl.u8 q2, q15
vpaddl.u16 q2, q2
vadd.u32 d3, d4, d5
vtrn.32 q0, q2
vrshr.u32 q1, #3
vrshr.u32 q2, #2
vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
vmov.i32 d28, #0//Save the SATD of DC_BOTH
vmov.i32 d10, #0//Save the SATD of H
vmov.i32 d11, #0//Save the SATD of V
vmov.i32 d30, #0//For zero D register
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {d6}, [r2], r3
vld1.32 {d7}, [r2], r3
vld1.32 {d8}, [r2], r3
vld1.32 {d9}, [r2], r3
vtrn.32 d6, d7
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
vld1.32 {d6}, [r5], r3
vld1.32 {d7}, [r5], r3
vld1.32 {d8}, [r5], r3
vld1.32 {d9}, [r5], r3
vtrn.32 d6, d7
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {d6}, [r2], r3
vld1.32 {d7}, [r2], r3
vld1.32 {d8}, [r2], r3
vld1.32 {d9}, [r2], r3
vtrn.32 d6, d7
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
vld1.32 {d6}, [r5], r3
vld1.32 {d7}, [r5], r3
vld1.32 {d8}, [r5], r3
vld1.32 {d9}, [r5], r3
vtrn.32 d6, d7
vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
vrshr.u16 d11, #1
vpaddl.u16 d11, d11
vpaddl.u32 d11, d11
vmov.u32 lr, d11[0]
add lr, r6, lsl #1
vrshr.u16 d10, #1
vpaddl.u16 d10, d10
vpaddl.u32 d10, d10
vmov.u32 r3, d10[0]
add r3, r6, lsl #1
vrshr.u16 d28, #1
vpaddl.u16 d28, d28
vpaddl.u32 d28, d28
vmov.u32 r2, d28[0]
mov r6, #2
cmp r3, lr
movcc lr, r3
movcc r6, #1
cmp r2, lr
movcc lr, r2
movcc r6, #0
str r6, [r5]
mov r0, lr
ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'd31[0~3]'(4 bytes)
sub r7, r0, r1
vld1.32 {d31[0]}, [r7]
//Get the left column data to 'd31[4~7]' (4 bytes)
sub r7, r0, #1
vld1.8 {d31[4]}, [r7], r1
vld1.8 {d31[5]}, [r7], r1
vld1.8 {d31[6]}, [r7], r1
vld1.8 {d31[7]}, [r7], r1
//Calculate the mean value and save to 'd30' (2 bytes)
vpaddl.u8 d0, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value
vrshr.u16 d0, #3
vshl.u16 d30, d0, #4
//Calculate the 16x16_v mode SATD and save to "d29"
//Calculate the 16x16_h mode SATD and save to "d28"
vshll.u8 q0, d31, #2
vtrn.32 d0, d1
vadd.s16 d2, d0, d1
vsub.s16 d1, d0, d1
vtrn.16 d2, d1
vadd.s16 d29, d2, d1
vsub.s16 d28, d2, d1
vtrn.32 d29, d28 //{0,1,3,2 top} d29
//{0,1,3,2 left} d28
vmov.i32 d27, #0//Save the SATD of DC_BOTH
vmov.i32 d26, #0//Save the SATD of H
vmov.i32 d25, #0//Save the SATD of V
vmov.i32 d24, #0//For zero D register
//Load the p_enc data and save to "d22,d23"--- 4X4 bytes
vld1.32 {d23[0]}, [r2], r3
vld1.32 {d23[1]}, [r2], r3
vld1.32 {d22[0]}, [r2], r3
vld1.32 {d22[1]}, [r2], r3
HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
//Get the data from stack
ldr r5, [sp, #28] //the value of lambda2
ldr r6, [sp, #32] //the value of lambda1
ldr r7, [sp, #36] //the value of lambda0
vrshr.u16 d25, #1
vpaddl.u16 d25, d25
vpaddl.u32 d25, d25
vmov.u32 r0, d25[0]
add r0, r7
vrshr.u16 d26, #1
vpaddl.u16 d26, d26
vpaddl.u32 d26, d26
vmov.u32 r1, d26[0]
add r1, r6
vrshr.u16 d27, #1
vpaddl.u16 d27, d27
vpaddl.u32 d27, d27
vmov.u32 r2, d27[0]
add r2, r5
ldr r5, [sp, #20] //p_dst
ldr r6, [sp, #24] //the addr of Best_mode
mov r4, r0
cmp r1, r4
movcc r4, r1
cmp r2, r4
movcc r4, r2
//The comparison order affects the result
cmp r4, r2
bne satd_intra_4x4_x3_opt_jump0
mov r0, #2
str r0, [r6]
vshr.u32 d0, d30, #4 // {2cb, 2cr} q9
vdup.8 q1, d0[0]
vst1.8 {q1}, [r5]
//...
bl satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump0:
cmp r4, r1
bne satd_intra_4x4_x3_opt_jump1
mov r0, #1
str r0, [r6]
vdup.8 d0, d31[4]
vdup.8 d1, d31[5]
vdup.8 d2, d31[6]
vdup.8 d3, d31[7]
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
bl satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump1:
mov r0, #0
str r0, [r6]
vst1.32 {d31[0]}, [r5]!
vst1.32 {d31[0]}, [r5]!
vst1.32 {d31[0]}, [r5]!
vst1.32 {d31[0]}, [r5]!
satd_intra_4x4_x3_opt_end:
mov r0, r4
ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
#endif
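
After the distortion sums, the routines above end with a common pattern: add a 2*lambda bias to some of the candidate costs, keep the minimum, and write the winning mode index through the Best_mode pointer. For the 16x16 case the scalar equivalent, with illustrative names, is:

#include <stdint.h>

/* Hedged sketch of the x3 mode decision at the tail of the 16x16 routines:
 * vertical carries no bias; horizontal and DC are biased by 2*lambda. */
static int32_t PickIntra16x16ModeC (int32_t iSadV, int32_t iSadH,
                                    int32_t iSadDc, int32_t iLambda,
                                    int32_t* pBestMode) {
  int32_t iBestCost = iSadV;                 /* mode 0: vertical */
  int32_t iMode = 0;
  if (iSadH + (iLambda << 1) < iBestCost) {  /* mode 1: horizontal */
    iBestCost = iSadH + (iLambda << 1);
    iMode = 1;
  }
  if (iSadDc + (iLambda << 1) < iBestCost) { /* mode 2: DC */
    iBestCost = iSadDc + (iLambda << 1);
    iMode = 2;
  }
  *pBestMode = iMode;
  return iBestCost;
}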

codec/encoder/core/arm/mc_neon.S (new executable file, 1963 lines; diff too large to display)


@@ -0,0 +1,63 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN WelsSetMemZero_neon
veor q0, q0
cmp r1, #32
beq mem_zero_32_neon_start
blt mem_zero_24_neon_start
mem_zero_loop:
subs r1, r1, #64
vst1.64 {q0}, [r0]!
vst1.64 {q0}, [r0]!
vst1.64 {q0}, [r0]!
vst1.64 {q0}, [r0]!
bne mem_zero_loop
WELS_ASM_FUNC_END
mem_zero_32_neon_start:
vst1.64 {q0}, [r0]!
vst1.64 {q0}, [r0]!
WELS_ASM_FUNC_END
mem_zero_24_neon_start:
vst1.64 {q0}, [r0]!
vst1.64 {d0}, [r0]!
WELS_ASM_FUNC_END
#endif


@@ -0,0 +1,880 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
.macro SATD_16x4
vld1.64 {q0}, [r0,:128], r1
vld1.64 {q1}, [r2], r3
vsubl.u8 q4, d0, d2
vld1.64 {q2}, [r0,:128], r1
vsubl.u8 q6, d1, d3
vld1.64 {q3}, [r2], r3
vsubl.u8 q5, d4, d6
vld1.64 {q0}, [r0,:128], r1
vsubl.u8 q7, d5, d7
vld1.64 {q1}, [r2], r3
vsubl.u8 q8, d0, d2
vld1.64 {q2}, [r0,:128], r1
vsubl.u8 q10, d1, d3
vadd.s16 q0, q4, q5
vld1.64 {q3}, [r2], r3
vsub.s16 q1, q4, q5
vsubl.u8 q9, d4, d6
vsubl.u8 q11, d5, d7
vadd.s16 q2, q8, q9
vsub.s16 q3, q8, q9
vadd.s16 q4, q6, q7
vsub.s16 q5, q6, q7
vadd.s16 q6, q10, q11
vsub.s16 q7, q10, q11
vadd.s16 q8, q0, q2
vsub.s16 q10, q0, q2
vadd.s16 q9, q4, q6
vsub.s16 q11, q4, q6
vsub.s16 q0, q1, q3
vadd.s16 q2, q1, q3
vsub.s16 q1, q5, q7
vadd.s16 q3, q5, q7
vtrn.16 q8, q10
vtrn.16 q9, q11
vadd.s16 q4, q8, q10
vabd.s16 q6, q8, q10
vadd.s16 q5, q9, q11
vabd.s16 q7, q9, q11
vabs.s16 q4, q4
vabs.s16 q5, q5
vtrn.16 q0, q2
vtrn.16 q1, q3
vadd.s16 q8, q0, q2
vabd.s16 q10, q0, q2
vadd.s16 q9, q1, q3
vabd.s16 q11, q1, q3
vabs.s16 q8, q8
vabs.s16 q9, q9
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.32 q8, q10
vtrn.32 q9, q11
vmax.s16 q0, q4, q6
vmax.s16 q1, q5, q7
vmax.s16 q2, q8, q10
vmax.s16 q3, q9, q11
vadd.u16 q0, q0, q1
vadd.u16 q2, q2, q3
.endm
.macro SATD_8x4
vld1.64 {d0}, [r0,:64], r1
vld1.64 {d1}, [r2], r3
vld1.64 {d2}, [r0,:64], r1
vsubl.u8 q4, d0, d1
vld1.64 {d3}, [r2], r3
vsubl.u8 q5, d2, d3
vld1.64 {d4}, [r0,:64], r1
vld1.64 {d5}, [r2], r3
vadd.s16 q8, q4, q5
vsubl.u8 q6, d4, d5
vld1.64 {d6}, [r0,:64], r1
vld1.64 {d7}, [r2], r3
vsubl.u8 q7, d6, d7
vsub.s16 q9, q4, q5
vadd.s16 q10, q6, q7
vsub.s16 q11, q6, q7
vadd.s16 q0, q8, q10
vsub.s16 q1, q8, q10
vsub.s16 q2, q9, q11
vadd.s16 q3, q9, q11
vtrn.16 q0, q1
vtrn.16 q2, q3
vadd.s16 q4, q0, q1
vabd.s16 q5, q0, q1
vabs.s16 q4, q4
vadd.s16 q6, q2, q3
vabd.s16 q7, q2, q3
vabs.s16 q6, q6
vtrn.32 q4, q5
vtrn.32 q6, q7
vmax.s16 q0, q4, q5
vmax.s16 q1, q6, q7
.endm
.macro SAD_16x4
vld1.64 {q6}, [r0, :128], r1
vabal.u8 q10, d8, d10
vld1.64 {q7}, [r2], r3
vabal.u8 q11, d9, d11
vld1.64 {q0}, [r0, :128], r1
vabal.u8 q12, d12, d14
vld1.64 {q1}, [r2], r3
vabal.u8 q13, d13, d15
vld1.64 {q2}, [r0, :128], r1
vabal.u8 q10, d0, d2
vld1.64 {q3}, [r2], r3
vabal.u8 q11, d1, d3
vld1.64 {q4}, [r0, :128], r1
vabal.u8 q12, d4, d6
vld1.64 {q5}, [r2], r3
vabal.u8 q13, d5, d7
.endm
.macro SAD_8x4
vld1.64 {d0}, [r0, :64], r1
vld1.64 {d1}, [r2], r3
vabal.u8 q10, d0, d1
vld1.64 {d2}, [r0, :64], r1
vld1.64 {d3}, [r2], r3
vabal.u8 q11, d2, d3
vld1.64 {d4}, [r0, :64], r1
vld1.64 {d5}, [r2], r3
vabal.u8 q12, d4, d5
vld1.64 {d6}, [r0, :64], r1
vld1.64 {d7}, [r2], r3
vabal.u8 q13, d6, d7
.endm
WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
vld1.64 {q0}, [r0, :128], r1
vld1.64 {q1}, [r2], r3
vabdl.u8 q10, d0, d2
vld1.64 {q2}, [r0, :128], r1
vabdl.u8 q11, d1, d3
vld1.64 {q3}, [r2], r3
vld1.64 {q4}, [r0, :128], r1
vabdl.u8 q12, d4, d6
vld1.64 {q5}, [r2], r3
vabdl.u8 q13, d5, d7
SAD_16x4
SAD_16x4
SAD_16x4
vld1.64 {q6}, [r0, :128], r1
vabal.u8 q10, d8, d10
vld1.64 {q7}, [r2], r3
vabal.u8 q11, d9, d11
vabal.u8 q12, d12, d14
vabal.u8 q13, d13, d15
vadd.u16 q14, q10, q11
vadd.u16 q15, q12, q13
vadd.u16 q15, q14, q15
vadd.u16 d0, d30, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
vld1.64 {q0}, [r0, :128], r1
vld1.64 {q1}, [r2], r3
vabdl.u8 q10, d0, d2
vld1.64 {q2}, [r0, :128], r1
vabdl.u8 q11, d1, d3
vld1.64 {q3}, [r2], r3
vld1.64 {q4}, [r0, :128], r1
vabdl.u8 q12, d4, d6
vld1.64 {q5}, [r2], r3
vabdl.u8 q13, d5, d7
SAD_16x4
vld1.64 {q6}, [r0, :128], r1
vabal.u8 q10, d8, d10
vld1.64 {q7}, [r2], r3
vabal.u8 q11, d9, d11
vabal.u8 q12, d12, d14
vabal.u8 q13, d13, d15
vadd.u16 q14, q10, q11
vadd.u16 q15, q12, q13
vadd.u16 q15, q14, q15
vadd.u16 d0, d30, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
vld1.64 {d0}, [r0, :64], r1
vld1.64 {d1}, [r2], r3
vabdl.u8 q10, d0, d1
vld1.64 {d2}, [r0, :64], r1
vld1.64 {d3}, [r2], r3
vabdl.u8 q11, d2, d3
vld1.64 {d4}, [r0, :64], r1
vld1.64 {d5}, [r2], r3
vabdl.u8 q12, d4, d5
vld1.64 {d6}, [r0, :64], r1
vld1.64 {d7}, [r2], r3
vabdl.u8 q13, d6, d7
SAD_8x4
SAD_8x4
SAD_8x4
vadd.u16 q14, q10, q11
vadd.u16 q15, q12, q13
vadd.u16 q15, q15, q14
vadd.u16 d0, d30, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
vld1.64 {d0}, [r0, :64], r1
vld1.64 {d1}, [r2], r3
vabdl.u8 q10, d0, d1
vld1.64 {d2}, [r0, :64], r1
vld1.64 {d3}, [r2], r3
vabdl.u8 q11, d2, d3
vld1.64 {d4}, [r0, :64], r1
vld1.64 {d5}, [r2], r3
vabdl.u8 q12, d4, d5
vld1.64 {d6}, [r0, :64], r1
vld1.64 {d7}, [r2], r3
vabdl.u8 q13, d6, d7
SAD_8x4
vadd.u16 q14, q10, q11
vadd.u16 q15, q12, q13
vadd.u16 q15, q15, q14
vadd.u16 d0, d30, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
stmdb sp!, {r4-r5, lr}
//Load one horizontal line (4 bytes)
//line 0
ldr r4, [r0], r1
ldr r5, [r2], r3
usad8 lr, r4, r5
//line 1
ldr r4, [r0], r1
ldr r5, [r2], r3
usada8 lr, r4, r5, lr
//line 2
ldr r4, [r0], r1
ldr r5, [r2], r3
usada8 lr, r4, r5, lr
//line 3
ldr r4, [r0]
ldr r5, [r2]
usada8 r0, r4, r5, lr
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
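pixel_sad_4x4_neon stays on the ARM core registers, using usad8/usada8 to accumulate the byte-wise absolute differences of one 32-bit row at a time. The scalar definition it implements, as an illustrative sketch:

#include <stdlib.h>
#include <stdint.h>

/* Hedged scalar sketch of pixel_sad_4x4_neon */
static int32_t PixelSad4x4C (const uint8_t* pPix1, int32_t iStride1,
                             const uint8_t* pPix2, int32_t iStride2) {
  int32_t iSad = 0;
  for (int32_t y = 0; y < 4; ++y) {
    for (int32_t x = 0; x < 4; ++x)
      iSad += abs (pPix1[x] - pPix2[x]);
    pPix1 += iStride1;
    pPix2 += iStride2;
  }
  return iSad;
}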
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Load one horizontal line of data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q6}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q4}, [r5], r3 //save pix2 + 1
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabdl.u8 q9, d0, d8
vabal.u8 q9, d1, d9
mov lr, #15
pixel_sad_4_16x16_loop_0:
//Load one horizontal line of data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vmov.8 q1, q6 //save pix2 - stride
vmov.8 q6, q2
vabal.u8 q15, d0, d2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
vld1.8 {q4}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
subs lr, #1
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabal.u8 q9, d0, d8
vabal.u8 q9, d1, d9
bne pixel_sad_4_16x16_loop_0
//Load the output address (5th argument) into 'r0' and store the four SADs
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
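The pixel_sad_4_* family evaluates four motion-search candidates per call: the blocks at pix2 - stride, pix2 + stride, pix2 - 1 and pix2 + 1. Reading the epilogue above, each SAD is shifted left by 4 (vshl.u32 #4) before vst4.32 writes the four results through the pointer in the fifth argument. A hedged scalar sketch of that contract (the output order and the x16 scaling are read from the asm, not from a header):

#include <stdint.h>
#include <stdlib.h>

// Scalar model of pixel_sad_4_16x16_neon as read from the asm: four SADs,
// ordered {top, bottom, left, right}, each multiplied by 16 on output.
static void PixelSad4_16x16_ref (uint8_t* pPix1, int32_t iStride1,
                                 uint8_t* pPix2, int32_t iStride2,
                                 int32_t* pSad4) {
  uint8_t* pRef[4] = { pPix2 - iStride2, pPix2 + iStride2, pPix2 - 1, pPix2 + 1 };
  for (int32_t i = 0; i < 4; ++i) {
    uint8_t* p1 = pPix1;
    uint8_t* p2 = pRef[i];
    int32_t iSad = 0;
    for (int32_t y = 0; y < 16; ++y) {
      for (int32_t x = 0; x < 16; ++x)
        iSad += abs (p1[x] - p2[x]);
      p1 += iStride1;
      p2 += iStride2;
    }
    pSad4[i] = iSad << 4; // matches the vshl.u32 #4 before vst4.32
  }
}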
WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Load one horizontal line of data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q6}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vld1.8 {q4}, [r5], r3 //save pix2 + 1
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabdl.u8 q9, d0, d8
vabal.u8 q9, d1, d9
mov lr, #7
pixel_sad_4_16x8_loop_0:
//Load one horizontal line of data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
vmov.8 q1, q6 //save pix2 - stride
vmov.8 q6, q2
vabal.u8 q15, d0, d2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
vld1.8 {q4}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
subs lr, #1
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
vabal.u8 q9, d0, d8
vabal.u8 q9, d1, d9
bne pixel_sad_4_16x8_loop_0
//Load the output address (5th argument) into 'r0' and store the four SADs
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Load one horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4
mov lr, #15
pixel_sad_4_8x16_loop_0:
//Load one horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride
vmov.8 d6, d2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
vabal.u8 q13, d0, d3
vabal.u8 q12, d0, d4
subs lr, #1
bne pixel_sad_4_8x16_loop_0
//Load the output address (5th argument) into 'r0' and store the four SADs
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon
stmdb sp!, {r4-r5, lr}
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
//Load one horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d0, d4
mov lr, #7
pixel_sad_4_8x8_loop_0:
//Load one horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride
vmov.8 d6, d2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
vabal.u8 q13, d0, d3
vabal.u8 q12, d0, d4
subs lr, #1
bne pixel_sad_4_8x8_loop_0
//Load the output address (5th argument) into 'r0' and store the four SADs
ldr r0, [sp, #12]
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
sub r0, r2, r3
vld1.32 {d2[0]}, [r0], r3
vld1.32 {d2[1]}, [r0], r3
vld1.32 {d3[0]}, [r0], r3
vld1.32 {d3[1]}, [r0], r3
vld1.32 {d4[0]}, [r0], r3
vld1.32 {d4[1]}, [r0]
sub r0, r2, #1
vld1.32 {d5[0]}, [r0], r3
vld1.32 {d5[1]}, [r0], r3
vld1.32 {d6[0]}, [r0], r3
vld1.32 {d6[1]}, [r0]
add r0, r2, #1
vld1.32 {d7[0]}, [r0], r3
vld1.32 {d7[1]}, [r0], r3
vld1.32 {d8[0]}, [r0], r3
vld1.32 {d8[1]}, [r0]
vabdl.u8 q15, d0, d2
vabdl.u8 q14, d1, d3
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d1, d4
vabdl.u8 q11, d0, d5
vabdl.u8 q10, d1, d6
vabdl.u8 q9, d0, d7
vabdl.u8 q8, d1, d8
//Load the output address (5th argument) into 'r0' and store the four SADs
ldr r0, [sp]
vadd.u16 q0, q14, q15
vadd.u16 q1, q12, q13
vadd.u16 q2, q10, q11
vadd.u16 q3, q8 , q9
vadd.u16 d0, d1
vadd.u16 d1, d2, d3
vadd.u16 d2, d4, d5
vadd.u16 d3, d6, d7
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
SATD_16x4
vadd.u16 q15, q0, q2
SATD_16x4
vadd.u16 q15, q15, q0
vadd.u16 q15, q15, q2
SATD_16x4
vadd.u16 q15, q15, q0
vadd.u16 q15, q15, q2
SATD_16x4
vadd.u16 q15, q15, q0
vadd.u16 q15, q15, q2
vadd.u16 d0, d30, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.32 r0, d0[0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
SATD_16x4
vadd.u16 q15, q0, q2
SATD_16x4
vadd.u16 q15, q15, q0
vadd.u16 q15, q15, q2
vadd.u16 d0, d30, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.32 r0, d0[0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
SATD_8x4
vadd.u16 q15, q0, q1
SATD_8x4
vadd.u16 q15, q15, q0
vadd.u16 q15, q15, q1
SATD_8x4
vadd.u16 q15, q15, q0
vadd.u16 q15, q15, q1
SATD_8x4
vadd.u16 q15, q15, q0
vadd.u16 q15, q15, q1
vadd.u16 d0, d30, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.32 r0, d0[0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
SATD_8x4
vadd.u16 q15, q0, q1
SATD_8x4
vadd.u16 q15, q15, q0
vadd.u16 q15, q15, q1
vadd.u16 d0, d30, d31
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.32 r0, d0[0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
//Load the pix1 data --- 16 bytes
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
//Load the pix2 data --- 16 bytes
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d3[0]}, [r2], r3
vld1.32 {d3[1]}, [r2]
//Get the difference
vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
vsubl.u8 q14, d1, d3 //{8,9,10,11,12,13,14,15}
//Do the vertical transform
vadd.s16 q13, q15, q14 //{0,4,8,12,1,5,9,13}
vsub.s16 q12, q15, q14 //{2,6,10,14,3,7,11,15}
vswp d27, d24
vadd.s16 q15, q13, q12 //{0,1,2,3,4,5,6,7}
vsub.s16 q14, q13, q12 //{12,13,14,15,8,9,10,11}
//Do the horizontal transform
vtrn.32 q15, q14
vadd.s16 q13, q15, q14
vsub.s16 q12, q15, q14
vtrn.16 q13, q12
vadd.s16 q15, q13, q12
//Do the SAD
vabs.s16 q15, q15
vabd.s16 q14, q13, q12
vadd.u16 q0, q15, q14
vrhadd.u16 d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
vmov.u32 r0, d0[0]
WELS_ASM_FUNC_END
#endif
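The SATD kernels replace the plain difference with a 4x4 Hadamard transform of the residual before taking absolute values, which tracks post-transform coding cost better than SAD. A scalar sketch of the textbook form (the exact normalization of the NEON path, e.g. the final vrhadd, may differ):

#include <stdint.h>
#include <stdlib.h>

// Textbook 4x4 SATD: Hadamard-transform the residual, sum |coefficients|,
// halve with rounding. Shown to document the butterfly structure only.
static int32_t PixelSatd4x4_ref (uint8_t* pPix1, int32_t iStride1,
                                 uint8_t* pPix2, int32_t iStride2) {
  int32_t d[16], m[16], iSatd = 0;
  for (int32_t y = 0; y < 4; ++y)
    for (int32_t x = 0; x < 4; ++x)
      d[(y << 2) + x] = pPix1[y * iStride1 + x] - pPix2[y * iStride2 + x];
  for (int32_t i = 0; i < 4; ++i) { // vertical butterflies
    int32_t s02 = d[i] + d[8 + i], s13 = d[4 + i] + d[12 + i];
    int32_t t02 = d[i] - d[8 + i], t13 = d[4 + i] - d[12 + i];
    m[i] = s02 + s13;      m[4 + i]  = s02 - s13;
    m[8 + i] = t02 + t13;  m[12 + i] = t02 - t13;
  }
  for (int32_t i = 0; i < 4; ++i) { // horizontal butterflies
    int32_t r = i << 2;
    int32_t s02 = m[r] + m[r + 2], s13 = m[r + 1] + m[r + 3];
    int32_t t02 = m[r] - m[r + 2], t13 = m[r + 1] - m[r + 3];
    iSatd += abs (s02 + s13) + abs (s02 - s13) + abs (t02 + t13) + abs (t02 - t13);
  }
  return (iSatd + 1) >> 1;
}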

File diff suppressed because it is too large

View File

@@ -61,6 +61,16 @@ uint8_t uiFilterIdc;
uint8_t uiReserved;
} SDeblockingFilter;
#if defined(__cplusplus)
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
void WelsNonZeroCount_neon(int8_t * pNonZeroCount);
void DeblockingBSCalcEnc_neon(int8_t *pNzc, SMVUnitXY *pMv, int32_t iBoundryFlag, int32_t iMbStride, uint8_t (*pBS)[4][4]);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu);
void WelsNonZeroCount_c (int8_t* pNonZeroCount);

View File

@@ -70,6 +70,16 @@ void WelsIDctRecI16x16Dc_sse2 (uint8_t* pRec, int32_t iStride, uint8_t* pPredict
int16_t* pDctDc);
#endif//X86_ASM
#ifdef HAVE_NEON
void WelsDequantFour4x4_neon(int16_t* pDct, const uint16_t* kpMF);
void WelsDequant4x4_neon(int16_t* pDct, const uint16_t* kpMF);
void WelsDequantIHadamard4x4_neon(int16_t* pRes, const uint16_t kuiMF);
void WelsIDctT4Rec_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctFourT4Rec_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctRecI16x16Dc_neon(uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus

View File

@@ -122,6 +122,28 @@ void WelsCopy16x16_sse2 (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t
void WelsCopy16x16NotAligned_sse2 (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t iStrideS);
#endif
#ifdef HAVE_NEON
void WelsCopy8x8_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
void WelsCopy16x16_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
void WelsCopy16x16NotAligned_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
void WelsCopy16x8NotAligned_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
void WelsCopy8x16_neon( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
void WelsHadamardT4Dc_neon(int16_t* pLumaDc, int16_t* pDct);
int32_t WelsHadamardQuant2x2_neon(int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct, int16_t* pBlock);
int32_t WelsHadamardQuant2x2Skip_neon(int16_t* pRes, int16_t iFF, int16_t iMF);
int32_t WelsHadamardQuant2x2SkipKernel_neon(int16_t *pRes, int16_t iThreshold);// avoid divide operator
void WelsDctT4_neon(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
void WelsDctFourT4_neon(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
int32_t WelsGetNoneZeroCount_neon(int16_t* pLevel);
void WelsQuant4x4_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuant4x4Dc_neon(int16_t* pDct, int16_t iFF, int16_t iMF);
void WelsQuantFour4x4_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuantFour4x4Max_neon(int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
#endif
#if defined(__cplusplus)
}

View File

@@ -124,6 +124,8 @@ void WelsSetMemZeroAligned64_sse2 (void* pDst, int32_t iSize);
void WelsSetMemZeroSize64_mmx (void* pDst, int32_t iSize);
void WelsSetMemZeroSize8_mmx (void* pDst, int32_t iSize);
void WelsPrefetchZero_mmx (int8_t const* kpDst);
#elif defined(HAVE_NEON)
void WelsSetMemZero_neon(void* pDst, int32_t iSize);
#endif
#if defined(__cplusplus)

View File

@@ -605,6 +605,23 @@ void DeblockingMbAvcbase (SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilte
DeblockingIntraMb (&pFunc->pfDeblocking, pCurMb, pFilter);
break;
default:
#if (defined(HAVE_NEON) && defined(SINGLE_REF_FRAME))
DeblockingBSCalcEnc_neon(pCurMb->pNonZeroCount, pCurMb->sMv, pCurMb->uiNeighborAvail, iMbStride, uiBS);
if (iLeftFlag){
if (IS_INTRA((pCurMb-1)->uiMbType)) {
*(uint32_t*)uiBS[0][0] = 0x04040404;
}
} else {
*(uint32_t*)uiBS[0][0] = 0;
}
if (iTopFlag) {
if (IS_INTRA((pCurMb-iMbStride)->uiMbType)) {
*(uint32_t*)uiBS[1][0] = 0x04040404;
}
} else {
*(uint32_t*)uiBS[1][0] = 0;
}
#else
if (iLeftFlag) {
* (uint32_t*)uiBS[0][0] = IS_INTRA ((pCurMb - 1)->uiMbType) ? 0x04040404 : DeblockingBSMarginalMBAvcbase (pCurMb,
pCurMb - 1, 0);
@@ -630,7 +647,7 @@ void DeblockingMbAvcbase (SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilte
* (uint32_t*)uiBS[0][1] = * (uint32_t*)uiBS[0][2] = * (uint32_t*)uiBS[0][3] =
* (uint32_t*)uiBS[1][1] = * (uint32_t*)uiBS[1][2] = * (uint32_t*)uiBS[1][3] = 0;
}
#endif
DeblockingInterMb (&pFunc->pfDeblocking, pCurMb, pFilter, uiBS);
break;
}
@@ -768,10 +785,13 @@ void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
}
void WelsBlockFuncInit (PSetNoneZeroCountZeroFunc* pfSetNZCZero, int32_t iCpu) {
*pfSetNZCZero = WelsNonZeroCount_c;
#ifdef HAVE_NEON
if( iCpu & WELS_CPU_NEON ) {
*pfSetNZCZero = WelsNonZeroCount_neon;
}
#endif
}
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_c;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_c;
@@ -796,6 +816,20 @@ void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_ssse3;
}
#endif
#if defined(HAVE_NEON)
if (iCpu & WELS_CPU_NEON ) {
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;
pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_neon;
pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_neon;
pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_neon;
pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_neon;
pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_neon;
pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_neon;
}
#endif
}

View File

@@ -270,5 +270,17 @@ void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFl
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
}
#endif//X86_ASM
#if defined(HAVE_NEON)
if (uiCpuFlag & WELS_CPU_NEON) {
pFuncList->pfDequantization4x4 = WelsDequant4x4_neon;
pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_neon;
pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_neon;
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_neon;
pFuncList->pfIDctT4 = WelsIDctT4Rec_neon;
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_neon;
}
#endif
}
}

View File

@@ -508,6 +508,13 @@ int32_t WelsGetNoneZeroCount_c (int16_t* pLevel) {
return (16 - iCnt);
}
#ifdef HAVE_NEON
int32_t WelsHadamardQuant2x2Skip_neon(int16_t* pRes, int16_t iFF, int16_t iMF) {
int16_t iThreshold = ((1<<16)-1)/iMF - iFF;
return WelsHadamardQuant2x2SkipKernel_neon(pRes, iThreshold);
}
#endif
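The iThreshold above folds the quantization test into one comparison: a level ((|res| + iFF) * iMF) >> 16 is nonzero only when |res| > ((1 << 16) - 1) / iMF - iFF, so the kernel never has to divide per coefficient. A sketch of the implied skip test (an assumption: the kernel's .S source is in the suppressed diff, and the gathering/Hadamard of the four chroma DC residuals is taken as already done):

#include <stdint.h>
#include <stdlib.h>

// Sketch of the comparison WelsHadamardQuant2x2SkipKernel_neon is expected
// to perform, given iThreshold = ((1<<16)-1)/iMF - iFF. pDc[4] is assumed
// to hold the already-Hadamard-transformed 2x2 chroma DC residuals.
static int32_t HadamardQuant2x2Skip_ref (const int16_t pDc[4], int16_t iThreshold) {
  for (int32_t i = 0; i < 4; ++i) {
    if (abs ((int32_t) pDc[i]) > iThreshold)
      return 1; // at least one level quantizes to nonzero: cannot skip
  }
  return 0; // all four levels quantize to zero
}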
void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c;
pFuncList->pfCopy16x16Aligned =
@@ -571,5 +578,28 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
//#endif//MACOS
#endif//X86_ASM
#if defined(HAVE_NEON)
if (uiCpuFlag & WELS_CPU_NEON) {
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_neon;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_neon;
pFuncList->pfDctT4 = WelsDctT4_neon;
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_neon;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_neon;
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_neon;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_neon;
pFuncList->pfQuantization4x4 = WelsQuant4x4_neon;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_neon;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_neon;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_neon;
pFuncList->pfCopy16x16Aligned = WelsCopy16x16_neon;
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_neon;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_neon;
pFuncList->pfDctFourT4 = WelsDctFourT4_neon;
}
#endif
}
}

View File

@@ -171,6 +171,14 @@ int32_t InitFunctionPointers (SWelsFuncPtrList* pFuncList, SWelsSvcCodingParam*
}
#endif//X86_ASM
#if defined(HAVE_NEON)
if (uiCpuFlag & WELS_CPU_NEON) {
pFuncList->pfSetMemZeroSize8 = WelsSetMemZero_neon;
pFuncList->pfSetMemZeroSize64Aligned16 = WelsSetMemZero_neon;
pFuncList->pfSetMemZeroSize64 = WelsSetMemZero_neon;
}
#endif
InitExpandPictureFunc (pFuncList, uiCpuFlag);
/* Intra_Prediction_fn*/

View File

@@ -1944,6 +1944,13 @@ int32_t WelsInitEncoderExt (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pCodingPar
else if (uiCpuFeatureFlags & WELS_CPU_CACHELINE_16)
iCacheLineSize = 16;
OutputCpuFeaturesLog (uiCpuFeatureFlags, uiCpuCores, iCacheLineSize);
#elif defined(HAVE_NEON)
#if defined(ANDROID_NDK)
uiCpuFeatureFlags = WelsCPUFeatureDetectAndroid();
#endif
#if defined(APPLE_IOS)
uiCpuFeatureFlags = WelsCPUFeatureDetectIOS();
#endif
#else
iCacheLineSize = 16; // 16 bytes aligned in default
#endif//X86_ASM

View File

@@ -129,6 +129,13 @@ void InitExpandPictureFunc (void* pL, const uint32_t kuiCPUFlag) {
pFuncList->pfExpandChromaPicture[1] = ExpandPictureChromaAlign_sse2;
}
#endif//X86_ASM
#if defined(HAVE_NEON)
if (kuiCPUFlag & WELS_CPU_NEON) {
pFuncList->pfExpandLumaPicture = ExpandPictureLuma_neon;
pFuncList->pfExpandChromaPicture[0] = ExpandPictureChroma_c;
pFuncList->pfExpandChromaPicture[1] = ExpandPictureChroma_neon;
}
#endif//HAVE_NEON
}

View File

@@ -7,6 +7,10 @@
objects = {
/* Begin PBXBuildFile section */
4C34067818C5A4AD00DFA14A /* adaptive_quantization.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */; };
4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067518C5A4AD00DFA14A /* down_sample_neon.S */; };
4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */; };
4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */; };
4CE4443518B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };
4CE4444318B724B60017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4444218B724B60017DF25 /* XCTest.framework */; };
4CE4444418B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };
@@ -56,6 +60,10 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = adaptive_quantization.S; sourceTree = "<group>"; };
4C34067518C5A4AD00DFA14A /* down_sample_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = down_sample_neon.S; sourceTree = "<group>"; };
4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_sad_neon.S; sourceTree = "<group>"; };
4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = vaa_calc_neon.S; sourceTree = "<group>"; };
4CE4443118B724B60017DF25 /* libprocessing.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libprocessing.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE4443418B724B60017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE4444118B724B60017DF25 /* processingTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = processingTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -122,6 +130,17 @@
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
4C34067318C5A4AD00DFA14A /* arm */ = {
isa = PBXGroup;
children = (
4C34067418C5A4AD00DFA14A /* adaptive_quantization.S */,
4C34067518C5A4AD00DFA14A /* down_sample_neon.S */,
4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */,
4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */,
);
path = arm;
sourceTree = "<group>";
};
4CE4442818B724B60017DF25 = {
isa = PBXGroup;
children = (
@@ -182,6 +201,7 @@
4CE4475B18BC62960017DF25 /* src */ = {
isa = PBXGroup;
children = (
4C34067318C5A4AD00DFA14A /* arm */,
4CE4475C18BC62960017DF25 /* adaptivequantization */,
4CE4476318BC62960017DF25 /* backgrounddetection */,
4CE4476618BC62960017DF25 /* common */,
@@ -372,6 +392,8 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */,
4C34067818C5A4AD00DFA14A /* adaptive_quantization.S in Sources */,
4CE4479718BC62960017DF25 /* downsample.cpp in Sources */,
4CE4478B18BC62960017DF25 /* AdaptiveQuantization.cpp in Sources */,
4CE4479918BC62960017DF25 /* imagerotate.cpp in Sources */,
@@ -380,6 +402,7 @@
4CE4479A18BC62960017DF25 /* imagerotatefuncs.cpp in Sources */,
4CE4479518BC62960017DF25 /* denoise.cpp in Sources */,
4CE4479218BC62960017DF25 /* WelsFrameWork.cpp in Sources */,
4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */,
4CE4479B18BC62960017DF25 /* SceneChangeDetection.cpp in Sources */,
4CE4479D18BC62960017DF25 /* vaacalcfuncs.cpp in Sources */,
4CE4479818BC62960017DF25 /* downsamplefuncs.cpp in Sources */,
@@ -387,6 +410,7 @@
4CE4479418BC62960017DF25 /* ComplexityAnalysis.cpp in Sources */,
4CE4479E18BC62960017DF25 /* vaacalculation.cpp in Sources */,
4CE4479118BC62960017DF25 /* thread.cpp in Sources */,
4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */,
4CE4478F18BC62960017DF25 /* BackgroundDetection.cpp in Sources */,
4CE4479618BC62960017DF25 /* denoise_filter.cpp in Sources */,
);
@@ -502,6 +526,11 @@
DSTROOT = /tmp/processing.dst;
GCC_C_LANGUAGE_STANDARD = "compiler-default";
GCC_OPTIMIZATION_LEVEL = 3;
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*]" = (
APPLE_IOS,
HAVE_NEON,
);
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphonesimulator*]" = APPLE_IOS;
HEADER_SEARCH_PATHS = (
/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include,
"\"$(SRCROOT)/../../../common\"",
@@ -526,6 +555,11 @@
CODE_SIGN_IDENTITY = "iPhone Developer";
DSTROOT = /tmp/processing.dst;
GCC_C_LANGUAGE_STANDARD = "compiler-default";
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*]" = (
APPLE_IOS,
HAVE_NEON,
);
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphonesimulator*]" = APPLE_IOS;
HEADER_SEARCH_PATHS = (
/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include,
"\"$(SRCROOT)/../../../common\"",

BIN
codec/processing/src/arm/.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,120 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
.macro SQR_ADD_16BYTES
vmull.u8 q3, $0, $0
vmull.u8 q8, $1, $1
vpadal.u16 $2, q3
vpadal.u16 $2, q8
.endm
#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
vmull.u8 q3, \arg0, \arg0
vmull.u8 q8, \arg1, \arg1
vpadal.u16 \arg2, q3
vpadal.u16 \arg2, q8
.endm
#endif
WELS_ASM_FUNC_BEGIN pixel_var_16x16_neon
stmdb sp!, {r4}
vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
vabd.u8 q13, q14, q15
vmull.u8 q12, d27, d27
vmull.u8 q11, d26, d26
vaddl.u16 q12, d24, d25
vpadal.u16 q12, q11 //sqr
vaddl.u8 q13, d26, d27 //sum
vaddl.u8 q10, d28, d29 //sum_cur
vmull.u8 q9, d29, d29
vmull.u8 q8, d28, d28
vaddl.u16 q9, d18, d19 //sqr_cur
vpadal.u16 q9, q8
mov r4, #15
pixel_var_16x16_loop0:
vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
vabd.u8 q2, q0, q1
//q10 save sum_cur
vpadal.u8 q10, q1
//q12 save sqr
SQR_ADD_16BYTES d4, d5, q12
//q13 save sum
vpadal.u8 q13, q2
subs r4, #1
//q9 save sqr_cur
SQR_ADD_16BYTES d2, d3, q9
bne pixel_var_16x16_loop0
vadd.u16 d0, d26, d27 //sum
vadd.u16 d1, d20, d21 //sum_cur
vpaddl.u16 q0, q0
vadd.u32 d2, d24, d25 //sqr
vadd.u32 d3, d18, d19 //sqr_cur
vpadd.u32 d0, d0, d1
vpadd.u32 d1, d2, d3
ldr r4, [sp, #4]
vshr.u32 q0, q0, #8
vmul.u32 d0, d0
vsub.u32 d0, d1, d0
vmovl.u32 q0, d0
vst2.16 {d0[0], d1[0]}, [r4]
ldmia sp!, {r4}
WELS_ASM_FUNC_END
#endif
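Following the register comments, the function accumulates four quantities over the 16x16 block: the sum and sum of squares of |ref - src|, and the sum and sum of squares of the source pixels. The epilogue's vshr.u32 #8 divides by 256 (the pixel count), so the two 16-bit values stored through the fifth argument are variances. A scalar sketch of that reading (truncating shifts mirror the asm):

#include <stdint.h>
#include <stdlib.h>

// Scalar model of pixel_var_16x16_neon as read from the asm:
// pVar[0] = variance of |ref - src|, pVar[1] = variance of the src block,
// both with the /256 done by truncating shifts as in the NEON epilogue.
static void PixelVar16x16_ref (uint8_t* pRef, int32_t iRefStride,
                               uint8_t* pSrc, int32_t iSrcStride,
                               uint16_t* pVar /* [2] */) {
  uint32_t uiSum = 0, uiSqr = 0, uiSumCur = 0, uiSqrCur = 0;
  for (int32_t y = 0; y < 16; ++y) {
    for (int32_t x = 0; x < 16; ++x) {
      uint32_t uiDiff = (uint32_t) abs (pRef[x] - pSrc[x]);
      uiSum    += uiDiff;
      uiSqr    += uiDiff * uiDiff;
      uiSumCur += pSrc[x];
      uiSqrCur += (uint32_t) pSrc[x] * pSrc[x];
    }
    pRef += iRefStride;
    pSrc += iSrcStride;
  }
  uint32_t uiMean = uiSum >> 8, uiMeanCur = uiSumCur >> 8;
  pVar[0] = (uint16_t) ((uiSqr >> 8) - uiMean * uiMean);
  pVar[1] = (uint16_t) ((uiSqrCur >> 8) - uiMeanCur * uiMeanCur);
}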

View File

@@ -0,0 +1,342 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_neon
stmdb sp!, {r4-r8, lr}
//Get the width and height
ldr r4, [sp, #24] //src_width
ldr r5, [sp, #28] //src_height
//Initialize the registers
mov r6, r2
mov r8, r0
mov lr, #0
lsr r5, #1
//Save the tail row for the unaligned remainder
mla r7, r1, r5, r0
vld1.32 {q15}, [r7]
add r7, r2, r3
//process one column of data
comp_ds_bilinear_loop0:
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
vpaddl.u8 q3, q3
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrshr.u16 q2, #1
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
add lr, #32
cmp lr, r4
movcs lr, #0
addcs r6, r3, lsl #1
movcs r2, r6
addcs r7, r2, r3
addcs r8, r1
movcs r0, r8
subscs r5, #1
bne comp_ds_bilinear_loop0
//Restore the tail row for the unaligned remainder
vst1.32 {q15}, [r0]
ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
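comp_ds_bilinear_neon halves the picture in both directions: vpaddl.u8 averages horizontal pairs and vrhadd.u16 averages the two rows, so each output pixel is the mean of a 2x2 source block. A scalar sketch (the NEON code rounds in two stages, so the low bit may differ from this single-stage form):

#include <stdint.h>

// Scalar model of a 2:1 bilinear downsample: average each 2x2 block with
// rounding. Stage-by-stage rounding of the NEON path is not reproduced.
static void CompDsBilinear_ref (uint8_t* pDst, int32_t iDstStride,
                                uint8_t* pSrc, int32_t iSrcStride,
                                int32_t iSrcWidth, int32_t iSrcHeight) {
  for (int32_t y = 0; y < (iSrcHeight >> 1); ++y) {
    uint8_t* pRow0 = pSrc + (y << 1) * iSrcStride;
    uint8_t* pRow1 = pRow0 + iSrcStride;
    for (int32_t x = 0; x < (iSrcWidth >> 1); ++x) {
      int32_t i = x << 1;
      pDst[x] = (uint8_t) ((pRow0[i] + pRow0[i + 1] +
                            pRow1[i] + pRow1[i + 1] + 2) >> 2);
    }
    pDst += iDstStride;
  }
}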
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
stmdb sp!, {r4-r7, lr}
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Compute the row-advance offsets (stride minus width)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
//process one column of data
comp_ds_bilinear_w_x8_loop0:
lsr r6, r4, #3
add r7, r2, r3
//process one line of data
comp_ds_bilinear_w_x8_loop1:
vld1.8 {d0}, [r2]!
vld1.8 {d1}, [r7]!
vpaddl.u8 q0, q0
vrshr.u16 q0, #1
vrhadd.u16 d0, d1
vmovn.u16 d0, q0
vst1.32 {d0[0]}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x8_loop1
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x8_loop0
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
stmdb sp!, {r4-r7, lr}
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Compute the row-advance offsets (stride minus width)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
//process one column of data
comp_ds_bilinear_w_x16_loop0:
lsr r6, r4, #4
add r7, r2, r3
//process one line of data
comp_ds_bilinear_w_x16_loop1:
vld1.8 {q0}, [r2]!
vld1.8 {q1}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrhadd.u16 q0, q1
vmovn.u16 d0, q0
vst1.32 {d0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x16_loop1
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x16_loop0
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x32_neon
stmdb sp!, {r4-r7, lr}
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Compute the row-advance offsets (stride minus width)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
//process one column of data
comp_ds_bilinear_w_x32_loop0:
lsr r6, r4, #5
add r7, r2, r3
//process one line of data
comp_ds_bilinear_w_x32_loop1:
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
vpaddl.u8 q3, q3
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrshr.u16 q2, #1
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x32_loop1
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x32_loop0
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
stmdb sp!, {r4-r12, lr}
//Get the data from stack
ldr r4, [sp, #40] //the addr of src
ldr r5, [sp, #44] //the value of src_stride
ldr r6, [sp, #48] //the value of scaleX
ldr r7, [sp, #52] //the value of scaleY
mov r10, #32768
sub r10, #1
and r8, r6, r10 // r8 = uinc (scaleX mod 32768)
mov r11, #-1
mul r11, r8 // r11 -uinc
vdup.s16 d2, r8
vdup.s16 d0, r11
vzip.s16 d0, d2 // uinc -uinc uinc -uinc
and r9, r7, r10 // r9 = vinc (scaleY mod 32768)
mov r11, #-1
mul r11, r9 // r11 -vinc
vdup.s16 d2, r9
vdup.s16 d3, r11
vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
mov r11, #0x40000000
mov r12, #0x4000
sub r12, #1
add r11, r12
vdup.s32 d1, r11 //init u: 16384 16383 16384 16383
mov r11, #16384
vdup.s16 d8, r11
sub r11, #1
vdup.s16 d9, r11
vext.8 d7, d9, d8, #4 //init v 16384 16384 16383 16383
veor q14, q14
sub r1, r2 // stride - width
mov r8, #16384 // yInverse
sub r3, #1
_HEIGHT:
ldr r4, [sp, #40] //the addr of src
mov r11, r8
lsr r11, #15
mul r11, r5
add r11, r4 // get current row address
mov r12, r11
add r12, r5
mov r9, #16384 // xInverse
sub r10, r2, #1
vmov.s16 d6, d1
_WIDTH:
mov lr, r9
lsr lr, #15
add r4, r11,lr
vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
add r4, r12,lr
vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
vzip.32 d28, d29 //q14: 000d000c000b000a;
vmull.u16 q13, d6, d7 //q13: init u * init v
vmull.u32 q12, d26,d28
vmlal.u32 q12, d27,d29
vqadd.u64 d24, d24,d25
vrshr.u64 d24, #30
vst1.8 {d24[0]}, [r0]!
add r9, r6
vadd.u16 d6, d0 // inc u
vshl.u16 d6, #1
vshr.u16 d6, #1
subs r10, #1
bne _WIDTH
WIDTH_END:
lsr r9, #15
add r4,r11,r9
vld1.8 {d24[0]}, [r4]
vst1.8 {d24[0]}, [r0]
add r0, #1
add r8, r7
add r0, r1
vadd.s16 d7, d5 // inc v
vshl.u16 d7, #1
vshr.u16 d7, #1
subs r3, #1
bne _HEIGHT
LAST_ROW:
ldr r4, [sp, #40] //the addr of src
lsr r8, #15
mul r8, r5
add r4, r8 // get current row address
mov r9, #16384
_LAST_ROW_WIDTH:
mov r11, r9
lsr r11, #15
add r3, r4,r11
vld1.8 {d0[0]}, [r3]
vst1.8 {d0[0]}, [r0]
add r0, #1
add r9, r6
subs r2, #1
bne _LAST_ROW_WIDTH
ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
#endif
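general_ds_bilinear_accurate_neon walks the source in Q15 fixed point: scaleX/scaleY are the per-pixel steps, xInverse/yInverse start at 16384 (a half-sample bias), the fractional parts select the bilinear weights, and the last row and column fall back to nearest sampling. A scalar sketch of that walk (an approximation: weights here sum to 32768 per axis and round at 2^30, whereas the asm uses 16384/16383 weight pairs with vqadd and vrshr.u64 #30):

#include <stdint.h>

// Scalar model of the Q15 bilinear scaler above. The addressing and the
// nearest-sampled borders mirror the asm; exact low-bit rounding does not.
static void GeneralBilinear_ref (uint8_t* pDst, int32_t iDstStride,
                                 int32_t iDstWidth, int32_t iDstHeight,
                                 uint8_t* pSrc, int32_t iSrcStride,
                                 uint32_t uiScaleX, uint32_t uiScaleY) {
  uint32_t uiY = 16384; // yInverse in Q15
  for (int32_t j = 0; j < iDstHeight - 1; ++j) {
    uint8_t* pRow = pSrc + (uiY >> 15) * iSrcStride;
    uint32_t uiV = uiY & 32767;
    uint32_t uiX = 16384; // xInverse in Q15
    for (int32_t i = 0; i < iDstWidth - 1; ++i) {
      uint32_t k = uiX >> 15, uiU = uiX & 32767;
      uint64_t uiAcc =
          (uint64_t) (32768 - uiU) * (32768 - uiV) * pRow[k] +
          (uint64_t) uiU * (32768 - uiV) * pRow[k + 1] +
          (uint64_t) (32768 - uiU) * uiV * pRow[iSrcStride + k] +
          (uint64_t) uiU * uiV * pRow[iSrcStride + k + 1];
      pDst[i] = (uint8_t) ((uiAcc + (1ULL << 29)) >> 30);
      uiX += uiScaleX;
    }
    pDst[iDstWidth - 1] = pRow[uiX >> 15]; // last column: nearest sample
    pDst += iDstStride;
    uiY += uiScaleY;
  }
  uint8_t* pLast = pSrc + (uiY >> 15) * iSrcStride; // last row: nearest
  uint32_t uiX = 16384;
  for (int32_t i = 0; i < iDstWidth; ++i, uiX += uiScaleX)
    pDst[i] = pLast[uiX >> 15];
}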

View File

@@ -0,0 +1,68 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
stmdb sp!, {lr}
//Load one horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
//Do the SAD for 8 bytes
vabdl.u8 q1, d0, d1
mov lr, #7
pixel_sad_8x8_loop0:
//Load one horizontal line of data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
subs lr, #1
//Do the SAD for 8 bytes
vabal.u8 q1, d0, d1
bne pixel_sad_8x8_loop0
vadd.u16 d2, d3
vpaddl.u16 d2, d2
vpaddl.u32 d2, d2
vmov.u32 r0, d2[0]//TBO...
ldmia sp!, {lr}
WELS_ASM_FUNC_END
#endif

File diff suppressed because it is too large